잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)

In [2]:
#토픽을 추출하지만 단서 순서는 고려하지 않음
#LDA 알고리즘은 토픽 k가 M개의 문서에 걸쳐 분포되었다고 가정

#LSA : 단어 문서 행렬을 차원 축소하여 축소 차원에서 근접 단어들을 토픽으로 묶는다
#LDA : 단어가 특정 토픽에 존재할 확률과 문서에 특정 토픽이 존재할 확률을 결합확률로 추정하여 토픽을 추출

In [4]:
import pandas as pd
data = pd.read_csv("data/abcnews-date-text.csv", error_bad_lines=False)
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [5]:
text = data[['headline_text']]
text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [6]:
#preprocessing _ tokenizing
import nltk
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."


In [7]:
#delete stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [8]:
#표제어 추출 : lemmatizer
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x : [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[g, call, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [9]:
#delete words less then 3
tokenized_doc = text['headline_text'].apply(lambda x : [word for word in x if len(word) > 3])
tokenized_doc[:5]

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object

In [10]:
#TF-IDF Matrix를 만들기 위해서 역토큰화(detokenization)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
text['headline_text'] = detokenized_doc
# 다시 text['headline_text'] 에 저장
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,headline_text
0,decide community broadcast licence
1,fire witness must aware defamation
2,call infrastructure protection summit
3,staff aust strike rise
4,strike affect australian travellers


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(text['headline_text'])
X.shape

(1186018, 1000)

In [13]:
#TOPIC MODELING
#finallllly run LDA 

from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=777, max_iter=1)
lda_top = lda_model.fit_transform(X)
print(lda_model.components_)
print(lda_model.components_.shape)

[[1.00001251e-01 1.00000870e-01 1.00000959e-01 ... 1.00003789e-01
  1.00005244e-01 1.00005701e-01]
 [1.00001186e-01 1.00000321e-01 1.00001492e-01 ... 1.00008495e-01
  1.00003773e-01 5.28131341e+02]
 [1.00002566e-01 1.00000691e-01 1.00001989e-01 ... 1.00004725e-01
  1.00004888e-01 1.00003502e-01]
 ...
 [1.00001872e-01 1.00000609e-01 1.00004551e-01 ... 1.00006062e-01
  1.00004418e-01 1.00005045e-01]
 [1.39239821e+02 1.00000829e-01 1.00002009e-01 ... 1.00005526e-01
  1.00004846e-01 1.00004377e-01]
 [1.00001172e-01 2.98823953e+02 1.00002656e-01 ... 1.00006216e-01
  1.00003730e-01 1.00006511e-01]]
(10, 1000)


In [14]:
terms = vectorizer.get_feature_names() 
# 단어 집합. 1,000개의 단어가 저장되어있음.

In [15]:
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d :" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n -1:-1]])

In [16]:
get_topics(lda_model.components_, terms)

Topic 1 : [('sydney', 9617.34), ('queensland', 8104.47), ('kill', 7338.07), ('court', 6475.51), ('open', 5572.5)]
Topic 2 : [('australia', 15449.55), ('australian', 13312.79), ('government', 7949.7), ('home', 6582.47), ('leave', 4909.81)]
Topic 3 : [('donald', 7654.71), ('live', 6199.76), ('south', 5924.7), ('federal', 4876.11), ('help', 4841.53)]
Topic 4 : [('melbourne', 7084.73), ('canberra', 6100.01), ('report', 5558.55), ('people', 5271.16), ('time', 4731.3)]
Topic 5 : [('police', 13274.17), ('attack', 6849.04), ('speak', 5367.5), ('family', 5250.09), ('warn', 5147.29)]
Topic 6 : [('house', 6402.18), ('test', 5756.46), ('tasmania', 5385.18), ('plan', 4782.07), ('talk', 4215.42)]
Topic 7 : [('charge', 8704.62), ('murder', 6698.9), ('shoot', 6287.13), ('years', 6079.1), ('north', 5422.0)]
Topic 8 : [('trump', 15036.78), ('death', 6809.39), ('change', 6625.03), ('crash', 6418.79), ('year', 6139.46)]
Topic 9 : [('election', 8921.67), ('market', 7054.2), ('make', 6682.56), ('adelaide', 