all exercises based on [Introduction to Deep Learning for NLP](https://wikidocs.net/30708)

In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle = True, random_state=1, remove=('headers,', 'footers', 'quotes'))
documents = dataset.data
len(documents)

11314

In [2]:
df = pd.DataFrame(documents, columns = ['data'])
df['data']

0        From: ab4z@Virginia.EDU ("Andi Beyer")\nSubjec...
1        From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...
2        From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...
3        From: ray@ole.cdac.com (Ray Berry)\nSubject: C...
4        From: kkeller@mail.sas.upenn.edu (Keith Keller...
                               ...                        
11309    From: adams@bellini.berkeley.edu (Adam L. Schw...
11310    From: levin@bbn.com (Joel B Levin)\nSubject: R...
11311    From: tedward@cs.cornell.edu (Edward [Ted] Fis...
11312    From: mori@volga.mfd.cs.fujitsu.co.jp (Tsuyosh...
11313    From: marc@yogi.austin.ibm.com (Marc J. Stephe...
Name: data, Length: 11314, dtype: object

In [3]:
# 특수문자 제거
df['data'] = df['data'].str.replace('[^A-Za-z]',' ')

# 길이 짧은 단어 제거
df['data'] = df['data'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# 전체 단어 소문자화
df['data'] = df['data'].apply(lambda x: x.lower())

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
df['data'] = df['data'].apply(lambda x: x.split()) # 토큰화..
df['data'] = df['data'].apply(lambda x: [w for w in x if w not in stop_words])

In [6]:
df['data']

0        [virginia, andi, beyer, subject, israeli, terr...
1        [timmbake, ucsb, bake, timmons, subject, amusi...
2        [cleveland, freenet, mark, kaufman, subject, r...
3        [cdac, berry, subject, clipper, business, usua...
4        [kkeller, mail, upenn, keith, keller, subject,...
                               ...                        
11309    [adams, bellini, berkeley, adam, schwartz, sub...
11310    [levin, joel, levin, subject, selective, place...
11311    [tedward, cornell, edward, fischer, subject, b...
11312    [mori, volga, fujitsu, tsuyoshi, mori, subject...
11313    [marc, yogi, austin, marc, stephenson, subject...
Name: data, Length: 11314, dtype: object

### 정수 인코딩, 단어 집합 만들기

In [7]:
# 각 단어를 (word_id, word_freqeuncey)로 만들 것 -> gensim의 corpora.Dictionary()로 손쉽게 구현 가능
# gensim의 ldamodel의 입력으로 위와 같은 형식 넣어줘야해서

In [8]:
from gensim import corpora

In [9]:
dictionary = corpora.Dictionary(df['data'])
corpus = [dictionary.doc2bow(text) for text in df['data']]


In [20]:
print(corpus[1]
     )

[(3, 1), (5, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 2), (47, 1), (48, 1), (49, 1)]


In [21]:
print(dictionary[3]) # word_id 3인 단어 확인

lines


In [22]:
len(dictionary)

67688

### LDA 모델 훈련시키기

In [24]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 20, id2word = dictionary, passes = 15)
topics = ldamodel.print_topics(num_words=4) # num_word: 각 토픽 당 몇 개의 단어를 확인할 것인지 (단어별 해당 topic에의 기여도)
for topic in topics:
    print(topic)

(0, '0.060*"space" + 0.042*"nasa" + 0.013*"launch" + 0.011*"orbit"')
(1, '0.014*"adobe" + 0.013*"borland" + 0.012*"stevens" + 0.010*"liar"')
(2, '0.029*"file" + 0.015*"program" + 0.014*"window" + 0.013*"output"')
(3, '0.020*"clipper" + 0.020*"encryption" + 0.019*"access" + 0.019*"chip"')
(4, '0.008*"people" + 0.007*"government" + 0.006*"israel" + 0.005*"state"')
(5, '0.081*"lines" + 0.081*"subject" + 0.080*"organization" + 0.058*"posting"')
(6, '0.045*"andrew" + 0.020*"wire" + 0.018*"ground" + 0.017*"mellon"')
(7, '0.012*"would" + 0.009*"people" + 0.008*"subject" + 0.007*"think"')
(8, '0.026*"drive" + 0.018*"scsi" + 0.017*"disk" + 0.014*"sale"')
(9, '0.027*"period" + 0.014*"play" + 0.013*"power" + 0.010*"scorer"')
(10, '0.014*"said" + 0.009*"went" + 0.009*"know" + 0.009*"people"')
(11, '0.017*"subject" + 0.017*"lines" + 0.016*"organization" + 0.011*"would"')
(12, '0.012*"game" + 0.012*"team" + 0.011*"year" + 0.009*"games"')
(13, '0.012*"available" + 0.010*"information" + 0.008*"data" +

In [25]:
# LDA 시각화
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
  Downloading pyLDAvis-3.3.0.tar.gz (1.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
  Downloading pyLDAvis-3.2.2.tar.gz (1.7 MB)
Collectin

In [26]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

# 토픽 별 단어 분포

In [31]:
print(len(ldamodel[corpus])) # 이건 어디서 나온거지?

11314


In [32]:
ldamodel.per_word_topics

False

In [27]:
# 문서 별 토필 분포

for i, topic_list in enumerate(ldamodel[corpus]):
    if i == 6:
        break
    print(i,'번째 문서의 topic 비율은', topic_list)

0 번째 문서의 topic 비율은 [(4, 0.18939018), (5, 0.7287741)]
1 번째 문서의 topic 비율은 [(7, 0.688508), (10, 0.21055417), (12, 0.033887412), (14, 0.027232163), (15, 0.023844225)]
2 번째 문서의 topic 비율은 [(4, 0.2840416), (5, 0.11365779), (7, 0.42628542), (12, 0.16634876)]
3 번째 문서의 topic 비율은 [(3, 0.25881258), (7, 0.28639224), (8, 0.053637262), (11, 0.2786027), (14, 0.024175107), (16, 0.08902741)]
4 번째 문서의 topic 비율은 [(2, 0.04790099), (5, 0.15205674), (12, 0.7567419), (13, 0.02695426)]
5 번째 문서의 topic 비율은 [(1, 0.23436886), (5, 0.15568405), (7, 0.4469804), (10, 0.09805209), (14, 0.042831965)]


In [41]:
# dataframe 형식으로
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()
    
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc= topic_list[0] if ldamodel.per_word_topics else topic_list # .per_word_topics ?
        doc = sorted(doc, key=lambda x : (x[1]), reverse = True)
        
        
        for j, (topic_num, prop_topic) in enumerate(doc):
            if j == 0:
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_list]), ignore_index = True)
            else:
                break
    print(type(topic_table))
    return(topic_table)

In [40]:
type(topictable)

NoneType

In [42]:
topictable = make_topictable_per_doc(ldamodel, corpus)

topictable = topictable.reset_index()

<class 'pandas.core.frame.DataFrame'>


In [43]:
topictable

Unnamed: 0,index,0,1,2
0,0,5.0,0.7288,"[(4, 0.18939021), (5, 0.7287741)]"
1,1,7.0,0.6885,"[(7, 0.68850106), (10, 0.21055464), (12, 0.033..."
2,2,7.0,0.4263,"[(4, 0.2840295), (5, 0.113653414), (7, 0.42632..."
3,3,7.0,0.2864,"[(3, 0.25881246), (7, 0.2864015), (8, 0.053637..."
4,4,12.0,0.7567,"[(2, 0.047901314), (5, 0.15206514), (12, 0.756..."
...,...,...,...,...
11309,11309,15.0,0.2960,"[(2, 0.030567605), (4, 0.29006645), (5, 0.2317..."
11310,11310,5.0,0.6014,"[(5, 0.6014331), (15, 0.18755853), (17, 0.1259..."
11311,11311,7.0,0.2584,"[(5, 0.15384236), (7, 0.25837898), (9, 0.24987..."
11312,11312,5.0,0.5180,"[(5, 0.51797646), (6, 0.031342607), (7, 0.1492..."


In [44]:
topictable.columns = ['문서 번호', '가장 비중 높은 토픽', '가장 높은 토픽의 비중' ,'각 토픽의 비중']
topictable.head()

Unnamed: 0,문서 번호,가장 비중 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,5.0,0.7288,"[(4, 0.18939021), (5, 0.7287741)]"
1,1,7.0,0.6885,"[(7, 0.68850106), (10, 0.21055464), (12, 0.033..."
2,2,7.0,0.4263,"[(4, 0.2840295), (5, 0.113653414), (7, 0.42632..."
3,3,7.0,0.2864,"[(3, 0.25881246), (7, 0.2864015), (8, 0.053637..."
4,4,12.0,0.7567,"[(2, 0.047901314), (5, 0.15206514), (12, 0.756..."
