# Topic Modeling - LDA(Latent Dirichlet Allocation)

https://wikidocs.net/30708

    - LSA : DTM을 차원 축소하여 축소 차원에서 근접 단어들을 토픽으로 묶음
    - LDA : 단어가 특정 토픽에 존재할 확률과 특정 확률이 존재할 확률을 결합확률로 추정하여 토픽 추출

In [1]:
import pandas as pd
import nltk

from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','footers','quotes'))
documents = dataset.data

print(f" sample : {len(documents)}")

 sample : 11314


## `Text processing`

In [3]:
news_df = pd.DataFrame({'document':documents})
display(news_df.head(3))

Unnamed: 0,document
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...


In [4]:
news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ')
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [5]:
# 불용어 가져오기

stop_words = stopwords.words('english')
tokenized_doc= news_df['clean_doc'].apply(lambda x: x.split())
tokenzied_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])


In [113]:
tokenzied_doc

0        [well, sure, story, seem, biased., disagree, s...
1        [yeah,, expect, people, read, faq,, etc., actu...
2        [although, realize, principle, strongest, poin...
3        [notwithstanding, legitimate, fuss, proposal,,...
4        [well,, change, scoring, playoff, pool., unfor...
                               ...                        
11309    [danny, rubenstein,, israeli, journalist,, spe...
11310                                                   []
11311    [agree., home, runs, clemens, always, memorabl...
11312    [used, deskjet, orange, micros, grappler, syst...
11313    [^^^^^^, argument, murphy., scared, hell, came...
Name: clean_doc, Length: 11314, dtype: object

## `정수 인코딩과 단어 집합 만들기`

In [6]:
tokenized_doc[:5]

0    [well, sure, about, story, seem, biased., what...
1    [yeah,, expect, people, read, faq,, etc., actu...
2    [although, realize, that, principle, your, str...
3    [notwithstanding, legitimate, fuss, about, thi...
4    [well,, will, have, change, scoring, playoff, ...
Name: clean_doc, dtype: object

In [7]:
from gensim import corpora

    - (word_id, word_frequency)의 형태로 바꿈. 
    word_id : 정수 인코딩된 값 / word_frequency : 해당 뉴스에서의 해당 단어의 빈도 수 

In [8]:
dictionary = corpora.Dictionary(tokenized_doc)

    - 사전에서 생성된 각각의 단어(token)에 대한 id => dictionary.token2id 
    : 단어 사전을 구성할 때 각각의 단어에 id가 생성됨
    
    - 문서(documents) 데이터 수치화 => dictionary.doc2bow
     : 문서를 단어의 id와 빈도수로 수치화함
    

    - dictionary.cfs => 전체 document에서 포함된 token의 인스턴스 수
    - dictionary.dfs => 해당 token을 언급한 documnet의 수 

In [126]:
dictionary.cfs

{69: 1121,
 61: 977,
 0: 5486,
 58: 157,
 53: 570,
 7: 7,
 70: 5932,
 14: 91,
 73: 12314,
 75: 5244,
 57: 241,
 62: 25833,
 67: 381,
 35: 165,
 51: 10,
 28: 6,
 50: 4,
 47: 3,
 38: 1945,
 42: 1,
 74: 133,
 22: 717,
 32: 136,
 15: 77,
 44: 169,
 25: 3,
 60: 1842,
 13: 159,
 31: 195,
 21: 12605,
 39: 5,
 72: 524,
 24: 84,
 63: 559,
 59: 5,
 17: 21,
 16: 25,
 30: 842,
 52: 1601,
 12: 1,
 65: 2746,
 36: 1185,
 45: 533,
 64: 8608,
 48: 223,
 37: 3771,
 9: 212,
 4: 3,
 54: 20,
 5: 4,
 11: 74,
 49: 173,
 26: 6,
 1: 134,
 10: 15,
 27: 325,
 55: 139,
 8: 7,
 46: 207,
 18: 7769,
 19: 959,
 34: 534,
 56: 4691,
 23: 35,
 20: 19,
 6: 79,
 2: 1669,
 3: 280,
 33: 968,
 29: 380,
 66: 31,
 40: 3628,
 43: 14,
 71: 3866,
 41: 58,
 68: 2,
 123: 118,
 90: 285,
 110: 3387,
 113: 985,
 93: 23,
 88: 545,
 77: 658,
 76: 264,
 98: 734,
 80: 6,
 108: 1556,
 104: 919,
 103: 31,
 92: 54,
 101: 1,
 105: 143,
 114: 298,
 117: 2,
 99: 15,
 115: 160,
 84: 1080,
 111: 16,
 124: 351,
 100: 9,
 118: 2570,
 94: 44,
 86: 2

In [127]:
dictionary.dfs

{69: 848,
 61: 764,
 0: 2964,
 58: 129,
 53: 495,
 7: 6,
 70: 2858,
 14: 84,
 73: 4797,
 75: 2286,
 57: 185,
 62: 6330,
 67: 212,
 35: 113,
 51: 8,
 28: 5,
 50: 4,
 47: 3,
 38: 1245,
 42: 1,
 74: 111,
 22: 596,
 32: 97,
 15: 60,
 44: 154,
 25: 3,
 60: 1127,
 13: 117,
 31: 132,
 21: 5193,
 39: 5,
 72: 405,
 24: 78,
 63: 426,
 59: 4,
 17: 19,
 16: 17,
 30: 681,
 52: 1144,
 12: 1,
 65: 1733,
 36: 878,
 45: 440,
 64: 3186,
 48: 149,
 37: 2113,
 9: 174,
 4: 3,
 54: 20,
 5: 4,
 11: 64,
 49: 133,
 26: 5,
 1: 94,
 10: 12,
 27: 127,
 55: 73,
 8: 7,
 46: 170,
 18: 3307,
 19: 469,
 34: 452,
 56: 2561,
 23: 24,
 20: 19,
 6: 75,
 2: 1125,
 3: 261,
 33: 712,
 29: 153,
 66: 31,
 40: 2008,
 43: 13,
 71: 2121,
 41: 52,
 68: 2,
 123: 106,
 90: 235,
 110: 1641,
 113: 697,
 93: 22,
 88: 418,
 77: 523,
 76: 207,
 98: 499,
 80: 6,
 108: 1123,
 104: 713,
 103: 28,
 92: 38,
 101: 1,
 105: 103,
 114: 188,
 117: 2,
 99: 13,
 115: 157,
 84: 832,
 111: 16,
 124: 292,
 100: 6,
 118: 1491,
 94: 40,
 86: 23,
 91: 14

In [128]:
# token 별 id 값 

dictionary.token2id

{'about': 0,
 'acts': 1,
 'after': 2,
 'all,': 3,
 'atrocities.': 4,
 'austria,': 5,
 'away.': 6,
 'biased.': 7,
 'blessing': 8,
 'clearly': 9,
 'commited': 10,
 'daily': 11,
 'degree).': 12,
 'described': 13,
 'disagree': 14,
 'europe': 15,
 'europeans': 16,
 'existance': 17,
 'from': 18,
 'government': 19,
 'guilt': 20,
 'have': 21,
 'having': 22,
 'holocaust': 23,
 'ignore': 24,
 'incidences': 25,
 'inhuman': 26,
 'israeli': 27,
 'israels': 28,
 'jews': 29,
 'least': 30,
 'letter': 31,
 'lived': 32,
 'look': 33,
 'makes': 34,
 'media': 35,
 'might': 36,
 'more': 37,
 'most': 38,
 'occured.': 39,
 'other': 40,
 'power.': 41,
 'pro-israeli': 42,
 'races': 43,
 'realize': 44,
 'reason': 45,
 'received': 46,
 'rediculous.': 47,
 'report': 48,
 'reports': 49,
 'reputation.': 50,
 'ruin': 51,
 'same': 52,
 'seem': 53,
 'shame': 54,
 'soldiers': 55,
 'some': 56,
 'statement': 57,
 'story': 58,
 'subsidizing': 59,
 'such': 60,
 'sure': 61,
 'that': 62,
 'them.': 63,
 'they': 64,
 'think': 6

In [121]:
dictionary.num_nnz

918707

In [119]:
tokenized_doc

0        [well, sure, about, story, seem, biased., what...
1        [yeah,, expect, people, read, faq,, etc., actu...
2        [although, realize, that, principle, your, str...
3        [notwithstanding, legitimate, fuss, about, thi...
4        [well,, will, have, change, scoring, playoff, ...
                               ...                        
11309    [danny, rubenstein,, israeli, journalist,, wil...
11310                                                   []
11311    [agree., home, runs, clemens, always, memorabl...
11312    [used, deskjet, with, orange, micros, grappler...
11313    [^^^^^^, argument, with, murphy., scared, hell...
Name: clean_doc, Length: 11314, dtype: object

181952

In [11]:
dictionary.doc2bow(['about','europe', 'they', 'they', '0', 'disagree'])

[(0, 1), (14, 1), (15, 1), (64, 2)]

In [12]:
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

In [13]:
# 첫 번째 뉴스 
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 4), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 5), (63, 1), (64, 2), (65, 1), (66, 1), (67, 4), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1)]


In [14]:
# 두 번째 뉴스
print(corpus[1])

[(0, 1), (2, 1), (21, 1), (62, 2), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 2), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 2), (109, 1), (110, 1), (111, 1), (112, 1), (113, 1), (114, 1), (115, 2), (116, 1), (117, 1), (118, 1), (119, 1), (120, 1), (121, 1), (122, 1), (123, 1), (124, 1)]


In [15]:
# 두 번째 뉴스에서 (62,2)는 정수 인코딩이 62으로 할당된 단어가 두 번째 뉴스에서 2번 나왔음을 의미함

In [16]:
dictionary[62]

'that'

## `LDA model training`

In [17]:
import gensim

In [65]:
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics= NUM_TOPICS, id2word=dictionary, passes=15, per_word_topics= True )
topics = ldamodel.print_topics(num_words=4)

for topic in topics:
    print(topic)

(0, '0.012*"team" + 0.012*"game" + 0.008*"will" + 0.008*"games"')
(1, '0.004*"devils" + 0.004*"hartford," + 0.003*"lemieux" + 0.003*"captain"')
(2, '0.002*""who" + 0.001*"austria" + 0.001*"snapped" + 0.001*"one)."')
(3, '0.016*"period" + 0.012*"power" + 0.010*"---------------" + 0.009*"play:"')
(4, '0.005*"------" + 0.005*"*******" + 0.003*"3.5"" + 0.003*"(205)"')
(5, '0.008*"water" + 0.006*"each" + 0.004*"copies)" + 0.004*"u.s.a."')
(6, '0.016*"55.0" + 0.003*"astros" + 0.003*"carson" + 0.003*"managing"')
(7, '0.086*"max>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'" + 0.016*"encryption" + 0.010*"security" + 0.010*"chip"')
(8, '0.007*"radar" + 0.007*"scsi-2" + 0.006*"cubs" + 0.006*"scsi-1"')
(9, '0.022*"space" + 0.008*"april" + 0.007*"center" + 0.007*"nasa"')
(10, '0.003*"mild" + 0.003*"third." + 0.002*"summaries" + 0.002*"believer"')
(11, '0.031*"were" + 0.019*"they" + 0.013*"armenian" + 0.010*"their"')
(12, '0.004*"financial" + 0.004*"period:" + 0.004*"arra

     - 각 단어 앞에 붙은 수치는 단어의 해당 토픽에 대한 기여도임
     - 총 20개의 토픽이고, passes는 알고리즘의 동작 횟수

In [66]:
ldamodel.print_topics(num_words=3)

[(0, '0.012*"team" + 0.012*"game" + 0.008*"will"'),
 (1, '0.004*"devils" + 0.004*"hartford," + 0.003*"lemieux"'),
 (2, '0.002*""who" + 0.001*"austria" + 0.001*"snapped"'),
 (3, '0.016*"period" + 0.012*"power" + 0.010*"---------------"'),
 (4, '0.005*"------" + 0.005*"*******" + 0.003*"3.5""'),
 (5, '0.008*"water" + 0.006*"each" + 0.004*"copies)"'),
 (6, '0.016*"55.0" + 0.003*"astros" + 0.003*"carson"'),
 (7,
  '0.086*"max>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'" + 0.016*"encryption" + 0.010*"security"'),
 (8, '0.007*"radar" + 0.007*"scsi-2" + 0.006*"cubs"'),
 (9, '0.022*"space" + 0.008*"april" + 0.007*"center"'),
 (10, '0.003*"mild" + 0.003*"third." + 0.002*"summaries"'),
 (11, '0.031*"were" + 0.019*"they" + 0.013*"armenian"'),
 (12, '0.004*"financial" + 0.004*"period:" + 0.004*"array"'),
 (13, '0.022*"with" + 0.013*"this" + 0.011*"have"'),
 (14, '0.039*"that" + 0.023*"have" + 0.017*"this"'),
 (15,
  '0.004*"[kk]" + 0.002*"------------------------------

In [79]:
NUM_TOPICS = 20
ldamodel2 = gensim.models.ldamodel.LdaModel(corpus, num_topics= NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel2.print_topics(num_words=4)

for topic in topics:
    print(topic)

(0, '0.002*"yassin" + 0.002*"brains" + 0.002*"accelerator" + 0.002*"adjective"')
(1, '0.046*"that" + 0.019*"have" + 0.018*"this" + 0.017*"they"')
(2, '0.020*"that" + 0.012*"from" + 0.010*"their" + 0.009*"were"')
(3, '0.015*"space" + 0.007*"information" + 0.006*"from" + 0.006*"research"')
(4, '0.007*"pens" + 0.005*"icon" + 0.004*"inner" + 0.004*"cleveland"')
(5, '0.016*"55.0" + 0.008*"smokeless" + 0.006*"dept" + 0.005*"------"')
(6, '0.026*"----------------------------------------------------------------------" + 0.010*"entries" + 0.008*"ripem" + 0.007*"each"')
(7, '0.003*"vram" + 0.003*"fleet" + 0.003*"third." + 0.002*"(deletion)"')
(8, '0.011*"with" + 0.010*"were" + 0.009*"from" + 0.007*"will"')
(9, '0.007*"radar" + 0.005*"detector" + 0.005*"helmet" + 0.004*"ottawa"')
(11, '0.003*"static" + 0.003*"grass" + 0.002*"creed" + 0.002*"pain,"')
(12, '0.010*"----" + 0.005*"flyers" + 0.004*"vancouver" + 0.004*"games,"')
(13, '0.002*"zealand" + 0.002*"comfort" + 0.002*"sincerely," + 0.002*"boot

## `LDA 시각화`

In [67]:
import pyLDAvis.gensim_models

In [68]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

    - 좌측의 원은 20개의 토픽을 나타내고, 각 원과의 거리는 각 토픽들이 서로 얼마나 다른지 보여줌
    - 두 개의 원이 겹치면, 유사한 토픽임.
    - LDA 모델의 출력 결과에서는 토픽 번호가 0부터 할당되어 0~19의 숫자가 사용된 것과 달리, 위의 LDA 시각화에서는 토픽의 번호가 1부터 시작해서
    각 토픽 번호는 이제 +1이 된 값인 1~20까지의 값을 가짐

**문서 별 토픽 보기**

    - 위는 토픽 별 단어 분포이고, 문서 별 토픽 분포를 확인하려면 
    훈련된 LDA 모델인 ldamnodel[]에 전체 데이터가 정수 인코딩 된 결과를 넣은 후에 확인이 가능

In [69]:
print(ldamodel[corpus[0]])
print(ldamodel[corpus[1]])

([(4, 0.035575617), (9, 0.027489582), (11, 0.16650015), (14, 0.19199422), (16, 0.5695922)], [(0, [16, 14]), (1, [16]), (2, [16, 14, 11]), (3, [16, 14]), (4, [11]), (5, []), (6, [14, 11, 16]), (7, [11]), (8, [11]), (9, [16, 14]), (10, [16]), (11, [16, 14]), (12, []), (13, [16, 14]), (14, [16, 14]), (15, [11, 16, 14]), (16, [14]), (17, [16, 14]), (18, [16, 11, 14]), (19, [16, 11]), (20, [16]), (21, [16, 14]), (22, [16, 14]), (23, [11]), (24, [16, 14]), (25, []), (26, []), (27, [16]), (28, [9]), (29, [11, 16]), (30, [16, 14]), (31, [16, 11]), (32, [11, 16, 14]), (33, [14, 16]), (34, [16, 14]), (35, [16, 14]), (36, [14, 16]), (37, [16, 14]), (38, [16, 14]), (39, [4]), (40, [16, 14]), (41, [16, 14]), (42, []), (43, [11]), (44, [16, 14]), (45, [16, 14]), (46, [16, 11]), (47, []), (48, [16, 11, 9]), (49, [16, 11, 9]), (50, [11]), (51, [4]), (52, [16, 14]), (53, [14, 16]), (54, [11, 16]), (55, [11, 16]), (56, [16, 14]), (57, [16, 14]), (58, [16, 14]), (59, [4]), (60, [16, 14]), (61, [14, 16]),

In [70]:
for idx, topic_list in enumerate(ldamodel[corpus]):
    if idx==5:
        break
    
    print(f" {idx} 번째의 문서의 topic 비율은 {topic_list}")

 0 번째의 문서의 topic 비율은 ([(4, 0.035575617), (9, 0.027489442), (11, 0.1664997), (14, 0.1919813), (16, 0.56960565)], [(0, [16, 14]), (1, [16]), (2, [16, 14, 11]), (3, [16, 14]), (4, [11]), (5, []), (6, [14, 11, 16]), (7, [11]), (8, [11]), (9, [16, 14]), (10, [16]), (11, [16, 14]), (12, []), (13, [16, 14]), (14, [16, 14]), (15, [11, 16, 14]), (16, [14]), (17, [16, 14]), (18, [16, 11, 14]), (19, [16, 11]), (20, [16]), (21, [16, 14]), (22, [16, 14]), (23, [11]), (24, [16, 14]), (25, []), (26, []), (27, [16]), (28, [9]), (29, [11, 16]), (30, [16, 14]), (31, [16, 11]), (32, [11, 16, 14]), (33, [14, 16]), (34, [16, 14]), (35, [16, 14]), (36, [14, 16]), (37, [16, 14]), (38, [16, 14]), (39, [4]), (40, [16, 14]), (41, [16, 14]), (42, []), (43, [11]), (44, [16, 14]), (45, [16, 14]), (46, [16, 11]), (47, []), (48, [16, 11, 9]), (49, [16, 11, 9]), (50, [11]), (51, [4]), (52, [16, 14]), (53, [14, 16]), (54, [11, 16]), (55, [11, 16]), (56, [16, 14]), (57, [16, 14]), (58, [16, 14]), (59, [4]), (60, [16, 1

    - 위의 (숫자, 확률)은 각 토픽 번호와 해당 토픽이 해당 문서에서 차지하는 분포도
    0 번째 문서의 (3, 0.61321086)은 3번 토픽이 61%의 분포도를 가지는 것을 의미함

In [73]:
topic_table = pd.DataFrame()

In [80]:
ldamodel.per_word_topics

True

In [81]:
ldamodel2.per_word_topics

False

In [83]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        docs = sorted(doc, key=lambda x: (x[1]), reverse=True)
        
        for j, (topic_num, prop_topic) in enumerate(docs):
            if j == 0:
                topic_table = pd.concat([topic_table, pd.DataFrame({'가장 비중이 높은 토픽' : int(topic_num), 
                                                                    '가장 높은 토픽의 비중' : round(prop_topic,4),
                                                                    '각 토픽의 비중': topic_list})])                
            else:
                break
    return(topic_table)

In [84]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable

Unnamed: 0,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,16,0.5696,"[(4, 0.035575617), (9, 0.027489001), (11, 0.16..."
1,16,0.5696,"[(0, [16, 14]), (1, [16]), (2, [16, 14, 11]), ..."
2,16,0.5696,"[(0, [(14, 0.46880564), (16, 0.5311675)]), (1,..."
0,14,0.5727,"[(8, 0.12781653), (13, 0.058805898), (14, 0.57..."
1,14,0.5727,"[(0, [14, 16, 13]), (2, [14, 16, 13]), (21, [1..."
...,...,...,...
1,13,0.5453,"[(73, [13, 18, 0]), (488, [18, 13]), (545, [13..."
2,13,0.5453,"[(73, [(0, 0.023749243), (13, 1.865151), (18, ..."
0,14,0.6912,"[(0, 0.080968894), (6, 0.02213592), (13, 0.061..."
1,14,0.6912,"[(21, [14, 16, 13]), (30, [14, 16, 13, 0]), (3..."


In [85]:
topictable2 = make_topictable_per_doc(ldamodel2, corpus)
topictable2

Unnamed: 0,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,1,0.5366,"(1, 0.536569)"
1,1,0.5366,"(2, 0.31210178)"
2,1,0.5366,"(8, 0.14141634)"
0,1,0.7631,"(1, 0.7631492)"
1,1,0.7631,"(2, 0.056715023)"
...,...,...,...
0,1,0.7401,"(1, 0.74005336)"
1,1,0.7401,"(8, 0.13396129)"
2,1,0.7401,"(12, 0.016326005)"
3,1,0.7401,"(14, 0.022430671)"


# LDA measure

In [96]:
cm = CoherenceModel(model=ldamodel, corpus=corpus, coherence='u_mass')
coherence = cm.get_coherence()
print("Cpherence",coherence)
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))

Cpherence -10.22160874297403

Perplexity:  -13.07836889188972


### U_mass coherence

### c_v coherence

In [99]:
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


In [100]:

texts, _ = fetch_20newsgroups( subset='all', remove=('headers', 'footers', 'quotes'), return_X_y=True )
tokenizer = lambda s: re.findall( '\w+', s.lower() )
texts = [ tokenizer(t) for t in  texts ]

  tokenizer = lambda s: re.findall( '\w+', s.lower() )


In [102]:
texts[:10]

[['i',
  'am',
  'sure',
  'some',
  'bashers',
  'of',
  'pens',
  'fans',
  'are',
  'pretty',
  'confused',
  'about',
  'the',
  'lack',
  'of',
  'any',
  'kind',
  'of',
  'posts',
  'about',
  'the',
  'recent',
  'pens',
  'massacre',
  'of',
  'the',
  'devils',
  'actually',
  'i',
  'am',
  'bit',
  'puzzled',
  'too',
  'and',
  'a',
  'bit',
  'relieved',
  'however',
  'i',
  'am',
  'going',
  'to',
  'put',
  'an',
  'end',
  'to',
  'non',
  'pittsburghers',
  'relief',
  'with',
  'a',
  'bit',
  'of',
  'praise',
  'for',
  'the',
  'pens',
  'man',
  'they',
  'are',
  'killing',
  'those',
  'devils',
  'worse',
  'than',
  'i',
  'thought',
  'jagr',
  'just',
  'showed',
  'you',
  'why',
  'he',
  'is',
  'much',
  'better',
  'than',
  'his',
  'regular',
  'season',
  'stats',
  'he',
  'is',
  'also',
  'a',
  'lot',
  'fo',
  'fun',
  'to',
  'watch',
  'in',
  'the',
  'playoffs',
  'bowman',
  'should',
  'let',
  'jagr',
  'have',
  'a',
  'lot',
  'of',


In [107]:
# Creating some random topics
topics = [ ['space', 'planet', 'mars', 'galaxy'],
           ['cold', 'medicine', 'doctor', 'health', 'water'],
           ['cats', 'health', 'keyboard', 'car', 'banana'],
           ['windows', 'mac', 'computer', 'operating', 'system'],
          ]

# Creating a dictionary with the vocabulary
word2id = Dictionary( texts )

# Coherence model
cm = CoherenceModel(topics=topics, 
                    texts=texts,
                    coherence='c_v',  
                    dictionary=word2id)

coherence_per_topic = cm.get_coherence_per_topic()

KeyError: 'apple'

In [None]:
coherence_per_topic 

In [None]:
topics_str = [ '\n '.join(t) for t in topics ]
data_topic_score = pd.DataFrame( data=zip(topics_str, coherence_per_topic), columns=['Topic', 'Coherence'] )
data_topic_score = data_topic_score.set_index('Topic')

fig, ax = plt.subplots( figsize=(2,6) )
ax.set_title("Topics coherence\n $C_v$")
sns.heatmap(data=data_topic_score, annot=True, square=True,
            cmap='Reds', fmt='.2f',
            linecolor='black', ax=ax )
plt.yticks( rotation=0 )
ax.set_xlabel('')
ax.set_ylabel('')
fig.show()

# LDA parameter tuning

In [93]:
import re
import warnings

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

In [None]:
lda_params = {
    num_topics=
    chuksize =
    passes = 
    iterations =
    eval_every = None
}