In [8]:
import pandas as pd

data = pd.read_csv('../lda/data/text.events.csv', error_bad_lines=False);
data_text = data[['text']]
data_text['index'] = data_text.index
documents = data_text

In [9]:
len(documents)

8929

In [10]:
documents[:5]

Unnamed: 0,text,index
0,"31 августа @ Бар ""Bristle"", 19:30\nSDL\n• Вход...",0
1,Лето в Минске начинается с августа!,1
2,Появился новый мурал от Рамона Мартинса. Как в...,2
3,The topic for the next debate club meeting is ...,3
4,IMPORTANT NEWS!!!\n\nThe tomorrow 's meeting i...,4


### Data Preprocessing

In [11]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [12]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/happylol/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Lemmatize example

In [14]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

пойти


#### Stemmer Example

In [15]:
stemmer = SnowballStemmer('russian')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caresses
1,flies,flies
2,dies,dies
3,mules,mules
4,denied,denied
5,died,died
6,agreed,agreed
7,owned,owned
8,humbled,humbled
9,sized,sized


In [45]:
from html.parser import HTMLParser

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def preprocess(text):
    if (type(text) == float):
        print(text);
        text = 'тест';
    text = strip_tags(text);
    
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [46]:
doc_sample = documents[documents['index'] == 4310].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['\\n', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '<p>18', '&#x43C;&#x430;&#x44F;', '&#x432;', '17.00', '&#x414;&#x43C;&#x438;&#x442;&#x440;&#x438;&#x439;', '&#x41F;&#x43E;&#x431;&#x435;&#x434;&#x438;&#x43D;&#x441;&#x43A;&#x438;&#x439;', '&#x432;', '&#x441;&#x43A;&#x432;&#x435;&#x440;&#x435;', '&#x43F;&#x435;&#x440;&#x435;&#x434;', '&#x433;&#x43B;&#x430;&#x432;&#x43D;&#x44B;&#x43C;', '&#x43A;&#x43E;&#x440;&#x43F;&#x443;&#x441;&#x43E;&#x43C;', '&#x411;&#x41D;&#x422;&#x423;', '&#x43D;&#x430;', '&#x41C;&#x43E;&#x43B;&#x43E;&#x434;&#x435;&#x436;&#x43D;&#x43E;&#x439;', '&#x43F;&#x43B;&#x43E;&#x449;&#x430;&#x434;&#x43A;&#x435;', '&#x43F;&#x440;&#x43E;&#x447;&#x438;&#x442;&#x430;&#x435;&#x442;', '&#x43B;&#x435;&#x43A;&#x446;&#x438;&#x44E;', '&#xAB;&#x421;&#x43A;&#x430;&#x437;&#x43A;&#x430;', '&#x43B;&#x43E;&#x436;&#x44C;,', '&#x434;&#x430;', '&#x432;', '&#x43D;&#x435;&#x439;', '&#x43D;&#x430;&#x43C;&#x435;&#

In [47]:

map(lambda x,y,z : print(x), [1, 2, 3, 4]) #Output [2, 4, 6, 8]


<map at 0x1a2bffd438>

In [48]:
processed_docs = documents['text'].map(preprocess)

nan


In [49]:
processed_docs[:10]

0    [август, bristl, nsdl, вход, свободн, нзаказ, ...
1                         [лет, минск, начина, август]
2                    [появ, нов, мура, рамон, мартинс]
3    [topic, debat, club, meet, euthanas, justif, l...
4    [important, news, nthe, tomorrow, meet, cancel...
5                                               [март]
6    [sorr, lat, announcement, meet, hold, toda, us...
7         [topic, husband, wiv, cook, dinner, weekend]
8       [meet, hold, toda, usua, worr, cancel, librar]
9    [подписыва, https, facebook, alfabankb, сообще...
Name: text, dtype: object

### Bag of words on the dataset

In [50]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [51]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 bristl
1 nsdl
2 август
3 вход
4 нзаказ
5 свободн
6 стол
7 лет
8 минск
9 начина
10 мартинс


In [52]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [53]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(44, 1),
 (59, 1),
 (60, 2),
 (72, 2),
 (81, 1),
 (82, 2),
 (83, 1),
 (100, 1),
 (106, 1),
 (110, 1),
 (115, 1),
 (126, 1),
 (137, 1),
 (139, 1),
 (146, 1),
 (148, 2),
 (167, 3),
 (179, 1),
 (183, 1),
 (207, 1),
 (209, 1),
 (215, 1),
 (226, 1),
 (235, 1),
 (237, 1),
 (243, 1),
 (280, 1),
 (325, 1),
 (339, 1),
 (488, 1),
 (499, 1),
 (511, 1),
 (527, 1),
 (528, 1),
 (533, 1),
 (534, 1),
 (537, 1),
 (588, 2),
 (668, 1),
 (670, 2),
 (738, 1),
 (739, 1),
 (810, 1),
 (814, 1),
 (821, 1),
 (851, 2),
 (905, 2),
 (933, 1),
 (1017, 1),
 (1037, 1),
 (1093, 1),
 (1115, 1),
 (1127, 1),
 (1134, 2),
 (1242, 1),
 (1248, 1),
 (1277, 3),
 (1330, 1),
 (1345, 1),
 (1352, 1),
 (1366, 1),
 (1398, 1),
 (1422, 1),
 (1471, 1),
 (1596, 1),
 (1606, 1),
 (1610, 1),
 (1613, 1),
 (1632, 1),
 (1659, 1),
 (1826, 1),
 (1878, 1),
 (1881, 1),
 (1895, 1),
 (1921, 2),
 (2072, 2),
 (2111, 1),
 (2191, 5),
 (2272, 1),
 (2448, 2),
 (2449, 1),
 (2462, 1),
 (2527, 2),
 (2665, 1),
 (2669, 1),
 (2767, 1),
 (2849, 1),
 (3138, 1),

In [54]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 44 ("компан") appears 1 time.
Word 59 ("ближайш") appears 1 time.
Word 60 ("был") appears 2 time.
Word 72 ("звезд") appears 2 time.
Word 81 ("мечта") appears 1 time.
Word 82 ("мног") appears 2 time.
Word 83 ("молодежн") appears 1 time.
Word 100 ("русск") appears 1 time.
Word 106 ("стат") appears 1 time.
Word 110 ("фильм") appears 1 time.
Word 115 ("чтоб") appears 1 time.
Word 126 ("встреч") appears 1 time.
Word 137 ("магазин") appears 1 time.
Word 139 ("можн") appears 1 time.
Word 146 ("путешеств") appears 1 time.
Word 148 ("сдела") appears 2 time.
Word 167 ("год") appears 3 time.
Word 179 ("котор") appears 1 time.
Word 183 ("люд") appears 1 time.
Word 207 ("планет") appears 1 time.
Word 209 ("поддержк") appears 1 time.
Word 215 ("проект") appears 1 time.
Word 226 ("созда") appears 1 time.
Word 235 ("цел") appears 1 time.
Word 237 ("эт") appears 1 time.
Word 243 ("вниман") appears 1 time.
Word 280 ("программ") appears 1 time.
Word 325 ("лучш") appears 1 time.
Word 339 ("популярн")

### TF-IDF

In [55]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [56]:
corpus_tfidf = tfidf[bow_corpus]

In [57]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.3652728867821245),
 (1, 0.15328224415414574),
 (2, 0.7701415527749449),
 (3, 0.1508904738877443),
 (4, 0.4766490589398566)]


### Running LDA using Bag of Words

In [58]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [59]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.011*"занят" + 0.009*"котор" + 0.008*"сво" + 0.007*"курс" + 0.007*"вход" + 0.007*"бесплатн" + 0.007*"свободн" + 0.007*"навык" + 0.006*"нов" + 0.005*"язык"
Topic: 1 
Words: 0.011*"вход" + 0.010*"сторон" + 0.009*"мероприят" + 0.008*"котор" + 0.008*"можн" + 0.007*"сво" + 0.007*"чтоб" + 0.007*"cub" + 0.007*"чут" + 0.007*"свободн"
Topic: 2 
Words: 0.008*"котор" + 0.008*"проект" + 0.007*"минск" + 0.007*"бесплатн" + 0.006*"вход" + 0.006*"нов" + 0.005*"такж" + 0.005*"будет" + 0.005*"пройдет" + 0.005*"создан"
Topic: 3 
Words: 0.011*"котор" + 0.009*"сво" + 0.008*"компан" + 0.007*"минск" + 0.006*"будет" + 0.005*"можн" + 0.005*"как" + 0.005*"мероприят" + 0.004*"бесплатн" + 0.004*"эт"
Topic: 4 
Words: 0.018*"проект" + 0.009*"бизнес" + 0.008*"управлен" + 0.007*"курс" + 0.006*"работ" + 0.006*"компан" + 0.005*"сво" + 0.004*"участ" + 0.004*"команд" + 0.004*"club"
Topic: 5 
Words: 0.022*"дизайн" + 0.011*"част" + 0.010*"практическ" + 0.008*"market" + 0.008*"основ" + 0.008*"продуктов" + 

Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [60]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [61]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.003*"проект" + 0.003*"компан" + 0.003*"бизнес" + 0.002*"сво" + 0.002*"ноябр" + 0.002*"сам" + 0.002*"котор" + 0.002*"приз" + 0.002*"минск" + 0.002*"регистрац"
Topic: 1 Word: 0.003*"тренировк" + 0.003*"криптовалют" + 0.003*"фитнес" + 0.002*"занят" + 0.002*"нача" + 0.002*"club" + 0.002*"ноябр" + 0.002*"бесплатн" + 0.002*"минск" + 0.002*"developer"
Topic: 2 Word: 0.004*"дизайн" + 0.002*"занят" + 0.002*"курс" + 0.002*"групп" + 0.002*"танц" + 0.002*"класс" + 0.002*"продуктов" + 0.002*"част" + 0.002*"вход" + 0.002*"бесплатн"
Topic: 3 Word: 0.007*"вход" + 0.006*"свободн" + 0.005*"wall" + 0.003*"https" + 0.003*"групп" + 0.003*"занят" + 0.003*"концерт" + 0.002*"ноябр" + 0.002*"проект" + 0.002*"билет"
Topic: 4 Word: 0.004*"билет" + 0.004*"drupa" + 0.003*"frontend" + 0.003*"минск" + 0.002*"розыгрыш" + 0.002*"colour" + 0.002*"club" + 0.002*"котор" + 0.002*"свободн" + 0.002*"вход"
Topic: 5 Word: 0.010*"вход" + 0.010*"свободн" + 0.005*"club" + 0.003*"розыгрыш" + 0.003*"репост" + 0.00

### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [62]:
processed_docs[4310]

['дмитр',
 'побединск',
 'сквер',
 'перед',
 'главн',
 'корпус',
 'бнту',
 'молодежн',
 'площадк',
 'прочита',
 'лекц',
 'сказк',
 'лож',
 'намек',
 'нчит',
 'фантастик',
 'просматрив',
 'фильм',
 'част',
 'мечта',
 'чтоб',
 'удивительн',
 'вещ',
 'котор',
 'происход',
 'происход',
 'наяв',
 'стат',
 'невидимк',
 'помаха',
 'лазерн',
 'меч',
 'чита',
 'мысл',
 'люд',
 'путешествова',
 'времен',
 'мног',
 'друг',
 'оказыва',
 'мног',
 'эт',
 'можн',
 'сдела',
 'хорош',
 'разбир',
 'физик',
 'сдела',
 'сказк',
 'был',
 'намн',
 'прощ',
 'кажет',
 'площадк',
 'начнет',
 'лекц',
 'наскольк',
 'огромн',
 'вселен',
 'ноч',
 'небосвод',
 'нам',
 'рассыпа',
 'миллион',
 'звезд',
 'туман',
 'галактик',
 'кажут',
 'так',
 'холодн',
 'далек',
 'есл',
 'быт',
 'точн',
 'как',
 'расстоян',
 'наскольк',
 'далек',
 'близк',
 'наход',
 'скольк',
 'времен',
 'летет',
 'лун',
 'друг',
 'планет',
 'ближайш',
 'звезд',
 'галактик',
 'лекц',
 'соверш',
 'маленьк',
 'космическ',
 'путешеств',
 'чут',
 'кра'

In [63]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9324227571487427	 
Topic: 0.011*"котор" + 0.009*"сво" + 0.008*"компан" + 0.007*"минск" + 0.006*"будет" + 0.005*"можн" + 0.005*"как" + 0.005*"мероприят" + 0.004*"бесплатн" + 0.004*"эт"

Score: 0.06090875342488289	 
Topic: 0.010*"минск" + 0.008*"котор" + 0.007*"проект" + 0.007*"нов" + 0.006*"билет" + 0.006*"розыгрыш" + 0.006*"будет" + 0.006*"бизнес" + 0.005*"групп" + 0.005*"сво"


Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [64]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.47126305103302	 
Topic: 0.003*"тренировк" + 0.003*"криптовалют" + 0.003*"фитнес" + 0.002*"занят" + 0.002*"нача" + 0.002*"club" + 0.002*"ноябр" + 0.002*"бесплатн" + 0.002*"минск" + 0.002*"developer"

Score: 0.41878241300582886	 
Topic: 0.004*"seavus" + 0.004*"свободн" + 0.004*"вход" + 0.003*"опыт" + 0.003*"наш" + 0.003*"codetalks" + 0.003*"brest" + 0.003*"встреч" + 0.003*"специалист" + 0.003*"сво"

Score: 0.10411956161260605	 
Topic: 0.004*"занят" + 0.003*"язык" + 0.003*"студент" + 0.003*"пауз" + 0.003*"коф" + 0.003*"будут" + 0.003*"встреч" + 0.002*"английск" + 0.002*"нов" + 0.002*"котор"


Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [90]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.show_topic(index, 5)))

Score: 0.5499588251113892	 Topic: [('дизайн', 0.022040477), ('част', 0.01067833), ('практическ', 0.009945214), ('market', 0.008382885), ('основ', 0.008192588)]
Score: 0.05001766234636307	 Topic: [('https', 0.011197289), ('wall', 0.0075064427), ('класс', 0.006875984), ('мастер', 0.0060046487), ('регистрац', 0.0058637937)]
Score: 0.05000755935907364	 Topic: [('котор', 0.010753264), ('сво', 0.009428455), ('компан', 0.008309926), ('минск', 0.0072592236), ('будет', 0.00648109)]
Score: 0.05000488832592964	 Topic: [('минск', 0.009603944), ('котор', 0.007834163), ('проект', 0.006955929), ('нов', 0.0069356035), ('билет', 0.0064230976)]
Score: 0.050003860145807266	 Topic: [('проект', 0.018113881), ('бизнес', 0.008508109), ('управлен', 0.008262795), ('курс', 0.0067854123), ('работ', 0.00601669)]
Score: 0.0500032976269722	 Topic: [('котор', 0.008194484), ('проект', 0.007674819), ('минск', 0.0073017804), ('бесплатн', 0.0066998606), ('вход', 0.00559491)]
Score: 0.05000164732336998	 Topic: [('вход', 

In [82]:
# from gensim.corpora import Dictionary
# tmp_fname = 'l23321';
# dictionary.save_as_text(tmp_fname)
# loaded_dct = Dictionary.load_from_text(tmp_fname)
# loaded_dct
lda = lda_model
from gensim.test.utils import datapath

 # Save model to disk.
temp_file = datapath("./model")
lda.save('model')
# Load a potentially pretrained model from disk.

lda = gensim.models.LdaMulticore.load('model')
lda

<gensim.models.ldamulticore.LdaMulticore at 0x1a31f5a4a8>