In [1]:
import pickle
import pandas as pd
from sklearn import datasets
import gensim
import pyLDAvis
from pyLDAvis import gensim as gensimvis

import logging
from tqdm import tqdm
from pprint import pprint

# Read the pickled frame from disk.
# NOTE(review): pickle.load can run arbitrary code while unpickling, so
# 'data.pickle' must come from a trusted source.
with open('data.pickle', mode='rb') as read_file:
    df = pickle.load(read_file)

# Show the first five rows as a quick sanity check of what was loaded.
df.head()

Unnamed: 0,text,Date,Label
0,georgia down two russia warplane a country mov...,2008-08-08,0
1,why wont united_states and nato help united_st...,2008-08-11,1
2,remember that adorable yearold who sang at the...,2008-08-12,0
3,united_states refuse israel weapon to attack i...,2008-08-13,0
4,all the expert admit that we should legalise d...,2008-08-14,1


In [2]:
# Pull the 'text' column out of the frame as a plain Python list.
df_list = df.loc[:, 'text'].tolist()

In [3]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list
    is yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )
        
# Materialise the token generator. df_list is rebound here from the raw
# 'text' values to a list of token lists.
df_list = [tokens for tokens in sent_to_words(df_list)]

In [4]:
# Phrase detection: learn frequent word pairs first, then run a second
# detector over the bigram-merged text so three-word phrases can form.
# Only min_count is set here; the `threshold` argument keeps its default.
bigram = gensim.models.phrases.Phrases(df_list, min_count=20)
trigram = gensim.models.phrases.Phrases(bigram[df_list], min_count=10)

# Wrap each trained detector in a Phraser, the faster form used for
# applying the learned phrases to token lists.
bigram_model, trigram_model = (
    gensim.models.phrases.Phraser(detector) for detector in (bigram, trigram)
)



In [5]:
# Merge detected bigrams, then trigrams, in every tokenized document.
clean_sents = [
    trigram_model[bigram_model[doc_tokens]]
    for doc_tokens in df_list
]

In [6]:
import spacy

# Only POS tags, lemmas and stop-word flags are read downstream, so the
# parser and NER components are disabled when loading the pipeline.
_DISABLED_PIPES = ['parser', 'ner']
try:
    # Keep the original shortcut name first so existing setups load the
    # exact same model as before.
    nlp = spacy.load('en', disable=_DISABLED_PIPES)
except OSError:
    # spaCy 3 removed shortcut names such as 'en' and raises OSError for
    # them; fall back to the full package name of the small English model.
    nlp = spacy.load('en_core_web_sm', disable=_DISABLED_PIPES)

# NOUN, ADJ, VERB, ADV
def lemmatization(texts, allowed_postags=('NOUN',), keep_empty=False):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``. The document is then reduced to the
    lemmas of tokens whose ``pos_`` is in ``allowed_postags`` and which are
    not flagged as stop words.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of ``pos_`` values to keep, for example
            'NOUN', 'ADJ', 'VERB', 'ADV'. Defaults to nouns only.
        keep_empty: When False (the default, and the historical behaviour),
            documents with no surviving lemma are dropped, so the result can
            be shorter than ``texts`` and positions may no longer line up
            with the input. Pass True to keep an empty list in their place.

    Returns:
        A list with one list of lemma strings per retained document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and not token.is_stop
        ]
        if lemmas or keep_empty:
            texts_out.append(lemmas)
    return texts_out

In [7]:
# Reduce every document to its noun lemmas. clean_sents is rebound here
# from the phrase-merged tokens to the lemmatized output.
clean_sents = lemmatization(clean_sents, allowed_postags=['NOUN'])

### Topic modelling with LDA

In [8]:
# Map every distinct lemma to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
id2word = gensim.corpora.Dictionary(clean_sents)
corpus = list(map(id2word.doc2bow, clean_sents))

In [9]:
# [(id2word[id], freq) for id, freq in corpus[0]]

In [10]:
import warnings

# Silence every warning from here on. This hides all warning categories,
# not only the ones emitted during training.
warnings.filterwarnings(action='ignore')

# Train a 3-topic LDA model on the bag-of-words corpus, with the random
# state pinned to 42.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=42,
    chunksize=100,
    passes=128,
    per_word_topics=True,
)

In [11]:
# Pretty-print each topic together with its highest-weighted terms.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

[(0,
  '0.014*"united_state" + 0.010*"china" + 0.007*"government" + 0.006*"police" '
  '+ 0.006*"country" + 0.006*"people" + 0.005*"world" + 0.005*"man" + '
  '0.004*"iran" + 0.004*"woman"'),
 (1,
  '0.004*"georgia" + 0.001*"south_ossetia" + 0.001*"cholera" + 0.001*"bali" + '
  '0.001*"promubarak" + 0.001*"retreat" + 0.001*"bureaucrat" + 0.001*"turban" '
  '+ 0.001*"fivestar" + 0.001*"entrance"'),
 (2,
  '0.016*"united_state" + 0.013*"russia" + 0.010*"china" + 0.006*"country" + '
  '0.006*"government" + 0.005*"world" + 0.005*"people" + 0.005*"syria" + '
  '0.005*"year" + 0.004*"police"')]


In [12]:
# Rank the topic mixture of the first corpus document, most probable first.
top_topics = sorted(
    lda_model.get_document_topics(corpus[0]),
    key=lambda topic_prob: topic_prob[1],
    reverse=True,
)

print(top_topics)

[(0, 0.6280558), (2, 0.20239744), (1, 0.16954675)]


In [13]:
# Build the pyLDAvis view of the trained model and write it to 'lda.html'.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
pyLDAvis.save_html(vis, 'lda.html')