In [1]:
# Importing the required libraries
import spacy
from spacy.lang.en import English

In [2]:
# Shared spaCy English pipeline object used by tokenize() below.
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    Whitespace-only tokens are dropped, URL-like tokens become the
    placeholder ``'URL'``, tokens starting with ``'@'`` become
    ``'SCREEN_NAME'``, and every other token is lower-cased.

    Returns a list of strings in document order.
    """
    def normalise(token):
        # URL detection takes priority over the '@' check, as in an if/elif chain.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

In [3]:
import nltk
# Fetch the WordNet corpus that the wn.morphy / WordNetLemmatizer helpers below read;
# per the recorded output this is a no-op when the package is already up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/diptanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [1]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` unchanged when ``wn.morphy`` has no base form
    for it (i.e. returns ``None``).
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatise ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [5]:
# Fetch NLTK's stop-word lists, then keep the English list as a set so the
# `token not in en_stop` membership test in prepare_text_for_lda is a set lookup.
# Relies on `nltk` having been imported by an earlier cell.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diptanu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas fed to the LDA model.

    Steps, in order: tokenise with ``tokenize``; keep only tokens longer
    than four characters; drop tokens found in ``en_stop``; map each
    survivor through ``get_lemma``.

    Note that the length filter also removes the three-character ``'URL'``
    placeholder that ``tokenize`` emits.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [25]:
import csv
import random
# Build a ~1% random sample of tokenised documents from the CSV.
text_data = []
# Dedicated, seeded generator: the sample (and every model trained on it)
# is the same on each run, and the global `random` state is left untouched.
sampler = random.Random(42)
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
# NOTE(review): no explicit encoding is given, so the platform default is used — confirm.
with open('cleaned_test_data_v1.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        # Blank or short rows have no fifth column; skip them instead of
        # failing with IndexError on row[4].
        if len(row) <= 4:
            continue
        # Draw first, tokenise second: the ~99% of rows that are rejected
        # never pay for the spaCy / WordNet pipeline.
        if sampler.random() > .99:
            # Column index 4 is presumably the free-text field — verify against the CSV schema.
            text_data.append(prepare_text_for_lda(row[4]))

In [7]:
import random
# NOTE(review): this cell rebinds `text_data`. Run top to bottom, it replaces the
# sample built from 'cleaned_test_data_v1.csv' by the preceding cell; the
# execution counts suggest the CSV cell was actually the one run last. Keep only
# the loader that matches the intended data set.
text_data = []
# Dedicated, seeded generator so the ~1% sample is reproducible.
sampler = random.Random(42)
with open('dataset.csv') as f:
    for line in f:
        # Draw first, tokenise second: rejected lines (about 99%) skip the
        # tokenisation / lemmatisation work entirely.
        if sampler.random() > .99:
            text_data.append(prepare_text_for_lda(line))

In [26]:
from gensim import corpora

In [27]:
# Build the gensim vocabulary (token <-> integer id mapping) from the sampled token lists.
dictionary = corpora.Dictionary(text_data)

In [28]:
# Convert every sampled document into its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, text_data))


In [29]:
import pickle
# Persist the bag-of-words corpus and the dictionary so later cells can reload them.
# The context manager guarantees the pickle file is flushed and closed, rather
# than leaving that to garbage collection of an anonymous file object.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [34]:
import gensim
NUM_TOPICS = 5
# Train a 5-topic LDA model with 15 passes over the corpus.
# random_state pins the model's random initialisation so that re-running the
# cell on the same corpus reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# Saved under a fixed name because a later cell reloads 'model5.gensim'.
ldamodel.save('model5.gensim')

In [36]:
# Show each topic as an (id, "weight*word + ...") pair, limited to its four leading words.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.057*"control" + 0.057*"legal" + 0.057*"pushing" + 0.057*"brief"')
(1, '0.046*"circus" + 0.046*"democrat" + 0.046*"receive" + 0.046*"safety"')
(2, '0.061*"conflict" + 0.033*"kavanaughconfirmationhearings" + 0.033*"postpone" + 0.033*"book"')
(3, '0.034*"extremely" + 0.034*"controversy" + 0.034*"controversial" + 0.034*"going"')
(4, '0.031*"conservative" + 0.031*"legal" + 0.031*"address" + 0.031*"license"')


In [37]:
# Score an unseen document against the trained model.
# Distinct names for the raw string, its tokens and its bag-of-words keep each
# variable a single kind of thing (the title stays a str throughout).
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc_tokens = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc_tokens)
# In the recorded run this prints [] — none of the title's tokens were in the
# dictionary — and the model then reports an even 0.2 for each of the 5 topics,
# i.e. it had no evidence to prefer any topic.
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]


In [38]:
# Retrain with 3 topics. This rebinds `ldamodel`, so the 5-topic model is from
# here on only available through its saved file.
# random_state pins the random initialisation so the topics are reproducible.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
# Show the four leading words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.030*"conflict" + 0.030*"antifa" + 0.017*"nathaliegdrouin" + 0.017*"investigation"')
(1, '0.056*"control" + 0.032*"legal" + 0.032*"school" + 0.032*"current"')
(2, '0.036*"conservative" + 0.036*"doctrinaire" + 0.036*"lose" + 0.036*"policy"')


In [39]:

# Retrain with 10 topics. This rebinds `ldamodel` again; earlier models remain
# available only through their saved files.
# random_state pins the random initialisation so the topics are reproducible.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
# Show the four leading words of each topic.
# NOTE(review): in the recorded run several topics share identical flat 0.013
# weights, which may indicate more topics than this small sample supports — confirm.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.081*"legal" + 0.081*"video" + 0.081*"brief" + 0.081*"pushing"')
(1, '0.095*"postpone" + 0.095*"vote" + 0.095*"book" + 0.095*"kavanaughconfirmationhearings"')
(2, '0.013*"farticus" + 0.013*"maybe" + 0.013*"book" + 0.013*"kavanaughconfirmationhearings"')
(3, '0.013*"maybe" + 0.013*"farticus" + 0.013*"vote" + 0.013*"book"')
(4, '0.013*"farticus" + 0.013*"maybe" + 0.013*"brief" + 0.013*"control"')
(5, '0.038*"antifa" + 0.038*"project" + 0.038*"conservative" + 0.038*"socialist"')
(6, '0.062*"control" + 0.062*"controversy" + 0.062*"shooting" + 0.062*"current"')
(7, '0.013*"farticus" + 0.013*"maybe" + 0.013*"postpone" + 0.013*"book"')
(8, '0.059*"conflict" + 0.031*"general" + 0.031*"explanation" + 0.031*"corruption"')
(9, '0.056*"going" + 0.056*"doctrinaire" + 0.056*"prosperity" + 0.056*"forward"')


In [40]:
# Reload the persisted dictionary, corpus and 5-topic model from disk.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# Context manager closes the file deterministically instead of leaking the handle.
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# 'corpus.pkl' that this notebook itself produced.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [41]:
import pyLDAvis.gensim
# Prepare the visualisation data for the reloaded 5-topic model; sort_topics=False
# is passed so that pyLDAvis does not reorder the topics.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Render the interactive view as the cell output.
pyLDAvis.display(lda_display)

In [21]:
# Same visualisation for the saved 3-topic model. Relies on `pyLDAvis`, `corpus`
# and `dictionary` already being defined by earlier cells.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

  and should_run_async(code)
