In [2]:
import spacy
# NOTE(review): the return value of this call is discarded; `parser` below is
# built from English() instead. Presumably this only checks that the
# 'en_core_web_sm' model is installed — confirm whether it can be dropped.
spacy.load('en_core_web_sm')
from spacy.lang.en import English
# Module-level object that tokenize() calls to split raw text into tokens.
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens for topic modelling.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by the
    placeholder ``'URL'``, tokens starting with ``@`` are replaced by
    ``'SCREEN_NAME'``, and every other token is lower-cased.

    Args:
        text: Raw string passed to the module-level ``parser``.

    Returns:
        list: The normalised tokens, in the order ``parser`` produced them.
    """
    def _normalise(tok):
        # The URL check comes first, so a URL-like token that also starts
        # with '@' is still reported as 'URL'.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [3]:
import nltk
# Download the WordNet corpus that the lemmatisation helpers in the next
# cell import from nltk.corpus.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [4]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the ``wn.morphy`` base form of ``word``, falling back to ``word``.

    Args:
        word: Token to look up.

    Returns:
        The result of ``wn.morphy(word)`` unless that result is ``None``, in
        which case ``word`` is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatise ``word`` with a freshly constructed ``WordNetLemmatizer``.

    Only ``word`` is passed to ``lemmatize``; no other arguments are supplied.

    Args:
        word: Token to lemmatise.

    Returns:
        Whatever ``WordNetLemmatizer().lemmatize(word)`` returns.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(word)
    return lemma

In [5]:
# Print each sample word next to the results of get_lemma() and get_lemma2()
# so the two helpers can be compared side by side.
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [6]:
# Download NLTK's stop-word corpus, then keep the English list as a set;
# prepare_text_for_lda() tests token membership against it.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas fed to the topic model.

    The text is tokenised with ``tokenize``. Tokens of four characters or
    fewer and tokens present in ``en_stop`` are discarded, and each surviving
    token is mapped through ``get_lemma``.

    Args:
        text: Raw document string.

    Returns:
        list: The lemmatised tokens, in their original order.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [8]:
import random

# A line is kept only when the random draw exceeds this threshold, i.e.
# roughly 1% of the lines in dataset.csv end up in the working corpus.
SAMPLE_THRESHOLD = .99
# Fixed seed so that Restart & Run All selects the same lines every time.
SAMPLE_SEED = 42

# A private generator keeps the sampling independent of the global
# `random` module state.
sampler = random.Random(SAMPLE_SEED)
text_data = []
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm the encoding of dataset.csv.
with open('dataset.csv') as f:
    for line in f:
        # Draw first so tokenisation and lemmatisation only run for the
        # lines that are actually kept.
        if sampler.random() > SAMPLE_THRESHOLD:
            tokens = prepare_text_for_lda(line)
            print(tokens)
            text_data.append(tokens)

['superpeer', 'network', 'emerge']
['supporting', 'list', 'model', 'timely', 'approach']
['opinion', 'receive', 'online', 'community', 'study', 'helpfulness', 'vote']
['decoder', 'density', 'parity', 'check', 'convolutional', 'code', 'large', 'memory']
['personal', 'voice', 'assistant', 'voicexml', 'distribute', 'environment']
['efficient', 'hardware', 'architecture', 'address', 'lookup']
['shuffling', 'stack', 'partially', 'randomize', 'ranking', 'search', 'engine', 'result']
['trajectory', 'improve', 'delivery', 'vehicular', 'network']
['integrate', 'converter', 'compact', 'efficient', 'hybrid', 'power', 'system']
['peceptual', 'distortion', 'metric', 'base', 'wavelet', 'frequency', 'sensitivity', 'multiple', 'visual', 'fixation']
['iolaus', 'secure', 'online', 'content', 'rating', 'system']
['novel', 'mosfet', 'bandgap', 'voltage', 'reference']
['effect', 'skew', 'access', 'buffer', 'contention', 'sharing', 'environment']
['analyze', 'heading', 'consider', 'various', 'presentation']

In [10]:
from gensim import corpora
# Build the gensim Dictionary from the sampled, tokenised documents; it is
# used below for doc2bow() and as the model's id2word mapping.
dictionary = corpora.Dictionary(text_data)

In [11]:
# Convert every tokenised document into its doc2bow() representation.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in text_data]

In [12]:
import pickle

# Persist the corpus and the dictionary so they can be reloaded from disk.
# The context manager guarantees the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle behind.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

### Try 5 topics

In [13]:
import gensim

NUM_TOPICS = 5
# random_state pins the model's random initialisation so that the learned
# topics are the same on every Restart & Run All.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')

In [14]:
# Print one (topic id, weighted-terms string) tuple per topic, limited to
# four words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.037*"environment" + 0.037*"sharing" + 0.037*"effect" + 0.037*"access"')
(1, '0.022*"online" + 0.022*"system" + 0.022*"opinion" + 0.022*"application"')
(2, '0.036*"efficient" + 0.036*"hardware" + 0.020*"network" + 0.020*"content"')
(3, '0.021*"sensitivity" + 0.021*"base" + 0.021*"frequency" + 0.021*"wavelet"')
(4, '0.044*"consider" + 0.044*"various" + 0.044*"presentation" + 0.044*"heading"')


In [15]:
# Score an unseen title against the current model: preprocess it, convert it
# to bag-of-words form with the existing dictionary, then print that form
# followed by the topic distribution the model assigns to it.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(87, 1)]
[(0, 0.10003936), (1, 0.59986997), (2, 0.10002134), (3, 0.10002281), (4, 0.10004652)]


In [16]:
# Train and save a 3-topic model, then print four words per topic.
# random_state pins the random initialisation so the printed topics are
# reproducible across runs.
# NOTE: this rebinds `ldamodel` to the 3-topic model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.017*"online" + 0.017*"system" + 0.017*"wavelet" + 0.017*"metric"')
(1, '0.029*"efficient" + 0.029*"hardware" + 0.017*"filter" + 0.017*"length"')
(2, '0.033*"environment" + 0.019*"network" + 0.019*"effect" + 0.019*"contention"')


In [17]:
# Train and save a 10-topic model, then print four words per topic.
# random_state pins the random initialisation so the printed topics are
# reproducible across runs.
# NOTE: this rebinds `ldamodel` to the 10-topic model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.052*"online" + 0.052*"receive" + 0.052*"study" + 0.052*"community"')
(1, '0.040*"decoder" + 0.040*"convolutional" + 0.040*"memory" + 0.040*"engine"')
(2, '0.009*"network" + 0.009*"list" + 0.009*"hardware" + 0.009*"primary"')
(3, '0.068*"voltage" + 0.068*"novel" + 0.068*"reference" + 0.068*"bandgap"')
(4, '0.050*"environment" + 0.050*"voicexml" + 0.050*"distribute" + 0.050*"voice"')
(5, '0.045*"efficient" + 0.024*"digital" + 0.024*"implementation" + 0.024*"filter"')
(6, '0.052*"hardware" + 0.052*"network" + 0.052*"address" + 0.052*"trajectory"')
(7, '0.057*"approach" + 0.057*"supporting" + 0.057*"timely" + 0.057*"model"')
(8, '0.057*"heading" + 0.057*"analyze" + 0.057*"various" + 0.057*"consider"')
(9, '0.040*"system" + 0.040*"visual" + 0.040*"peceptual" + 0.040*"frequency"')


### pyLDAvis

In [18]:
# Reload the dictionary, corpus and 5-topic model from the files on disk.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: unpickling can execute arbitrary code; only load corpus.pkl when it
# is a file you produced yourself. The context manager closes the handle as
# soon as loading finishes.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [20]:
# NOTE(review): newer pyLDAvis releases appear to ship this adapter as
# `pyLDAvis.gensim_models` — confirm against the installed version.
import pyLDAvis.gensim
# Prepare and display the visualisation for the 5-topic model loaded as `lda`.
# sort_topics=False presumably keeps pyLDAvis from reordering the topics —
# TODO confirm.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
# Load the saved 3-topic model and display its pyLDAvis visualisation,
# using the same corpus and dictionary as the other visualisation cells.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [22]:
# Load the saved 10-topic model and display its pyLDAvis visualisation,
# using the same corpus and dictionary as the other visualisation cells.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
