In [9]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import itertools
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import similarities
from gensim import corpora, models, similarities
from gensim.models.ldamulticore import LdaMulticore

# spacy for lemmatization
import spacy

import seaborn as sns
from nltk.corpus import stopwords 
import string
from nltk import word_tokenize
from nltk import FreqDist

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
import time
start_time = time.time()

### Counting the count of each word in the corpus:

In [10]:
word_dict = {}
def count_words(word_list):
    for w in word_list:
        if w in word_dict.keys():
            word_dict[w] = word_dict[w] + 1
        else:
            word_dict[w] = 1

### Reading each document as token of words(lemmatized) in a list:

In [11]:
import glob
doc_corpus = []
def document_tokens(source):
    files = glob.glob(source+'/*.txt') 
    combined_txt = ''
    for file in files:
        in_file = open(file,'r')
        txt = in_file.read()
        lemmatized_tokens = []
        tokens = word_tokenize(txt)
        lemmatized_tokens = [w for w in tokens if len(w)!=1] 
        count_words(lemmatized_tokens) 
        doc_corpus.append(lemmatized_tokens)

In [12]:
t = time.time()
document_tokens("../pre-processing/clean-source")
document_tokens("../pre-processing/clean-target")
document_tokens("../pre-processing/clean-test/clean-suspected-algo")
document_tokens("../pre-processing/clean-test/clean-suspected-no-algo")
document_tokens("../pre-processing/clean-fullcorpus")
print("Time taken to read:", (time.time()-t)/60)

Time taken to read: 24.906488080819447


In [13]:
print("Number of total documents:",len(doc_corpus))

Number of total documents: 11612


In [14]:
sorted_words = [(k, word_dict[k]) for k in sorted(word_dict, key=word_dict.get, reverse=True)]

### Number of unique words in the whole corpus:

In [15]:
len(sorted_words)

5330376

### Removing top 50 words and the words whose frequency is 1:

In [16]:
words_remove = []
for s in sorted_words[:50]:
    words_remove.append(s[0])
    
for s in sorted_words[::-1]:
    if s[1]==1:
        words_remove.append(s[0])

In [34]:
start_time = time.time()
short_corpus=[]
for d in range(0,len(doc_corpus)):
    short_corpus.append([i for i in doc_corpus[d] if i not in words_remove])
    
print("Time taken to remove words:", (time.time()-start_time)/60)

KeyboardInterrupt: 

### Number of unique words reduced to:

In [None]:
all_words = [word for item in short_corpus for word in item]
# use nltk fdist to get a frequency distribution of all words
fdist = FreqDist(all_words)
len(fdist) # number of unique words

In [None]:
len_doc = []
len_short=[]
for i in range(0,len(doc_corpus)):
    len_doc.append(len(doc_corpus[i]))
    len_short.append(len(short_corpus[i]))

In [None]:
a4_dims = (15, 9)
fig, axes = plt.subplots(1, 2, figsize=a4_dims)
axes[0].plot(len_doc)
axes[0].set_title("Number of words in every document before preprocessing")
axes[1].plot(len_short)
axes[1].set_title("Number of words in every document after preprocessing")

In [None]:
id2word = corpora.Dictionary(short_corpus)

# Create Corpus
texts = short_corpus

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]


### LDA modelling for topics=5 to 15. Best results at 8. (Without using Multicore)

In [None]:
t = time.time()
coherence_score = []
perplexity = []
topics = [i for i in range(5,15)]
for j in range(0,len(topics)):
    t1 = time.time()
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=topics[j],random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    #lda_model.save("models/lda_model" + str(j) +".model")
    perplexity.append(lda_model.log_perplexity(corpus))
    coherence_model_lda = CoherenceModel(model=lda_model, texts=short_corpus, dictionary=id2word, coherence='c_v')
    coherence_score.append(coherence_model_lda.get_coherence())
    t2 = time.time()
    print("Time taken to model for",topics[j],'topics=',(t2-t1)/60,"mins.")
    
print("Total time taken:",(time.time()-t)/60)

In [None]:
a4_dims = (15, 6)
fig, axes = plt.subplots(1, 2, figsize=a4_dims)

axes[0].plot(topics,coherence_score ,marker='o', markerfacecolor='blue', markersize=12, color='#E69F00', linewidth=4)
axes[0].set_title('Distribution of Coherence Score by number of topics')
axes[0].set_xlabel('Number of topics')
axes[0].set_ylabel('Coherence score')

axes[1].plot(topics,perplexity ,marker='o', markerfacecolor='blue', markersize=12, color='#E69F00', linewidth=4)
axes[1].set_title('Distribution of Perplexity by number of topics')
axes[1].set_xlabel('Number of topics')
axes[1].set_ylabel('Perplexity')

### LDA modelling for topics=5 to 15. (With using Multicore)

In [None]:
t = time.time()
coherence_score = []
perplexity = []
topics = [i for i in range(5,15)]
for j in range(0,len(topics)):
    t1 = time.time()
    lda_model = LdaMulticore(corpus=corpus,id2word=id2word,num_topics=topics[j],random_state=100,chunksize=100, 
                             passes=10, workers=3)
    #lda_model.save("models/lda_model" + str(j) +".model")
    perplexity.append(lda_model.log_perplexity(corpus))
    coherence_model_lda = CoherenceModel(model=lda_model, texts=short_corpus, dictionary=id2word, coherence='c_v')
    coherence_score.append(coherence_model_lda.get_coherence())
    t2 = time.time()
    print("Time taken to model for",topics[j],'topics=',(t2-t1)/60,"mins.")
    
print("Total time taken:",(time.time()-t)/60)

In [None]:
a4_dims = (15, 6)
fig, axes = plt.subplots(1, 2, figsize=a4_dims)

axes[0].plot(topics,coherence_score ,marker='o', markerfacecolor='blue', markersize=12, color='#E69F00', linewidth=4)
axes[0].set_title('Distribution of Coherence Score by number of topics')
axes[0].set_xlabel('Number of topics')
axes[0].set_ylabel('Coherence score')

axes[1].plot(topics,perplexity ,marker='o', markerfacecolor='blue', markersize=12, color='#E69F00', linewidth=4)
axes[1].set_title('Distribution of Perplexity by number of topics')
axes[1].set_xlabel('Number of topics')
axes[1].set_ylabel('Perplexity')

### LDA modelling with best results, i.e. topics=8

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=8,random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

### Topic Distribution:

In [None]:
lda_model.show_topics()

### Importance of words shown for 5 topics:

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(3, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()
