<a href="https://colab.research.google.com/github/harikrishnareddymallavarapu/DeepAndNLP/blob/master/Gensim_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Run in python console
import nltk; 
nltk.download('stopwords')

# Run in terminal or command prompt
!python3 -m spacy download en

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import spacy

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [3]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


In [4]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [16]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts,bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    nlp = spacy.load('en', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [23]:
def createCorpusDict(dataFrame):
    #cleanse the text
    data = dataFrame.content.values.tolist()
    data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))
    
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops,bigram_mod)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en


    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    #cleanse the dictionary
    dictionary = corpora.Dictionary(data_lemmatized)
    # Create Corpus
    texts = data_lemmatized
    # Term Document Frequency
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    return corpus,dictionary,data_lemmatized

In [7]:
# Read dataFrame
#dataFrame = pd.read_csv("")
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [24]:
#Create Corpus and Dictionary
corpus, dictionary, data_lemmatized = createCorpusDict(df)



In [20]:
#Train the Topic Modelling
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [21]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
doc_lda

[(0,
  '0.051*"report" + 0.027*"black" + 0.020*"fire" + 0.020*"white" + '
  '0.016*"trial" + 0.016*"cover" + 0.015*"medium" + 0.013*"vote" + '
  '0.012*"minor" + 0.012*"title"'),
 (1,
  '0.021*"god" + 0.020*"accept" + 0.016*"member" + 0.015*"man" + '
  '0.014*"israeli" + 0.014*"season" + 0.012*"publish" + 0.012*"lebanese" + '
  '0.012*"jewish" + 0.011*"brain"'),
 (2,
  '0.017*"package" + 0.016*"press" + 0.015*"item" + 0.015*"break" + '
  '0.011*"level" + 0.010*"edge" + 0.009*"hole" + 0.007*"eye" + '
  '0.007*"equipment" + 0.007*"contribute"'),
 (3,
  '0.025*"pc" + 0.022*"contain" + 0.020*"input" + 0.020*"reality" + '
  '0.017*"picture" + 0.016*"object" + 0.016*"level" + 0.015*"box" + '
  '0.015*"quality" + 0.013*"greek"'),
 (4,
  '0.089*"ax" + 0.076*"max" + 0.032*"space" + 0.021*"launch" + 0.018*"di_di" + '
  '0.017*"orbit" + 0.016*"sphere" + 0.015*"satellite" + 0.014*"plane" + '
  '0.014*"mission"'),
 (5,
  '0.019*"people" + 0.017*"kill" + 0.015*"child" + 0.015*"government" + '
  '0.0

<gensim.interfaces.TransformedCorpus at 0x7fb4956e1630>

In [25]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.348722848931784

Coherence Score:  0.4392813747423439


In [None]:
import pyLDAvis.gensim
import pickle
import pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

# Getting the final distributions

In [35]:
gamma, _ = lda_model.inference(corpus)
topics = gamma / gamma.sum(axis=1)[:,None]

In [36]:
topics = pd.DataFrame(topics)
topics.reset_index(drop=True)
topics.rename(columns = lambda x: 'Topic_'+str(x+1).zfill(2), inplace=True)
topics.head()

Unnamed: 0,Topic_01,Topic_02,Topic_03,Topic_04,Topic_05,Topic_06,Topic_07,Topic_08,Topic_09,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19,Topic_20
0,0.005568,0.00886,0.010065,0.008012,0.002323,0.201965,0.01027,0.059479,0.009263,0.087445,0.009112,0.163939,0.009945,0.301126,0.005168,0.005386,0.022772,0.060215,0.010221,0.008866
1,0.021106,0.007232,0.008231,0.006715,0.002043,0.04107,0.02571,0.063513,0.021111,0.114989,0.007561,0.018888,0.048318,0.214393,0.004548,0.031919,0.023886,0.094253,0.236343,0.008171
2,0.002428,0.003708,0.022452,0.003331,0.001022,0.005746,0.129505,0.106848,0.004264,0.086618,0.005886,0.017705,0.004719,0.450351,0.002224,0.002354,0.002775,0.067765,0.039745,0.040554
3,0.006643,0.010067,0.011889,0.009048,0.058412,0.012697,0.014352,0.105799,0.010583,0.054856,0.010499,0.00702,0.030312,0.413654,0.006129,0.006444,0.007568,0.14243,0.050077,0.031523
4,0.011891,0.006986,0.008006,0.006326,0.022238,0.009021,0.086808,0.327276,0.007644,0.048998,0.007352,0.005196,0.021556,0.247435,0.004515,0.005542,0.005448,0.153544,0.006743,0.007473


In [38]:
finalData = pd.concat([df,topics], axis=1,sort=False)
finalData.head()

Unnamed: 0,content,target,target_names,Topic_01,Topic_02,Topic_03,Topic_04,Topic_05,Topic_06,Topic_07,Topic_08,Topic_09,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19,Topic_20
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos,0.005568,0.00886,0.010065,0.008012,0.002323,0.201965,0.01027,0.059479,0.009263,0.087445,0.009112,0.163939,0.009945,0.301126,0.005168,0.005386,0.022772,0.060215,0.010221,0.008866
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware,0.021106,0.007232,0.008231,0.006715,0.002043,0.04107,0.02571,0.063513,0.021111,0.114989,0.007561,0.018888,0.048318,0.214393,0.004548,0.031919,0.023886,0.094253,0.236343,0.008171
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware,0.002428,0.003708,0.022452,0.003331,0.001022,0.005746,0.129505,0.106848,0.004264,0.086618,0.005886,0.017705,0.004719,0.450351,0.002224,0.002354,0.002775,0.067765,0.039745,0.040554
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics,0.006643,0.010067,0.011889,0.009048,0.058412,0.012697,0.014352,0.105799,0.010583,0.054856,0.010499,0.00702,0.030312,0.413654,0.006129,0.006444,0.007568,0.14243,0.050077,0.031523
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space,0.011891,0.006986,0.008006,0.006326,0.022238,0.009021,0.086808,0.327276,0.007644,0.048998,0.007352,0.005196,0.021556,0.247435,0.004515,0.005542,0.005448,0.153544,0.006743,0.007473


In [41]:
topics2 = pd.DataFrame(topics.columns.values[np.argsort(-topics.values, axis=1)[:, :3]],columns = ['1st Max','2nd Max','3rd Max']).reset_index()

In [42]:
topics2.head()

Unnamed: 0,index,1st Max,2nd Max,3rd Max
0,0,Topic_14,Topic_06,Topic_12
1,1,Topic_19,Topic_14,Topic_10
2,2,Topic_14,Topic_07,Topic_08
3,3,Topic_14,Topic_18,Topic_08
4,4,Topic_08,Topic_14,Topic_18


In [44]:
finalData = pd.concat([finalData,topics2], axis=1,sort=False)
finalData.head()

Unnamed: 0,content,target,target_names,Topic_01,Topic_02,Topic_03,Topic_04,Topic_05,Topic_06,Topic_07,Topic_08,Topic_09,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19,Topic_20,index,1st Max,2nd Max,3rd Max,index.1,1st Max.1,2nd Max.1,3rd Max.1
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos,0.005568,0.00886,0.010065,0.008012,0.002323,0.201965,0.01027,0.059479,0.009263,0.087445,0.009112,0.163939,0.009945,0.301126,0.005168,0.005386,0.022772,0.060215,0.010221,0.008866,0,Topic_14,Topic_06,Topic_12,0,Topic_14,Topic_06,Topic_12
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware,0.021106,0.007232,0.008231,0.006715,0.002043,0.04107,0.02571,0.063513,0.021111,0.114989,0.007561,0.018888,0.048318,0.214393,0.004548,0.031919,0.023886,0.094253,0.236343,0.008171,1,Topic_19,Topic_14,Topic_10,1,Topic_19,Topic_14,Topic_10
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware,0.002428,0.003708,0.022452,0.003331,0.001022,0.005746,0.129505,0.106848,0.004264,0.086618,0.005886,0.017705,0.004719,0.450351,0.002224,0.002354,0.002775,0.067765,0.039745,0.040554,2,Topic_14,Topic_07,Topic_08,2,Topic_14,Topic_07,Topic_08
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics,0.006643,0.010067,0.011889,0.009048,0.058412,0.012697,0.014352,0.105799,0.010583,0.054856,0.010499,0.00702,0.030312,0.413654,0.006129,0.006444,0.007568,0.14243,0.050077,0.031523,3,Topic_14,Topic_18,Topic_08,3,Topic_14,Topic_18,Topic_08
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space,0.011891,0.006986,0.008006,0.006326,0.022238,0.009021,0.086808,0.327276,0.007644,0.048998,0.007352,0.005196,0.021556,0.247435,0.004515,0.005542,0.005448,0.153544,0.006743,0.007473,4,Topic_08,Topic_14,Topic_18,4,Topic_08,Topic_14,Topic_18
