In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator with a fixed value (400).
np.random.seed(400)

In [3]:
import nltk

In [4]:
# Verb lemmatization demo: the irregular past tense 'went' is mapped to its base form.
verb_lemmatizer = WordNetLemmatizer()
print(verb_lemmatizer.lemmatize('went', pos='v'))

go


In [6]:
import pandas as pd

# English Snowball stemmer; defined at module level because `lemmatize_stemming` reuses it.
stemmer = SnowballStemmer("english")

# Sample words covering plurals, past tenses, gerunds and derivational suffixes.
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]

# Stem every sample word, preserving order so rows line up with `original_words`.
singles = list(map(stemmer.stem, original_words))

# Side-by-side table of each word and its stem (rendered as the cell output).
pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [7]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    """Lemmatize `text` as a verb, then stem the lemma.

    Uses a fresh `WordNetLemmatizer` for the lemma and the module-level
    `stemmer` for the stem. Returns the value produced by `stemmer.stem`.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize, filter and normalize a single document
def preprocess(text):
    """Turn raw `text` into a list of normalized tokens.

    The text is tokenized with gensim's `simple_preprocess`. Tokens that are
    in gensim's `STOPWORDS` or that have 3 or fewer characters are dropped;
    every remaining token is passed through `lemmatize_stemming`.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [8]:
'''
Preview a document after preprocessing
'''
document_num = 50  # not used in this cell
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

# Raw view: split on single spaces only, so punctuation stays attached to the words.
print("Original document: ")
words = doc_sample.split(' ')
print(words)

# Processed view: the same sentence after running it through `preprocess`.
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [43]:
# Load the CSV. Malformed rows are skipped with a warning instead of aborting
# the load. `on_bad_lines='warn'` is the replacement for the former
# `error_bad_lines=False` flag, which pandas deprecated in 1.3 and removed in
# 2.0 (so this call requires pandas >= 1.3).
data = pd.read_csv('./y1819.csv', on_bad_lines='warn')

# Name of the column that holds the document text.
datacolumn = "Text"

# Keep only the text column (as a one-column DataFrame) and coerce every cell
# to `str` so that each value handed to the tokenizer is a string.
data_text = data[[datacolumn]].astype('str')

In [44]:
# Show the text column that is about to be processed.
print(data_text)

# Preprocess every document. The column is selected through `datacolumn`
# (set in the loading cell) instead of repeating the literal column name, so
# the two cells cannot drift apart if the column is changed.
processed_docs = [preprocess(doc_text) for doc_text in data_text[datacolumn]]

# Sanity check: one token list per input row.
print(len(processed_docs))

                                                    Text
0        The Indian economy is witnessing a “cyclical...
1        Citizens’ date with democracy falls on a wee...
2        India on April 24 said protectionism in all ...
3        The government on Tuesday opened for public ...
4       The price of petrol rose to a four-year high ...
...                                                  ...
59141   As an unusually hot summer takes off, outer z...
59142   Japanese electronics company Murata Manufactu...
59143   Ram Janmabhoomi is an emotional issue and can...
59144   The Vidhana Soudha police arrested six people...
59145   Senior Indian Police Service officer Param Bi...

[59146 rows x 1 columns]
59146


In [45]:
# Inspect the token lists produced for the first two documents.
processed_docs[:2]

[['indian',
  'economi',
  'wit',
  'cyclic',
  'upsw',
  'countri',
  'like',
  'clock',
  'growth',
  'financi',
  'year',
  'say',
  'deutsch',
  'bank',
  'research',
  'report',
  'current',
  'growth',
  'forecast',
  'estim',
  'mark',
  'improv',
  'like',
  'turn',
  'global',
  'financi',
  'servic',
  'major',
  'say',
  'reserv',
  'bank',
  'expect',
  'india',
  'econom',
  'growth',
  'rate',
  'strengthen',
  'current',
  'fiscal',
  'account',
  'reviv',
  'invest',
  'activ',
  'report',
  'note',
  'higher',
  'global',
  'price',
  'risk',
  'earlier',
  'anticip',
  'rate',
  'hike',
  'cycl',
  'potenti',
  'negat',
  'impact',
  'bank',
  'sector',
  'fraud',
  'credit',
  'overal',
  'growth',
  'factor',
  'pose',
  'downsid',
  'risk',
  'baselin',
  'estim',
  'brent',
  'crude',
  'price',
  'current',
  'hover',
  'barrel',
  'decemb',
  'level',
  'accord',
  'deutsch',
  'bank',
  'research',
  'report',
  'increas',
  'price',
  'shave',
  'growth',
  'f

In [46]:
# Build the gensim Dictionary (integer id <-> token vocabulary) from all preprocessed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [47]:
'''
Checking dictionary created
'''
# Spot check: print the first 11 (id, token) pairs of the vocabulary.
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    # `count` is the number of pairs printed so far; stop after the 11th.
    if count > 10:
        break

0 accord
1 account
2 activ
3 add
4 addit
5 anticip
6 bank
7 barrel
8 baselin
9 bode
10 brent


In [48]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
# Prune the vocabulary in place. Per gensim's documentation for filter_extremes:
#   no_below=15   -> drop tokens that occur in fewer than 15 *documents*
#                    (a document count, not a total occurrence count)
#   no_above=0.1  -> drop tokens that occur in more than 10% of all documents
#   keep_n=100000 -> after the two filters, keep at most the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [49]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
# Encode every document with `dictionary.doc2bow`; later cells read each entry
# of a document's result as a (token_id, count) pair.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [50]:
document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each bag-of-words entry is a (token_id, count) pair; the id is looked up in
# `dictionary` so the token itself is shown next to its id.
for token_id, occurrences in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

Word 29 ("forward") appears 2 time.
Word 58 ("rate") appears 1 time.
Word 59 ("recoveri") appears 3 time.
Word 72 ("turn") appears 1 time.
Word 127 ("reason") appears 1 time.
Word 155 ("agricultur") appears 1 time.
Word 172 ("commit") appears 1 time.
Word 284 ("regim") appears 1 time.
Word 330 ("maharashtra") appears 1 time.
Word 340 ("pump") appears 4 time.
Word 438 ("loan") appears 1 time.
Word 513 ("electr") appears 3 time.
Word 604 ("step") appears 1 time.
Word 606 ("suppli") appears 2 time.
Word 647 ("hour") appears 4 time.
Word 694 ("capabl") appears 1 time.
Word 795 ("despit") appears 1 time.
Word 931 ("onlin") appears 1 time.
Word 1102 ("appeal") appears 3 time.
Word 1103 ("attempt") appears 1 time.
Word 1104 ("authoritarian") appears 1 time.
Word 1105 ("bill") appears 2 time.
Word 1106 ("chandrashekhar") appears 1 time.
Word 1107 ("contempt") appears 1 time.
Word 1108 ("cut") appears 1 time.
Word 1109 ("drastic") appears 1 time.
Word 1110 ("due") appears 5 time.
Word 1111 ("fa

In [52]:
# Train an LDA topic model on the bag-of-words corpus: 25 topics, 8 passes,
# 2 workers, with `dictionary` supplied as the id -> token mapping.
# NOTE(review): no `random_state` is passed, so the fitted topics presumably
# depend on NumPy's global RNG state at the moment this cell runs — confirm,
# and pass an explicit seed if reproducible topics are required.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 25, 
                                   id2word = dictionary,                                    
                                   passes = 8,
                                   workers = 2)

In [53]:
# Print every topic id with its formatted term list, then a blank separator.
for topic_id, topic_terms in lda_model.print_topics(-1):
    summary = "Topic: {} \nWords: {}".format(topic_id, topic_terms)
    print(summary)
    print("\n")

Topic: 0 
Words: 0.041*"tamil" + 0.036*"nadu" + 0.018*"aiadmk" + 0.013*"pakistan" + 0.012*"chennai" + 0.008*"stalin" + 0.007*"prime" + 0.007*"talk" + 0.007*"palaniswami" + 0.006*"modi"


Topic: 1 
Words: 0.065*"farmer" + 0.026*"price" + 0.022*"agricultur" + 0.018*"crop" + 0.017*"farm" + 0.011*"land" + 0.010*"suppli" + 0.009*"market" + 0.009*"product" + 0.009*"cultiv"


Topic: 2 
Words: 0.033*"seat" + 0.030*"poll" + 0.030*"candid" + 0.028*"constitu" + 0.027*"sabha" + 0.021*"vote" + 0.019*"contest" + 0.018*"voter" + 0.016*"allianc" + 0.011*"elector"


Topic: 3 
Words: 0.030*"vehicl" + 0.021*"transport" + 0.020*"airport" + 0.015*"traffic" + 0.013*"passeng" + 0.012*"bus" + 0.011*"flight" + 0.010*"travel" + 0.009*"driver" + 0.009*"drive"


Topic: 4 
Words: 0.041*"modi" + 0.028*"gandhi" + 0.024*"prime" + 0.020*"maharashtra" + 0.015*"narendra" + 0.014*"defenc" + 0.014*"deal" + 0.014*"rafal" + 0.013*"mumbai" + 0.013*"bengal"


Topic: 5 
Words: 0.015*"victim" + 0.011*"woman" + 0.010*"hospit" + 

In [59]:
# Count the tokens, across all preprocessed documents, that contain the
# substring "corrupt" (every matching token occurrence is counted).
count = sum(
    1
    for doc_tokens in processed_docs
    for token in doc_tokens
    if "corrupt" in token
)

print(count)

3214


In [72]:
# Print "yya" once for every topic term that contains "student".
#
# The previous version tested `"student" in a`, where `a` is the list of whole
# 'weight*"word"' strings; list membership requires an exact element match, so
# the condition could never be true. The word part of each term is tested now.
count = 0  # reset only; this cell does not otherwise use it
for ids, topics in lda_model.print_topics(-1):
    # A formatted topic looks like: 0.041*"tamil" + 0.036*"nadu" + ...
    for weighted_term in topics.split("+"):
        # Keep the part after '*', i.e. the quoted word without its weight.
        word = weighted_term.split("*")[-1]
        if "student" in word:
            print("yya")

0.041*"tamil" 
 0.036*"nadu" 
 0.018*"aiadmk" 
 0.013*"pakistan" 
 0.012*"chennai" 
 0.008*"stalin" 
 0.007*"prime" 
 0.007*"talk" 
 0.007*"palaniswami" 
 0.006*"modi"
0.065*"farmer" 
 0.026*"price" 
 0.022*"agricultur" 
 0.018*"crop" 
 0.017*"farm" 
 0.011*"land" 
 0.010*"suppli" 
 0.009*"market" 
 0.009*"product" 
 0.009*"cultiv"
0.033*"seat" 
 0.030*"poll" 
 0.030*"candid" 
 0.028*"constitu" 
 0.027*"sabha" 
 0.021*"vote" 
 0.019*"contest" 
 0.018*"voter" 
 0.016*"allianc" 
 0.011*"elector"
0.030*"vehicl" 
 0.021*"transport" 
 0.020*"airport" 
 0.015*"traffic" 
 0.013*"passeng" 
 0.012*"bus" 
 0.011*"flight" 
 0.010*"travel" 
 0.009*"driver" 
 0.009*"drive"
0.041*"modi" 
 0.028*"gandhi" 
 0.024*"prime" 
 0.020*"maharashtra" 
 0.015*"narendra" 
 0.014*"defenc" 
 0.014*"deal" 
 0.014*"rafal" 
 0.013*"mumbai" 
 0.013*"bengal"
0.015*"victim" 
 0.011*"woman" 
 0.010*"hospit" 
 0.010*"girl" 
 0.009*"spot" 
 0.009*"death" 
 0.008*"murder" 
 0.008*"kill" 
 0.008*"villag" 
 0.008*"identifi"
