In [74]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy
import pandas as pd

from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim
import wikipedia
import re

import warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline

In [2]:
# Reading the data
# The context manager closes the file handle as soon as the contents are
# in memory (the bare open(...).read() left it to the garbage collector).
with open('train_news.txt', 'r') as train_file:
    text = train_file.read()

In [3]:
text



In [4]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The rules are applied one after another in the order listed below, and
    matching is case-sensitive. The irregular forms come first so that the
    generic "n't" rule does not mangle them.
    """
    rewrite_rules = (
        # specific (irregular) forms
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general suffixes
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),  # also rewrites possessives, e.g. "India's" -> "India is"
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rewrite_rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [24]:
def clean(sent):
    """Normalise one raw line of text before it is handed to spaCy.

    Steps, in order:
      1. expand contractions via ``decontracted``;
      2. strip http/https URLs and e-mail-like tokens;
      3. drop every whitespace-delimited token that contains a digit;
      4. replace each run of non-alphanumeric characters with one space;
      5. lower-case the text;
      6. keep only words longer than two characters.

    Returns the remaining words joined by single spaces.
    """
    sent = decontracted(sent)
    sent = re.sub(r'https?://\S+', '', sent)
    sent = re.sub(r'[\w\.-]+@[\w\.-]+', '', sent)
    # Raw string: "\S" and "\d" are invalid escapes in a plain string literal.
    sent = re.sub(r"\S*\d\S*", "", sent)
    sent = re.sub(r'[^A-Za-z0-9]+', ' ', sent)
    sent = sent.lower()
    # Only letters, digits and spaces are left at this point, so no separate
    # punctuation-stripping pass is needed after the length filter.
    return ' '.join(word for word in sent.split() if len(word) > 2)

In [None]:
# Load the English spaCy pipeline used for tokenising and lemmatising below.
# NOTE(review): "en" is a shortcut model name; newer spaCy releases appear to
# require the full package name (e.g. "en_core_web_sm") — confirm against the
# installed version.
nlp = spacy.load("en")

In [28]:
# One spaCy Doc per newline-separated line of the training text, each line
# normalised with clean() before parsing.
cleaned = [nlp(clean(line)) for line in text.split('\n')]
print(len(cleaned))

1204


In [29]:
cleaned

[updated sep istthe karnataka state road transport corporation friday said will restart its operations maharashtra from september statement the ksrtc said had stopped operation inter state bus services due coronavirus disease and lockdown lockdown has been relaxed ksrtc will restart the operations maharashtra state from september the statement said the services will operated from bengaluru davangere mangaluru and various other places the state keeping view the density passengers the statement added the ksrtc said mandatory for the passengers wear masks said passengers can book tickets advance online www ksrtc through ksrtc franchisee advance reservation counters for the above services even before the lockdown was effected the ksrtc had stopped its bus services various states and gradually suspended the inter state bus movement due coronavirus outbreak,
 with india china study group csg meeting friday the stage has been set for meeting military commanders the two countries the coming we

In [45]:
# For every parsed document keep the lemma of each token that is not a stop
# word, not punctuation and not number-like.
texts = [
    [tok.lemma_ for tok in doc if not (tok.is_stop or tok.is_punct or tok.like_num)]
    for doc in cleaned
]

In [46]:
len(texts)

1204

In [47]:
bigram = gensim.models.Phrases(texts)

In [48]:
texts = [bigram[line] for line in texts]

In [49]:
texts[4][:20]

['anushka',
 'shetty',
 'madhavan',
 'starrer',
 'nishabdam',
 'skip',
 'theatrical',
 'release',
 'digital',
 'route',
 'producer',
 'kona',
 'venkat',
 'drop',
 'hint',
 'recently',
 'run',
 'poll',
 'ask',
 'people']

In [50]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [51]:
corpus[30]

[(13, 2),
 (36, 2),
 (57, 1),
 (91, 2),
 (106, 2),
 (181, 1),
 (184, 1),
 (193, 1),
 (194, 1),
 (241, 1),
 (262, 2),
 (266, 1),
 (271, 1),
 (281, 1),
 (304, 1),
 (308, 2),
 (352, 1),
 (368, 2),
 (382, 1),
 (384, 1),
 (437, 1),
 (443, 1),
 (446, 1),
 (454, 1),
 (461, 1),
 (463, 1),
 (474, 1),
 (475, 1),
 (484, 1),
 (489, 2),
 (506, 1),
 (510, 1),
 (530, 2),
 (655, 2),
 (659, 1),
 (671, 2),
 (677, 1),
 (748, 3),
 (781, 2),
 (843, 1),
 (849, 1),
 (868, 1),
 (906, 1),
 (910, 1),
 (912, 1),
 (945, 1),
 (952, 1),
 (962, 1),
 (968, 2),
 (1010, 1),
 (1014, 1),
 (1015, 2),
 (1064, 1),
 (1070, 3),
 (1080, 1),
 (1100, 2),
 (1106, 1),
 (1112, 3),
 (1128, 1),
 (1139, 6),
 (1143, 1),
 (1157, 1),
 (1192, 1),
 (1196, 1),
 (1255, 1),
 (1270, 2),
 (1280, 2),
 (1288, 1),
 (1322, 1),
 (1336, 1),
 (1363, 1),
 (1381, 1),
 (1388, 1),
 (1391, 3),
 (1434, 2),
 (1453, 1),
 (1505, 1),
 (1524, 1),
 (1531, 3),
 (1538, 1),
 (1554, 2),
 (1561, 2),
 (1622, 1),
 (1659, 1),
 (1672, 1),
 (1674, 1),
 (1676, 1),
 (1710, 1

In [40]:
ldamodel = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary, passes=50, random_state=0)

In [41]:
topics=ldamodel.print_topics(num_words=10)
topics

[(0,
  '0.015*"say" + 0.013*"india" + 0.011*"market" + 0.009*"company" + 0.008*"price" + 0.007*"economy" + 0.007*"year" + 0.007*"crore" + 0.007*"cent" + 0.006*"low"'),
 (1,
  '0.015*"candidate" + 0.013*"september" + 0.011*"examination" + 0.009*"say" + 0.009*"exam" + 0.008*"official_website" + 0.007*"read" + 0.007*"admit_card" + 0.007*"post" + 0.007*"result"'),
 (2,
  '0.018*"say" + 0.012*"student" + 0.010*"state" + 0.009*"government" + 0.009*"exam" + 0.007*"centre" + 0.007*"year" + 0.006*"school" + 0.005*"woman" + 0.005*"college"'),
 (3,
  '0.019*"say" + 0.007*"time" + 0.006*"ipo" + 0.005*"capital" + 0.005*"startup" + 0.004*"business" + 0.004*"food" + 0.004*"number" + 0.004*"company" + 0.003*"investor"'),
 (4,
  '0.036*"trump" + 0.022*"biden" + 0.017*"say" + 0.012*"president" + 0.011*"election" + 0.009*"state" + 0.009*"vote" + 0.008*"democrats" + 0.007*"voter" + 0.006*"poll"'),
 (5,
  '0.014*"state" + 0.012*"say" + 0.011*"case" + 0.007*"police" + 0.006*"saturday" + 0.006*"party" + 0.00

In [42]:
# The words "say" and "cent" occur unnecessarily again and again.
# Let's remove them from the corpus.

In [43]:
# Also, let's take 15 topics and increase the number of passes.

In [44]:
# Flag corpus-specific filler words as stop words in the spaCy vocabulary.
my_stop_words = [u'say', u'cent', u'\'s', u'Mr', u'be', u'said', u'says', u'saying']
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

# The token lists, bigram model, dictionary and bag-of-words corpus above were
# built before these flags were set, so they still contain the filler words.
# Rebuild them here so that, when the notebook is run top to bottom, the models
# trained below see the filtered corpus.
texts = [
    [w.lemma_ for w in doc if not w.is_stop and not w.is_punct and not w.like_num]
    for doc in cleaned
]
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]

In [52]:
ldamodel1 = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary, passes=80, random_state=0)

In [53]:
topics1=ldamodel1.print_topics(num_words=10)
topics1

[(0,
  '0.010*"play" + 0.009*"team" + 0.008*"player" + 0.008*"ipl" + 0.008*"good" + 0.007*"year" + 0.006*"game" + 0.006*"come" + 0.006*"india" + 0.006*"time"'),
 (1,
  '0.019*"student" + 0.013*"exam" + 0.010*"examination" + 0.010*"candidate" + 0.010*"september" + 0.008*"university" + 0.007*"college" + 0.006*"question" + 0.006*"teacher" + 0.006*"online"'),
 (2,
  '0.009*"state" + 0.006*"sharma" + 0.006*"india" + 0.004*"police" + 0.004*"project" + 0.003*"take" + 0.003*"centre" + 0.003*"arrest" + 0.003*"serve" + 0.003*"like"'),
 (3,
  '0.012*"company" + 0.009*"market" + 0.008*"year" + 0.007*"price" + 0.006*"india" + 0.006*"government" + 0.006*"business" + 0.005*"economy" + 0.005*"month" + 0.004*"high"'),
 (4,
  '0.010*"party" + 0.007*"state" + 0.005*"congress" + 0.003*"indian" + 0.003*"india" + 0.003*"crore" + 0.003*"sanction" + 0.003*"get" + 0.003*"indian_americans" + 0.003*"time"'),
 (5,
  '0.021*"india" + 0.012*"china" + 0.009*"country" + 0.007*"government" + 0.005*"need" + 0.005*"indi

In [56]:
ldamodel2 = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, passes=60, random_state=0)
topics2=ldamodel2.print_topics(num_words=10)
topics2

[(0,
  '0.009*"play" + 0.009*"ipl" + 0.008*"team" + 0.007*"time" + 0.007*"film" + 0.007*"good" + 0.006*"year" + 0.006*"player" + 0.006*"get" + 0.006*"csk"'),
 (1,
  '0.016*"student" + 0.010*"exam" + 0.008*"september" + 0.008*"examination" + 0.007*"candidate" + 0.006*"university" + 0.006*"teacher" + 0.006*"school" + 0.005*"college" + 0.005*"year"'),
 (2,
  '0.011*"state" + 0.006*"india" + 0.006*"bank" + 0.005*"tax" + 0.004*"centre" + 0.004*"gst" + 0.004*"sharma" + 0.004*"work" + 0.003*"revenue" + 0.003*"like"'),
 (3,
  '0.008*"company" + 0.007*"market" + 0.006*"year" + 0.006*"price" + 0.005*"india" + 0.005*"business" + 0.004*"month" + 0.004*"government" + 0.004*"people" + 0.003*"new"'),
 (4,
  '0.014*"trump" + 0.006*"biden" + 0.006*"election" + 0.006*"president" + 0.005*"party" + 0.005*"time" + 0.004*"vote" + 0.004*"court" + 0.004*"people" + 0.004*"state"'),
 (5,
  '0.018*"india" + 0.009*"china" + 0.007*"country" + 0.006*"government" + 0.006*"vaccine" + 0.004*"need" + 0.004*"year" + 0.0

In [57]:
ldamodel3 = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, passes=100, random_state=0)
topics3=ldamodel3.print_topics(num_words=10)
topics3

[(0,
  '0.009*"play" + 0.009*"ipl" + 0.008*"team" + 0.007*"time" + 0.007*"film" + 0.007*"good" + 0.007*"year" + 0.006*"player" + 0.006*"get" + 0.006*"csk"'),
 (1,
  '0.017*"student" + 0.011*"exam" + 0.008*"examination" + 0.008*"september" + 0.008*"candidate" + 0.007*"school" + 0.006*"university" + 0.006*"teacher" + 0.005*"college" + 0.005*"online"'),
 (2,
  '0.012*"state" + 0.006*"india" + 0.005*"bank" + 0.005*"centre" + 0.005*"tax" + 0.004*"gst" + 0.004*"sharma" + 0.004*"work" + 0.003*"revenue" + 0.003*"like"'),
 (3,
  '0.009*"company" + 0.007*"market" + 0.006*"year" + 0.006*"price" + 0.006*"business" + 0.006*"india" + 0.004*"government" + 0.004*"month" + 0.004*"economy" + 0.004*"rise"'),
 (4,
  '0.015*"trump" + 0.006*"biden" + 0.006*"election" + 0.006*"president" + 0.005*"party" + 0.004*"time" + 0.004*"court" + 0.004*"vote" + 0.004*"people" + 0.004*"state"'),
 (5,
  '0.019*"india" + 0.009*"china" + 0.008*"country" + 0.006*"government" + 0.006*"vaccine" + 0.004*"need" + 0.004*"year" +

In [58]:
ldamodel4 = LdaModel(corpus=corpus, num_topics=8, id2word=dictionary, passes=80, random_state=0)
topics4=ldamodel4.print_topics(num_words=10)
topics4

[(0,
  '0.009*"play" + 0.008*"ipl" + 0.007*"film" + 0.007*"time" + 0.007*"team" + 0.007*"year" + 0.006*"good" + 0.005*"get" + 0.005*"player" + 0.005*"csk"'),
 (1,
  '0.011*"student" + 0.010*"exam" + 0.008*"candidate" + 0.008*"september" + 0.007*"examination" + 0.005*"year" + 0.005*"question" + 0.004*"college" + 0.004*"university" + 0.004*"centre"'),
 (2,
  '0.006*"india" + 0.005*"china" + 0.005*"state" + 0.004*"taiwan" + 0.003*"work" + 0.003*"sharma" + 0.003*"police" + 0.003*"chinese" + 0.002*"kid" + 0.002*"people"'),
 (3,
  '0.007*"company" + 0.006*"india" + 0.006*"market" + 0.005*"price" + 0.004*"report" + 0.004*"year" + 0.004*"new" + 0.004*"high" + 0.004*"people" + 0.003*"month"'),
 (4,
  '0.012*"trump" + 0.006*"biden" + 0.005*"party" + 0.005*"election" + 0.005*"president" + 0.005*"state" + 0.004*"people" + 0.004*"time" + 0.004*"vote" + 0.003*"day"'),
 (5,
  '0.017*"india" + 0.010*"china" + 0.007*"country" + 0.006*"vaccine" + 0.006*"government" + 0.004*"need" + 0.004*"year" + 0.004*

In [59]:
ldamodel5 = LdaModel(corpus=corpus, num_topics=8, id2word=dictionary, passes=100, random_state=0)
topics5=ldamodel5.print_topics(num_words=10)
topics5

[(0,
  '0.009*"play" + 0.008*"ipl" + 0.007*"film" + 0.007*"time" + 0.007*"team" + 0.007*"year" + 0.006*"good" + 0.005*"get" + 0.005*"player" + 0.005*"csk"'),
 (1,
  '0.011*"student" + 0.010*"exam" + 0.009*"candidate" + 0.008*"september" + 0.007*"examination" + 0.005*"year" + 0.005*"question" + 0.004*"college" + 0.004*"university" + 0.004*"centre"'),
 (2,
  '0.006*"india" + 0.005*"china" + 0.005*"state" + 0.005*"taiwan" + 0.003*"sharma" + 0.003*"work" + 0.003*"police" + 0.003*"chinese" + 0.002*"kid" + 0.002*"official"'),
 (3,
  '0.007*"company" + 0.006*"market" + 0.006*"india" + 0.005*"price" + 0.004*"report" + 0.004*"year" + 0.004*"high" + 0.004*"new" + 0.004*"month" + 0.003*"people"'),
 (4,
  '0.012*"trump" + 0.006*"biden" + 0.005*"party" + 0.005*"election" + 0.005*"president" + 0.005*"state" + 0.004*"people" + 0.004*"time" + 0.004*"vote" + 0.003*"day"'),
 (5,
  '0.017*"india" + 0.010*"china" + 0.007*"country" + 0.006*"vaccine" + 0.006*"government" + 0.004*"need" + 0.004*"year" + 0.00

In [60]:
ldamodel6 = LdaModel(corpus=corpus, num_topics=8, id2word=dictionary, passes=60, random_state=0)
topics6=ldamodel6.print_topics(num_words=10)
topics6

[(0,
  '0.009*"play" + 0.008*"ipl" + 0.007*"film" + 0.007*"team" + 0.007*"time" + 0.007*"year" + 0.006*"good" + 0.005*"player" + 0.005*"get" + 0.005*"csk"'),
 (1,
  '0.011*"student" + 0.010*"exam" + 0.008*"candidate" + 0.008*"september" + 0.007*"examination" + 0.005*"year" + 0.004*"question" + 0.004*"college" + 0.004*"university" + 0.004*"centre"'),
 (2,
  '0.006*"india" + 0.005*"state" + 0.005*"china" + 0.003*"taiwan" + 0.003*"work" + 0.003*"sharma" + 0.003*"police" + 0.003*"chinese" + 0.002*"people" + 0.002*"tax"'),
 (3,
  '0.006*"company" + 0.006*"india" + 0.006*"market" + 0.005*"price" + 0.004*"report" + 0.004*"year" + 0.004*"new" + 0.004*"people" + 0.004*"high" + 0.003*"month"'),
 (4,
  '0.011*"trump" + 0.005*"biden" + 0.005*"party" + 0.005*"election" + 0.005*"state" + 0.005*"president" + 0.004*"people" + 0.004*"time" + 0.004*"vote" + 0.003*"day"'),
 (5,
  '0.017*"india" + 0.010*"china" + 0.007*"country" + 0.006*"vaccine" + 0.006*"government" + 0.004*"need" + 0.004*"year" + 0.004*

In [61]:
# 8-topic LDA model trained for 150 passes with random_state fixed at 0.
# This is the model whose topics are named and used for the test documents below.
ldamodel7 = LdaModel(corpus=corpus, num_topics=8, id2word=dictionary, passes=150, random_state=0)
# Show the 10 highest-weighted words of every topic.
topics7=ldamodel7.print_topics(num_words=10)
topics7

[(0,
  '0.009*"play" + 0.008*"ipl" + 0.007*"film" + 0.007*"time" + 0.007*"team" + 0.007*"year" + 0.006*"good" + 0.005*"get" + 0.005*"player" + 0.005*"csk"'),
 (1,
  '0.012*"student" + 0.011*"exam" + 0.009*"candidate" + 0.009*"september" + 0.008*"examination" + 0.005*"question" + 0.005*"year" + 0.004*"college" + 0.004*"university" + 0.004*"centre"'),
 (2,
  '0.005*"india" + 0.005*"china" + 0.005*"taiwan" + 0.004*"state" + 0.003*"sharma" + 0.003*"work" + 0.003*"police" + 0.003*"chinese" + 0.003*"kid" + 0.002*"official"'),
 (3,
  '0.008*"company" + 0.006*"market" + 0.006*"india" + 0.005*"price" + 0.005*"year" + 0.004*"report" + 0.004*"high" + 0.004*"new" + 0.004*"month" + 0.003*"business"'),
 (4,
  '0.012*"trump" + 0.006*"biden" + 0.006*"party" + 0.005*"election" + 0.005*"president" + 0.005*"state" + 0.004*"people" + 0.004*"time" + 0.004*"vote" + 0.003*"ginsburg"'),
 (5,
  '0.018*"india" + 0.010*"china" + 0.007*"country" + 0.006*"vaccine" + 0.006*"government" + 0.004*"need" + 0.004*"year"

In [62]:
ldamodel8 = LdaModel(corpus=corpus, num_topics=7, id2word=dictionary, passes=150, random_state=0)
topics8=ldamodel8.print_topics(num_words=10)
topics8

[(0,
  '0.008*"play" + 0.007*"film" + 0.007*"ipl" + 0.007*"time" + 0.006*"team" + 0.006*"good" + 0.006*"year" + 0.006*"actor" + 0.005*"come" + 0.005*"get"'),
 (1,
  '0.016*"student" + 0.010*"exam" + 0.009*"candidate" + 0.008*"september" + 0.007*"examination" + 0.007*"school" + 0.005*"teacher" + 0.005*"university" + 0.005*"online" + 0.005*"question"'),
 (2,
  '0.005*"india" + 0.004*"police" + 0.003*"sharma" + 0.003*"work" + 0.003*"arrest" + 0.003*"state" + 0.003*"anurag" + 0.003*"bank" + 0.003*"china" + 0.003*"kid"'),
 (3,
  '0.006*"company" + 0.006*"market" + 0.005*"india" + 0.005*"year" + 0.005*"people" + 0.005*"price" + 0.004*"industry" + 0.004*"business" + 0.004*"government" + 0.004*"add"'),
 (4,
  '0.009*"trump" + 0.005*"case" + 0.005*"state" + 0.005*"party" + 0.004*"biden" + 0.004*"people" + 0.004*"election" + 0.004*"court" + 0.004*"president" + 0.004*"police"'),
 (5,
  '0.013*"india" + 0.010*"china" + 0.007*"vaccine" + 0.005*"country" + 0.004*"people" + 0.004*"government" + 0.004

In [63]:
ldamodel9 = LdaModel(corpus=corpus, num_topics=9, id2word=dictionary, passes=150, random_state=0)
topics9=ldamodel9.print_topics(num_words=10)
topics9

[(0,
  '0.009*"play" + 0.008*"ipl" + 0.008*"time" + 0.007*"team" + 0.007*"film" + 0.006*"year" + 0.006*"good" + 0.006*"get" + 0.006*"come" + 0.006*"player"'),
 (1,
  '0.011*"exam" + 0.010*"student" + 0.010*"candidate" + 0.009*"september" + 0.008*"examination" + 0.005*"question" + 0.005*"bank" + 0.005*"year" + 0.005*"centre" + 0.005*"university"'),
 (2,
  '0.005*"india" + 0.005*"people" + 0.004*"kangana" + 0.004*"sharma" + 0.003*"state" + 0.003*"kid" + 0.003*"tweet" + 0.003*"work" + 0.003*"hai" + 0.003*"like"'),
 (3,
  '0.008*"company" + 0.007*"market" + 0.006*"india" + 0.005*"year" + 0.005*"price" + 0.004*"business" + 0.004*"government" + 0.004*"new" + 0.004*"high" + 0.004*"month"'),
 (4,
  '0.012*"trump" + 0.007*"state" + 0.006*"biden" + 0.006*"party" + 0.005*"election" + 0.005*"president" + 0.004*"vote" + 0.003*"time" + 0.003*"day" + 0.003*"week"'),
 (5,
  '0.018*"india" + 0.009*"china" + 0.007*"country" + 0.006*"government" + 0.005*"vaccine" + 0.004*"people" + 0.004*"need" + 0.004*"

In [64]:
ldamodel10 = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, passes=150, random_state=0)
topics10=ldamodel10.print_topics(num_words=10)
topics10

[(0,
  '0.009*"play" + 0.009*"ipl" + 0.008*"team" + 0.007*"time" + 0.007*"good" + 0.007*"film" + 0.007*"year" + 0.006*"player" + 0.006*"get" + 0.006*"csk"'),
 (1,
  '0.018*"student" + 0.011*"exam" + 0.008*"examination" + 0.008*"candidate" + 0.008*"september" + 0.007*"school" + 0.006*"teacher" + 0.006*"university" + 0.005*"college" + 0.005*"online"'),
 (2,
  '0.012*"state" + 0.007*"india" + 0.005*"centre" + 0.005*"bank" + 0.005*"tax" + 0.005*"gst" + 0.004*"sharma" + 0.004*"work" + 0.003*"revenue" + 0.003*"kid"'),
 (3,
  '0.009*"company" + 0.008*"market" + 0.006*"year" + 0.006*"business" + 0.006*"price" + 0.005*"india" + 0.004*"government" + 0.004*"month" + 0.004*"economy" + 0.004*"rise"'),
 (4,
  '0.015*"trump" + 0.006*"biden" + 0.006*"election" + 0.006*"president" + 0.005*"party" + 0.004*"time" + 0.004*"court" + 0.004*"vote" + 0.004*"people" + 0.004*"ginsburg"'),
 (5,
  '0.019*"india" + 0.009*"china" + 0.008*"country" + 0.006*"government" + 0.006*"vaccine" + 0.004*"need" + 0.004*"year"

In [65]:
# Model 7 (ldamodel7, 8 topics) seems sensible.
# Let's name the topics: keys are topic ids, values are hand-picked labels.
# Note: this rebinds `topics`, which earlier held the print_topics output of ldamodel.
topics={0:'cricket', 1:'education', 2:'international', 3:'economy', 4:'politics', 
        5:'health', 6:'policies', 7:'bollywood'}

In [66]:
# Reading the test data
# The context manager closes the file handle as soon as the contents are
# in memory (the bare open(...).read() left it to the garbage collector).
with open('test_news.txt', 'r') as test_file:
    test = test_file.read()

In [67]:
# One spaCy Doc per newline-separated line of the test text, each line
# normalised with clean() before parsing.
cleaned_test = [nlp(clean(line)) for line in test.split('\n')]
print(len(cleaned_test))

243


In [91]:
len(cleaned_test[0])

130

In [93]:
cleaned_test[0]

youtube has rolled out new short form video creator called youtube shorts india and this has been done give competition chinese video sharing application tiktok per report the verge shorts will allow people make videos and can set music well music for these videos will available through product music picker feature the picker currently has tracks and working with music artists labels and publishers make more their content available continue expanding our catalogue youtube spokesperson told the verge trying attract the maximum number people there will new create icon spot and will appear prominently the app the create icon has rolled out with the shorts beta android and there are plans bring the icon ios devices soon not known when shorts will appear for other countries such the united states

In [98]:
# For every parsed test document keep the lemma of each token that is not a
# stop word, not punctuation and not number-like.
texts_te = [
    [tok.lemma_ for tok in doc if not (tok.is_stop or tok.is_punct or tok.like_num)]
    for doc in cleaned_test
]

In [99]:
texts_te[5][:20]

['time',
 'history',
 'ipl',
 'participation',
 'american',
 'cricketer',
 'kolkata',
 'knight',
 'rider',
 'sign',
 'ali',
 'khan',
 'fast',
 'bowler',
 'usa',
 'report',
 'espncricinfo',
 'states',
 'replace',
 'injured']

In [102]:
texts_te = [bigram[line] for line in texts_te]

In [103]:
len(texts_te)

243

In [104]:
len(texts_te[0])

77

In [105]:
texts_te[0]

['youtube',
 'roll',
 'new',
 'short',
 'form',
 'video',
 'creator',
 'call',
 'youtube',
 'shorts',
 'india',
 'competition',
 'chinese',
 'video',
 'sharing',
 'application',
 'tiktok',
 'report',
 'verge',
 'short',
 'allow',
 'people',
 'video',
 'set',
 'music',
 'music',
 'video',
 'available',
 'product',
 'music',
 'picker',
 'feature',
 'picker',
 'currently',
 'track',
 'work',
 'music',
 'artist',
 'label',
 'publisher',
 'content',
 'available',
 'continue',
 'expand',
 'catalogue',
 'youtube',
 'spokesperson',
 'tell',
 'verge',
 'trying',
 'attract',
 'maximum_number',
 'people',
 'new',
 'create',
 'icon',
 'spot',
 'appear',
 'prominently',
 'app',
 'create',
 'icon',
 'roll',
 'short',
 'beta',
 'android',
 'plan',
 'bring',
 'icon',
 'ios',
 'device',
 'soon',
 'know',
 'short',
 'appear',
 'country',
 'united_states']

In [106]:
corpus_te = [dictionary.doc2bow(text) for text in texts_te]

In [107]:
corpus_te[0]

[(87, 1),
 (98, 1),
 (103, 1),
 (106, 2),
 (131, 1),
 (159, 1),
 (207, 2),
 (230, 1),
 (266, 1),
 (381, 1),
 (392, 1),
 (529, 1),
 (583, 2),
 (607, 1),
 (843, 1),
 (994, 1),
 (1052, 1),
 (1151, 1),
 (1192, 1),
 (1270, 2),
 (1280, 1),
 (1593, 1),
 (1610, 1),
 (1767, 1),
 (1795, 1),
 (1844, 4),
 (1956, 1),
 (2003, 1),
 (2112, 1),
 (2120, 4),
 (2157, 2),
 (2307, 1),
 (2441, 4),
 (2754, 1),
 (3197, 1),
 (3256, 1),
 (3403, 1),
 (3963, 1),
 (4063, 1),
 (4124, 1),
 (4383, 1),
 (4394, 1),
 (4544, 3),
 (5083, 1),
 (5140, 1),
 (5303, 2),
 (8039, 2),
 (9145, 3),
 (9526, 1),
 (19197, 1),
 (19622, 1),
 (19992, 1),
 (21996, 1)]

In [108]:
corpus_te[1]

[(34, 1),
 (36, 6),
 (91, 1),
 (194, 1),
 (224, 1),
 (237, 1),
 (291, 1),
 (335, 1),
 (360, 2),
 (446, 1),
 (468, 1),
 (471, 1),
 (474, 1),
 (494, 1),
 (503, 1),
 (576, 1),
 (716, 1),
 (822, 1),
 (843, 1),
 (905, 1),
 (1035, 1),
 (1133, 2),
 (1151, 2),
 (1391, 2),
 (1534, 2),
 (1596, 1),
 (1678, 1),
 (1733, 1),
 (1757, 4),
 (1888, 3),
 (2005, 2),
 (2209, 1),
 (2283, 2),
 (2286, 1),
 (2651, 5),
 (2684, 2),
 (2694, 1),
 (2714, 1),
 (3555, 2),
 (3591, 2),
 (3603, 1),
 (3754, 1),
 (4263, 2),
 (4473, 1),
 (4495, 2),
 (4647, 1),
 (4857, 1),
 (4888, 2),
 (5838, 1),
 (6553, 1),
 (6765, 1),
 (9527, 1),
 (14746, 1)]

In [109]:
ldamodel7.get_document_topics(corpus_te[0])

[(0, 0.4369323),
 (2, 0.02098357),
 (3, 0.23718148),
 (5, 0.16460031),
 (7, 0.13520536)]

In [110]:
# Note: [2] is a position in the returned (topic_id, probability) list, not a
# topic id. In the output shown for this document the third entry is topic 3.
ldamodel7.get_document_topics(corpus_te[0])[2][1]

0.2371914

In [111]:
ldamodel7.get_document_topics(corpus_te[2])

[(1, 0.13857564), (2, 0.02623093), (5, 0.23627914), (6, 0.5939444)]

In [161]:
# Create a structure for the resulting dataframe

In [140]:
final_df=pd.DataFrame([['',0,0,0,0,0,0,0,0]], columns=['text','cricket','education','international','economy',
                                                  'politics','health','policies','bollywood'])
final_df

Unnamed: 0,text,cricket,education,international,economy,politics,health,policies,bollywood
0,,0,0,0,0,0,0,0,0


In [141]:
# Collect one row per test document and build the frame in a single step
# instead of growing final_df with a concat on every iteration.
topic_columns = ['cricket','education','international','economy',
                 'politics','health','policies','bollywood']
rows = []
for doc, bow in zip(cleaned_test, corpus_te):
    # Topics that get_document_topics does not return keep a score of 0.
    scores = [0.0] * len(topic_columns)
    for topic_id, prob in ldamodel7.get_document_topics(bow):
        scores[topic_id] = float(prob)
    rows.append([str(doc)] + scores)

# The placeholder row already in final_df stays first. ignore_index gives every
# row a unique label, so a later drop(index=0) removes only that placeholder.
final_df = pd.concat(
    [final_df, pd.DataFrame(rows, columns=['text'] + topic_columns)],
    ignore_index=True,
)

In [144]:
final_df.reset_index(inplace=True)

In [147]:
final_df=final_df.drop(index=0).reset_index()

In [156]:
final_df.drop(['level_0', 'index'],axis=1, inplace=True)

In [157]:
final_df.head()

Unnamed: 0,text,cricket,education,international,economy,politics,health,policies,bollywood
0,youtube has rolled out new short form video cr...,0.436928,0.0,0.020984,0.237194,0.0,0.164589,0.0,0.135208
1,order implement the national education policy ...,0.0,0.0,0.0,0.030783,0.0,0.113877,0.780975,0.068162
2,prime minister narendra modi will address conc...,0.0,0.138574,0.026231,0.0,0.0,0.236275,0.59395,0.0
3,with the state government decision stay admiss...,0.090326,0.830284,0.0,0.024685,0.0,0.0,0.05134,0.0
4,gareth bale completed his return tottenham fro...,0.471412,0.0,0.0,0.0,0.0,0.522208,0.0,0.0


In [158]:
final_df.shape

(243, 9)

In [148]:
# Let's recheck whether everything in our result df is in place

In [149]:
cleaned_test[54]

the national testing agency has released the question paper responses and provisional answer key for joint entrance examination jee main which was conducted from september candidates who have taken the exams for tech arch planning can raise objections against answer key any challenge any answer key candidates have log the official website jee mains jeemain nta nic and challenge the wrong answer key paying non refundable fee for each question challenged the window for raising objection will open till september while the fees can paid till the nta will release the final answer key after considering the valid objections after which the jee mains result will declared the registration process for jee advanced will begin from september candidates who clear the jee main will eligible for jee advanced challenge answer key visit the official website jeemain nta nic inclick challenge regarding answer key link given the homepagelogin with your application number and date birth and enter securityc

In [150]:
ldamodel7.get_document_topics(corpus_te[54])

[(0, 0.030795967), (1, 0.91139317), (2, 0.017926103), (4, 0.031375565)]

In [151]:
# okay, the topic for the following makes sense

In [159]:
final_df.iloc[[54]]

Unnamed: 0,text,cricket,education,international,economy,politics,health,policies,bollywood
54,the national testing agency has released the q...,0.030805,0.911407,0.017926,0.0,0.031368,0.0,0.0,0.0


In [160]:
# Everything is fine