<a href="https://colab.research.google.com/github/harikrishnareddymallavarapu/Masters/blob/main/Masters_2_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Run in python console
import nltk; 
nltk.download('stopwords')

# Run in terminal or command prompt
!python3 -m spacy download en

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import spacy

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [5]:

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','trump','donald','say','hillary','clinton','do','united','state','the','day','news','that','have','however','year'])

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts,bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    nlp = spacy.load('en', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [7]:
article1 = pd.read_csv('drive/My Drive/Top-Ex/articles1.csv')
article2 = pd.read_csv('drive/My Drive/Top-Ex/articles2.csv')
article3 = pd.read_csv('drive/My Drive/Top-Ex/articles3.csv')

#Concatenating and randomly selecting data

In [9]:
articles = pd.concat([article1,article2,article3], axis=0)
articles = articles.sample(n = 50000, random_state = 2)
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
44480,47352,66559,Trump says his ’massive’ tax plan is coming ne...,Business Insider,Bob Bryan,2017-04-22,2017.0,4.0,,’ ’ ’ President Donald Trump said that he e...
8195,61514,83936,Chicago police have ’no regard’ for minority l...,Fox News,,2016-04-13,2016.0,4.0,https://web.archive.org/web/20160414011639/htt...,Police in Chicago have ”no regard for the san...
10464,113926,168759,A Concussion Can Lead To Sleep Problems That L...,NPR,Jon Hamilton,2016-04-27,2016.0,4.0,http://www.npr.org/sections/health-shots/2016/...,People who sustain a concussion or a more seve...
19198,72606,106668,Trump: I Was Just Joking When I Made Crude Co...,Buzzfeed News,Andrew Kaczynski,2016-03-29,2016.0,3.0,https://web.archive.org/web/20160329002256/htt...,’You can watch the interview here:’] Trump’s c...
17793,71201,102160,"Beyoncé Dropped Out Of Coachella, Because Duh",Buzzfeed News,Alanna Bennett,2017-02-24,2017.0,2.0,https://web.archive.org/web/20170224044447/htt...,"’ It has happened, friends: Beyoncé has offici..."


In [10]:
articles.shape

(50000, 10)

In [11]:
del article1,article2,article3
articles = articles[['id','title','content']]
articles['text']= articles['title']+' '+articles['content']
articles = articles[['id','text']]
articles.columns = ['id','content']

In [14]:
articles.head()

Unnamed: 0,id,content
44480,66559,Trump says his ’massive’ tax plan is coming ne...
8195,83936,Chicago police have ’no regard’ for minority l...
10464,168759,A Concussion Can Lead To Sleep Problems That L...
19198,106668,Trump: I Was Just Joking When I Made Crude Co...
17793,102160,"Beyoncé Dropped Out Of Coachella, Because Duh..."


In [15]:
articles.dtypes

id          int64
content    object
dtype: object

In [20]:

def createCorpusDict(dataFrame):
    #cleanse the text
    dataFrame.content = dataFrame.content.astype(str)
    data = dataFrame.content.values.tolist()
    data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))
    
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops,bigram_mod)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en


    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    #cleanse the dictionary
    dictionary = corpora.Dictionary(data_lemmatized)
    # Create Corpus
    texts = data_lemmatized
    # Term Document Frequency
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    return corpus,dictionary,data_lemmatized

In [None]:
corpus, dictionary, data_lemmatized = createCorpusDict(articles)



In [None]:
import pickle

filehandler = open("corpus.pkl","wb")
pickle.dump(corpus,filehandler)
filehandler.close()

filehandler = open("dictionary.pkl","wb")
pickle.dump(dictionary,filehandler)
filehandler.close()

filehandler = open("data_lemmatized.pkl","wb")
pickle.dump(data_lemmatized,filehandler)
filehandler.close()

In [None]:
#Train the Topic Modelling
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
doc_lda

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
import pickle
pkl_file = open('drive/My Drive/Top-Ex/data_lemmatized.pkl', 'rb')
data_lemmatized = pickle.load(pkl_file)
pkl_file.close()

pkl_file = open('drive/My Drive/Top-Ex/dictionary.pkl', 'rb')
dictionary = pickle.load(pkl_file)
pkl_file.close()

pkl_file = open('drive/My Drive/Top-Ex/corpus.pkl', 'rb')
corpus = pickle.load(pkl_file)
pkl_file.close()

pkl_file = open('drive/My Drive/Top-Ex/lda_20_new.pkl', 'rb')
lda_model = pickle.load(pkl_file)
pkl_file.close()

In [None]:

import pyLDAvis.gensim
import pickle
import pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
gamma, _ = lda_model.inference(corpus)
topics = gamma / gamma.sum(axis=1)[:,None]

In [None]:
topics = pd.DataFrame(topics)
topics.reset_index(drop=True)
topics.rename(columns = lambda x: 'Topic_'+str(x+1).zfill(2), inplace=True)
topics.head()

Unnamed: 0,Topic_01,Topic_02,Topic_03,Topic_04,Topic_05,Topic_06,Topic_07,Topic_08,Topic_09,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19,Topic_20
0,0.067322,0.007678,0.000438,0.243083,0.005309,0.000731,0.006883,0.086805,0.071966,0.001694,0.022597,0.010804,0.133596,0.00064,0.000602,0.005302,0.333108,0.000389,0.000407,0.000643
1,0.021442,0.087206,0.014915,0.002733,0.001883,0.030547,0.009134,0.033421,0.097003,0.045649,0.148593,0.004537,0.14065,0.020172,0.278967,0.007261,0.047378,0.001608,0.001532,0.00537
2,0.004298,0.082594,0.008812,0.003834,0.00237,0.086435,0.010681,0.064172,0.013705,0.022234,0.12288,0.01933,0.175133,0.018992,0.024341,0.189761,0.06717,0.046291,0.036694,0.000275
3,0.014538,0.075287,0.018987,0.000233,0.067784,0.039817,0.006462,0.131653,0.044122,0.004669,0.059677,0.005896,0.221646,0.014398,0.025254,0.213624,0.030434,0.005583,0.019625,0.00031
4,0.001945,0.035472,0.037059,0.000629,0.015938,0.000957,0.001271,0.054531,0.00666,0.001672,0.015477,0.002575,0.177025,0.273508,0.012672,0.002322,0.260723,0.071673,0.002247,0.025643


In [None]:
finalData = pd.concat([articles,topics], axis=1,sort=False)
finalData.head()

Unnamed: 0,id,content,Topic_01,Topic_02,Topic_03,Topic_04,Topic_05,Topic_06,Topic_07,Topic_08,Topic_09,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19,Topic_20
0,17283,House Republicans Fret About Winning Their Hea...,0.067322,0.007678,0.000438,0.243083,0.005309,0.000731,0.006883,0.086805,0.071966,0.001694,0.022597,0.010804,0.133596,0.00064,0.000602,0.005302,0.333108,0.000389,0.000407,0.000643
1,17284,Rift Between Officers and Residents as Killing...,0.021442,0.087206,0.014915,0.002733,0.001883,0.030547,0.009134,0.033421,0.097003,0.045649,0.148593,0.004537,0.14065,0.020172,0.278967,0.007261,0.047378,0.001608,0.001532,0.00537
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",0.004298,0.082594,0.008812,0.003834,0.00237,0.086435,0.010681,0.064172,0.013705,0.022234,0.12288,0.01933,0.175133,0.018992,0.024341,0.189761,0.06717,0.046291,0.036694,0.000275
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",0.014538,0.075287,0.018987,0.000233,0.067784,0.039817,0.006462,0.131653,0.044122,0.004669,0.059677,0.005896,0.221646,0.014398,0.025254,0.213624,0.030434,0.005583,0.019625,0.00031
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,0.001945,0.035472,0.037059,0.000629,0.015938,0.000957,0.001271,0.054531,0.00666,0.001672,0.015477,0.002575,0.177025,0.273508,0.012672,0.002322,0.260723,0.071673,0.002247,0.025643


In [None]:
finalData.head()

Unnamed: 0,id,content,Topic_01,Topic_02,Topic_03,Topic_04,Topic_05,Topic_06,Topic_07,Topic_08,Topic_09,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19,Topic_20
0,17283,House Republicans Fret About Winning Their Hea...,0.067322,0.007678,0.000438,0.243083,0.005309,0.000731,0.006883,0.086805,0.071966,0.001694,0.022597,0.010804,0.133596,0.00064,0.000602,0.005302,0.333108,0.000389,0.000407,0.000643
1,17284,Rift Between Officers and Residents as Killing...,0.021442,0.087206,0.014915,0.002733,0.001883,0.030547,0.009134,0.033421,0.097003,0.045649,0.148593,0.004537,0.14065,0.020172,0.278967,0.007261,0.047378,0.001608,0.001532,0.00537
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",0.004298,0.082594,0.008812,0.003834,0.00237,0.086435,0.010681,0.064172,0.013705,0.022234,0.12288,0.01933,0.175133,0.018992,0.024341,0.189761,0.06717,0.046291,0.036694,0.000275
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",0.014538,0.075287,0.018987,0.000233,0.067784,0.039817,0.006462,0.131653,0.044122,0.004669,0.059677,0.005896,0.221646,0.014398,0.025254,0.213624,0.030434,0.005583,0.019625,0.00031
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,0.001945,0.035472,0.037059,0.000629,0.015938,0.000957,0.001271,0.054531,0.00666,0.001672,0.015477,0.002575,0.177025,0.273508,0.012672,0.002322,0.260723,0.071673,0.002247,0.025643


In [None]:
output3 = open('drive/My Drive/Top-Ex/articles_topic_20.pkl', 'wb')
pickle.dump(finalData, output3)
output3.close()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
#transform
Count_data = CountVec.fit_transform()
 
#create dataframe
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names())
print(cv_dataframe)