### Reference
https://github.com/Kaustubh-Tambe/BBC-News_Topic-Modelling/blob/main/Notebook/BBC_News_Topic_Modelling_Project.ipynb

# Install Libraries

In [1]:
# pip install gensim
# conda install -c anaconda gensim
!pip3 install gensim

Defaulting to user installation because normal site-packages is not writeable


In [2]:
pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install pyLDAvis

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Genism
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy for Lemmatization
import spacy

# Plotting
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Downloading and preparing stopwords from NLTK & extended_stopwords.txt

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /storage/home/hcocice1/jpark3141/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# add more stopwords
file = open("./extended_stopwords.txt", "r")
stop_words.extend([line.strip() for line in file.readlines()])
file.close()

# Converting it to DF

In [8]:
import pandas as pd
df = pd.read_csv("./2016-01-posts.csv")

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1654555 entries, 0 to 1654554
Data columns (total 12 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   subreddit_id  1654555 non-null  object
 1   selftext      580175 non-null   object
 2   author        1654555 non-null  object
 3   over_18       1654555 non-null  bool  
 4   ups           1654555 non-null  int64 
 5   created_utc   1654555 non-null  int64 
 6   score         1654555 non-null  int64 
 7   downs         1654555 non-null  int64 
 8   title         1654554 non-null  object
 9   num_comments  1654555 non-null  int64 
 10  subreddit     1654555 non-null  object
 11  quarantine    1654555 non-null  bool  
dtypes: bool(2), int64(5), object(5)
memory usage: 129.4+ MB


In [10]:
df.head()

Unnamed: 0,subreddit_id,selftext,author,over_18,ups,created_utc,score,downs,title,num_comments,subreddit,quarantine
0,t5_2qupf,Does anyone have the Jupiter Ascending script?...,Nonsuch42,False,11,1451606400,11,0,[REQUEST] Jupiter Ascending script?,15,Screenwriting,False
1,t5_2qi58,,historyworkisboring,False,261,1451606401,261,0,"Cristiano Ronaldo: ""We cannot live being obses...",139,soccer,False
2,t5_2qhqb,[removed],ReadsStuff,False,47,1451606401,47,0,Happy New Year /r/UnitedKingdom,17,unitedkingdom,False
3,t5_2r344,This thread is for you to promote your blog / ...,ranalog,False,11,1451606403,11,0,Monthly 'Self Promotion' - January,51,analog,False
4,t5_2qpp6,,Brandhor,False,732,1451606404,732,0,MLG sells “substantially all” assets to Activi...,285,starcraft,False


# Preprocessing

In [11]:
print(df.isnull().sum()) # selftext has 1074380 Nan

subreddit_id          0
selftext        1074380
author                0
over_18               0
ups                   0
created_utc           0
score                 0
downs                 0
title                 1
num_comments          0
subreddit             0
quarantine            0
dtype: int64


In [12]:
# delete Nan from selftext
updated_df = df.dropna(axis=0)

updated_df.astype({'selftext':'string'}).dtypes
updated_df.info()

# delete [removed]
print(len(updated_df))
#count = [c for c in updated_df['selftext'] if "[removed]" in c]
#print(len(count)) # 15552 rows of [removed]
#filtered_df = updated_df[updated_df['selftext'].str.contains("[removed]") == False]
#wrong = updated_df[updated_df['selftext'].str.contains("[removed]") == True]
#print(len(filtered_df))
#print(len(wrong))
#df['selftex'] = df['selftext'].map(lambda x: x.lower())
#filtered_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 580175 entries, 0 to 1654554
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   subreddit_id  580175 non-null  object
 1   selftext      580175 non-null  object
 2   author        580175 non-null  object
 3   over_18       580175 non-null  bool  
 4   ups           580175 non-null  int64 
 5   created_utc   580175 non-null  int64 
 6   score         580175 non-null  int64 
 7   downs         580175 non-null  int64 
 8   title         580175 non-null  object
 9   num_comments  580175 non-null  int64 
 10  subreddit     580175 non-null  object
 11  quarantine    580175 non-null  bool  
dtypes: bool(2), int64(5), object(5)
memory usage: 49.8+ MB
580175


In [13]:
updated_df['selftext'].map(lambda x: x.lower())

0          does anyone have the jupiter ascending script?...
2                                                  [removed]
3          this thread is for you to promote your blog / ...
7          ### this thread is for serious discussion of t...
10                                                 [deleted]
                                 ...                        
1654545    i'm currently reading a book on wwi and came a...
1654549    joe speaks highly about justin trudeau and say...
1654552    i'm probably going to buy $100 into bitcoin, f...
1654553    hey y'all, welcome back to my semi-gem ch coll...
1654554                                    it's pretty good.
Name: selftext, Length: 580175, dtype: object

In [14]:
# converting DataFrame column into List
data = updated_df['selftext'].tolist()

# Remove url
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]
# remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

#pprint(data[:1])
print(data[0])

Does anyone have the Jupiter Ascending script? I doubt it, but I really enjoyed the movie and would love to see how it was presented in script form, especially since there is so clearly connective tissue missing from the final product. Thanks!


In [15]:
# Tokenize the Data
def sent_to_words(sents, deacc=True):
    for sentence in sents:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc = True))

data_words = list(sent_to_words(data))

In [16]:
# Defining the Bigram and Trigram Model
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])


['does', 'anyone', 'have', 'the', 'jupiter_ascending', 'script', 'doubt', 'it', 'but', 'really', 'enjoyed', 'the', 'movie', 'and', 'would', 'love', 'to', 'see', 'how', 'it', 'was', 'presented', 'in', 'script', 'form', 'especially', 'since', 'there', 'is', 'so', 'clearly', 'connective_tissue', 'missing', 'from', 'the', 'final', 'product', 'thanks']


In [17]:
# create functions for removing stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

#creating fuctions for making bigram
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

#creating fuctions for making trigram
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

#creating fuctions for Lemmitization
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #corpus will have the words that belong to only these part of speech
    
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
#Using all the Functions Created for Preprocessing
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# print(data_lemmatized[:1])

In [None]:
# Again Removing Stop Words
# sometimes Lemma can be equivalent to the stopwords... 
data_words_nostops = remove_stopwords(data_lemmatized)

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Use LDA for Topic Modeling

In [None]:
#here we are trying to get the optimal model according to the Coherence score(meseaure of Separability) to figure out no of topics ..
def tune_model(dictionary, corpus, texts, limit, start, step):
    
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True,
                                           )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(round(coherencemodel.get_coherence(),3))

    return model_list, coherence_values

In [None]:
# Can take a long time to run
from tqdm import tqdm
model_list, coherence_values = tune_model(dictionary=id2word, co

# Coherence Score

## Plotting Line Graph for Coherence Score

In [None]:
limit=10; start=2; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

## Print Coherence Scores

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

### Should Figure out our maximum Coherence Value!! -> LDA Model

In [None]:
#here we knew that the coherence score is maximum for 6 topics so that will become our optimal model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
#getting dominant words for each topics 
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

# Result

## Cluster topics and see the dominant words in graphics

In [None]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [None]:
#visualization libraries 
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

In [None]:
#Creating Word Cloud
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

In [None]:
#preparation for wordcloud
topics = lda_model.show_topics(formatted=False)
topic_words = dict(topics)

In [None]:
fig, axes = plt.subplots(2,3 , figsize=(15,10), sharex=True, sharey=True)
#fig.delaxes(ax[1,1])

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')