<a href="https://colab.research.google.com/github/hawc2/Text-Analysis-with-Python/blob/master/Topic_Modeling%20with%20Gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro to Topic Modeling with Gensim and pyLDAvis

This Colab Notebook adapts a simplified version of the this [tutorial](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/). This tutorial focuses on guiding you through importing the necessary packages, cleaning and processing text data from a spreadsheet of documents, and visualizing the topics in an interactive, web-based application.


If you would like to do more advanced topic modeling, including by integrating Mallet, testing for coherence of the model, and visualizing metrics, Gensim provides a wide array of resources.  

# Mount Drive

In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Mounted at /gdrive
/gdrive


# Upload Files

In [2]:
#from google.colab import files

#uploaded = files.upload()

#for fn in uploaded.keys():
#  print('User uploaded file "{name}" with length {length} bytes'.format(
#      name=fn, length=len(uploaded[fn])))

# Import CSV Data from Github

In [3]:
RTdata = 'https://raw.githubusercontent.com/hawc2/Text-Analysis-with-Python/master/RottenTomatoes.csv'

In [4]:
#SFdata = 'https://raw.githubusercontent.com/hawc2/Text-Analysis-with-Python/master/Scifi.csv'

# Convert RottenTomatoes.csv to Data Frame

In [4]:
import numpy as np
import pandas as pd

In [5]:
df = pd.read_csv(RTdata, usecols=['Username', 'content'], encoding = 'utf-8')

In [6]:
data = df.content.values.tolist()

### View Dataframe

In [None]:
df

In [None]:
%load_ext google.colab.data_table 
df

# Convert Scifi.CSV to Data Frame

In [5]:
#dfSF = pd.read_csv(SFdata, usecols=['BookChapter', 'text'], encoding = 'utf-8')

In [20]:
#dfSF['text']=dfSF['text'].apply(str)

In [21]:
#dataSF = dfSF.text.values.tolist()

# Clean Texts

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
!pip3 install spacy
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=6ac677507dc38c1fe79063e342114fd778453fec75a317fde0f4e04dfb2feb41
  Stored in directory: /tmp/pip-ephem-wheel-cache-01bfk_29/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [11]:
import spacy
import en_core_web_sm
#from spacy.lang.en import English
#parser = English()
#nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

In [12]:
import en_core_web_lg
nlp = en_core_web_lg.load()

In [13]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

In [14]:
import re
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]

In [15]:
def sent_to_words(sentences):
    for sentence in sentences:
      yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [16]:
bigram = gensim.models.Phrases(data_words, min_count=1, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [17]:
def remove_stopwords(texts):
   return [[word for word in simple_preprocess(str(doc))
if word not in stop_words] for doc in texts]
def make_bigrams(texts):
   return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
   return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
   texts_out = []
   for sent in texts:
     doc = nlp(" ".join(sent))
     texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
   return texts_out

In [18]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=[
   'NOUN', 'ADJ', 'VERB', 'ADV'
])

In [29]:
print(data_lemmatized[:4])

[['abandon', 'absence', 'absorb', 'accident', 'activate', 'address', 'admit', 'adversary', 'age', 'air', 'alarm', 'alert', 'already', 'amuse', 'angst', 'answering', 'anyhow_anyhow', 'apartment', 'argument', 'argument', 'arm', 'around_around', 'ask', 'attention', 'auto_auto', 'automatically', 'available', 'away_away', 'away', 'awful', 'awry', 'back', 'back', 'back', 'back', 'back', 'back', 'badly', 'banter', 'base', 'base', 'bathroom', 'beg', 'well', 'bill', 'bind', 'bit_bit', 'bitterly', 'blau_blau', 'blink', 'blonde', 'blonde', 'blue', 'book', 'bother', 'bother', 'bottle', 'breath', 'broadcasting', 'brush', 'building', 'button', 'cabinet_cabinet', 'call_calle', 'call', 'come', 'capable', 'capsule', 'car_car', 'car_car', 'car_car', 'car_car', 'car_car', 'car_car', 'car_car', 'car', 'carbon', 'car', 'catalyst', 'chair', 'change', 'chat', 'cheer', 'chronic', 'chute', 'clean', 'clem_clem', 'clem_clem', 'clem_clem', 'clem_clem', 'click', 'close', 'clown', 'coat', 'coffee', 'cold', 'comb', 

# Building Dictionary and Corpus

In [19]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(0, 1), (1, 1), (7, 1), (8, 5), (13, 2), (16, 23), (19, 2), (20, 1), (22, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 5), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 2), (65, 3), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 3), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102

# Create Topic Model

In [20]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=2,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

# Create Visualization (Save HTML)

The easiest way to create the visualization is to reveal it in the Google Colab notebook and save it as an html file that you can view on your browser. 

In [21]:
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 2.8MB/s 
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/66/89/479de0afbbfb98d1c4b887936808764627300208bb771fcd823403645a36/funcy-1.15-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=ae70f10872446de55ec98a6769473a51e9e79914c6951b927d834509d43ec384
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.15 pyLDAvis-2.1.2


In [22]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
#vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')

In [None]:
pyLDAvis.save_html(vis, '/content/LDAviz.html')

In [23]:
pyLDAvis.display(vis)

# Serve Visualization in Browser

You can also serve the visualization locally in the browser using the below chunk of code. Beware that caching in your browser and other issues, such as ad-blockers, may require some debugging to get this working on your machine. 

In [None]:
#pyLDAvis.enable_notebook()
#pyLDAvis.show(vis)