# Topic Modeling with NLTK and Gensim
Reference: https://towardsdatascience.com/topic-modeling-in-python-with-nltk-and-gensim-4ef03213cd21

#### Text Cleaning
Clean text and return list of tokens

In [5]:
import spacy

In [8]:
# Load spaCy's 'en' model. The returned object is not bound to a name here.
# NOTE(review): the 'en' shortcut name may not resolve on newer spaCy releases — verify.
spacy.load('en')

<spacy.lang.en.English at 0x1250cfb3f98>

In [9]:
from spacy.lang.en import English
# `parser` is the callable that tokenize() applies to raw text to obtain spaCy tokens.
parser = English()

In [31]:
def tokenize(text):
    """Turn *text* into a list of normalised token strings for LDA.

    The module-level ``parser`` is called on ``text`` and each resulting
    token is handled as follows:

    * whitespace-only tokens are skipped;
    * tokens flagged ``like_url`` are replaced by the placeholder ``'URL'``;
    * tokens whose text starts with ``'@'`` are replaced by ``'SCREEN_NAME'``;
    * every other token contributes its lower-cased text (``lower_``).

    Returns the resulting strings in their original order.
    """
    def _normalise(tok):
        # URL check comes before the '@' check, so a URL-like token is
        # always reported as 'URL'.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [32]:
import nltk

In [33]:
# Download the WordNet corpus; the lemma helpers below go through nltk's wordnet module.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jennifer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
from nltk.corpus import wordnet as wn

In [35]:
def get_lemma(word):
    """Return the WordNet base form of *word*.

    ``wn.morphy`` is asked for the base form; when it returns ``None``
    the input word is handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [36]:
from nltk.stem.wordnet import WordNetLemmatizer

In [37]:
def get_lemma2(word):
    """Return *word* lemmatized by NLTK's ``WordNetLemmatizer``.

    A new lemmatizer instance is created on every call.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [38]:
# Download NLTK's stopword lists; the English list is used to filter stopwords out of the tokens.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jennifer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
# English stopwords held in a set; prepare_text_for_lda tests each token against it.
en_stop = set(nltk.corpus.stopwords.words('english'))

In [40]:
# define function to prepare the text for topic modelling
def prepare_text_for_lda(text):
    """Convert raw *text* into the list of tokens fed to the topic model.

    The text is split with ``tokenize``; a token is kept only when it is
    longer than 4 characters and is not in ``en_stop``. Each kept token is
    then passed through ``get_lemma``.

    Note that the length filter also removes the 3-character ``'URL'``
    placeholder produced by ``tokenize``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [66]:
import pandas as pd
import datetime

In [96]:
# Load the tweet dataset into a dataframe and keep only tweets whose
# place_country is 'Canada'.
tweets_df = pd.read_csv('D:/development/CSDA1050/playground/twitter_jj/twitter_tweets_new.csv')
# .copy() makes the filtered frame independent of the full frame, so the
# column assignment below is a plain write and not a chained assignment.
tweets_df = tweets_df[tweets_df['place_country'] == 'Canada'].copy()
# Parse the timestamp column so it can be compared against date bounds.
# Bracket assignment is used because attribute-style assignment only works
# for columns that already exist.
tweets_df['tweet_date'] = pd.to_datetime(tweets_df['tweet_date'])

In [97]:
# Keep only tweets strictly inside the window: both comparisons are exclusive,
# so rows stamped exactly at start_date or end_date are dropped.
# NOTE(review): the bounds are plain strings compared against the parsed
# tweet_date column; they look like month-day-year — confirm pandas reads them that way.
start_date = '03-19-2019 01:00:00'
end_date = '03-20-2019 01:00:00'
mask = (tweets_df['tweet_date'] > start_date) & (tweets_df['tweet_date'] < end_date)
in_range_df = tweets_df.loc[mask]

In [99]:
# Notebook display: (rows, columns) of the date-filtered frame.
in_range_df.shape

(15533, 24)

In [100]:
# Run every tweet's text through prepare_text_for_lda; text_data ends up as
# one token list per row of in_range_df, in row order.
import random
# Reading the 'tweet_text' column directly avoids iterrows(), which
# materialises a full row object per tweet only to read this one field.
text_data = [prepare_text_for_lda(tweet) for tweet in in_range_df['tweet_text']]

In [101]:
# LDA with Gensim
# convert to bag of words corpus and save the dictionary and corpus for future use
from gensim import corpora

In [102]:
# Build the gensim Dictionary from all token lists, then convert each token
# list to its bag-of-words form with doc2bow.
dictionary = corpora.Dictionary(text_data)
corpus = list(map(dictionary.doc2bow, text_data))

In [103]:
import pickle

In [104]:
# Persist the bag-of-words corpus and the dictionary for later cells.
# The with-block closes the file as soon as the dump finishes (or fails),
# so the pickle is completely written before anything reads it back.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [105]:
# we are asking LDA to find 5 topics in the data:
import gensim

In [106]:
# Topic count handed to LdaModel as num_topics.
NUM_TOPICS=5

In [107]:
# Fit an LDA model on the bag-of-words corpus (15 passes over it) and write
# the trained model to disk.
# NOTE(review): no random_state is passed — presumably the topics can differ
# from run to run; confirm if reproducibility matters.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=15,
)
ldamodel.save('model5.gensim')

In [108]:
# Print each topic as an (id, terms) tuple; num_words=4 is passed to
# print_topics for each topic's term string.
topics = ldamodel.print_topics(num_words=4)
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

(0, '0.006*"woman" + 0.005*"hahaha" + 0.005*"canada" + 0.005*"apply"')
(1, '0.032*"SCREEN_NAME" + 0.029*"toronto" + 0.015*"general" + 0.015*"dispatch"')
(2, '0.344*"SCREEN_NAME" + 0.006*"would" + 0.006*"people" + 0.005*"great"')
(3, '0.005*"amber" + 0.005*"budget2019" + 0.005*"action" + 0.004*"amberalert"')
(4, '0.008*"happy" + 0.008*"toronto" + 0.006*"ontario" + 0.005*"budget"')


In [109]:
# pyLDAvis: visualizing topics
# Reload the dictionary, corpus and model from the files written earlier in
# this notebook.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: pickle.load can execute code embedded in the file — only load a
# corpus.pkl that this notebook itself produced.
# The with-block closes the file handle once the corpus has been read.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [111]:
# Prepare the pyLDAvis data for the reloaded model, corpus and dictionary, then display it.
# sort_topics=False is passed to prepare(); presumably this keeps pyLDAvis's topic
# numbering aligned with the model's own topic ids — confirm against the pyLDAvis docs.
# NOTE(review): newer pyLDAvis releases may expose this module under a different
# name (pyLDAvis.gensim_models) — verify for the installed version.
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  """
  """
  """


Saliency: a measure of how much the term tells you about the topic
Relevance: a weighted average of the (log) probability of the word given the topic and that same probability normalized by the overall probability of the word across the corpus (its lift).

Size of bubble measures the importance of the topics, relative to the data

In [114]:
# NOTE(review): despite the "3" suffix, this reloads 'model5.gensim' — the same file
# that was loaded into `lda` above — so this display is built from the same saved model.
# If a separately trained model was intended, no such model is saved in the notebook as shown.
lda3 = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
