# Twitter Topic Modeling: Toronto Tweets
By: Jennifer Johnson

Twitter data filtered for Toronto Only
Reference: https://towardsdatascience.com/topic-modeling-in-pythoon-with-nltk-and-gensim-4ef03213cd21

In [1]:
import pandas as pd
import re
import spacy
import nltk
import string
import ast
import numpy as np
import seaborn as sns

import missingno as msno

from datetime import datetime
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

sns.set(style="whitegrid") 

In [2]:
spacy.load('en')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jennifer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jennifer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jennifer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from spacy.lang.en import English
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from translate import Translator

parser = English()
translator = Translator(to_lang="English")

In [4]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

In [5]:
# Filter out stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))

In [6]:
def processTweet(tweet):
    # process the tweets
    translator = str.maketrans('', '', string.punctuation)

    #Convert to lower case
    tweet = tweet.lower()
    
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','USER',tweet)
    
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #trim
    tweet = tweet.strip('\'"')
    
    #remove punctuation from text
    tweet = tweet.translate(translator)
    
    tweet = tweet.split()
    tweet = [get_lemma(token) for token in tweet]
    tweet = [token for token in tweet if token not in en_stop]
    tweet = ' '.join(tweet)
    return tweet

In [7]:
# Load dataframe
tweets_df = pd.read_csv('D:/development/CSDA1050/playground/twitter_jj/twitter_tweets_new.csv')
tweets_df = tweets_df[tweets_df['place_country']=='Canada']
tweets_df.shape

(299111, 24)

In [8]:
# changing tweet_text to String column (returned error about 'float')
# when full dataset was used
tweets_df['tweet_text'] = tweets_df['tweet_text'].astype(str)
tweets_df['tweet_clean'] = tweets_df['tweet_text'].apply(processTweet)

In [9]:
import pytz

In [10]:
def getTZ(tweet_date):
    est = pytz.timezone('US/Eastern')
    
    tweet_date = pd.Timestamp(tweet_date).tz_localize('UTC')
    tweet_date = tweet_date.astimezone(est)
    
    return tweet_date

In [11]:
# changing tweet_date to DateTime column
tweets_df.tweet_date = pd.to_datetime(tweets_df['tweet_date'])
tweets_df['tweet_date'] = tweets_df['tweet_date'].apply(getTZ)

In [12]:
tweets_df.head()

Unnamed: 0,emojis,hashtags,id,language,latitude,longitude,mentions,place_bbcoordinates,place_bbtype,place_country,...,reply_to_status,reply_to_user,sensitive,sentiment,source,tweet_date,tweet_text,user_id,user_name,tweet_clean
0,[':red_heart:'],,1107369998148788224,en,,,,"[[[-79.583667, 43.550871], [-79.583667, 43.550...",Polygon,Canada,...,,,False,"{'neg': 0.0, 'neu': 0.506, 'pos': 0.494, 'comp...",Twitter for iPhone,2019-03-17 15:55:56.155000-04:00,Loving the vibes :red_heart:️ https://t.co/X8v...,908049944040361984,wikki_87,love vibes redheart️ URL
1,[],,1107370010182324229,und,,,"['Rosalestri', 'Cnyari']","[[[-80.248423, 43.050553], [-80.248423, 43.470...",Polygon,Canada,...,1.107354e+18,Rosalestri,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",Twitter for Android,2019-03-17 15:55:59.024000-04:00,@Rosalestri @Cnyari Olé,16801721,njsh2008,USER USER olé
4,"[':fearful_face:', ':fearful_face:', ':flushed...",,1107370005883023360,ko,,,['BTS_twt'],"[[[-95.155898, 41.676329], [-95.155898, 56.852...",Polygon,Canada,...,,BTS_twt,False,"{'neg': 0.187, 'neu': 0.51, 'pos': 0.304, 'com...",Twitter for iPhone,2019-03-17 15:55:57.999000-04:00,"@BTS_twt 김태형, 밴드에이드? :fearful_face::fearful_fa...",828139192035196928,BangtanBabeXO,USER 김태형 밴드에이드 fearfulfacefearfulface wa cut h...
5,[],,1107370047134224384,en,,,['MazharAbbasGEO'],"[[[-79.810142, 43.373074], [-79.810142, 43.737...",Polygon,Canada,...,1.107348e+18,MazharAbbasGEO,False,"{'neg': 0.423, 'neu': 0.577, 'pos': 0.0, 'comp...",Twitter for Android,2019-03-17 15:56:07.834000-04:00,@MazharAbbasGEO WHAT ABOUT THIS SHAMELESS http...,127687121,ZA_chaudhry,USER shameless URL
6,[],"['torontocontractors', 'renovation', 'interior...",1107370085608538113,en,43.7166,-79.3407,,"[[[-79.639319, 43.403221], [-79.639319, 43.855...",Polygon,Canada,...,,,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",Instagram,2019-03-17 15:56:17.007000-04:00,"Before and after, big different....\n.\n.\n.\n...",349291874,905RENO,big different torontocontractors renovation in...


##### Topic Model Periods
Solid periods, days without interruption in streaming
- 2019-03-19 to 2019-03-20 (2 days)
- 2019-03-23 to 2019-03-26 (4 days)
- 2019-04-01 to 2019-04-10 (9 days)

In [13]:
p1_start = '03-19-2019 00:00:00'
p1_end = '03-20-2019 23:59:59'

p2_start = '03-23-2019 00:00:00'
p2_end = '03-26-2019 23:59:59'

p3_start = '04-01-2019 00:00:00'
p3_end = '04-10-2019 23:59:59'

In [14]:
start_date = p1_start
end_date = p1_end
mask = (tweets_df['tweet_date'] > start_date) & (tweets_df['tweet_date'] < end_date)
in_range_df = tweets_df.loc[mask]
in_range_df.shape

(32621, 25)

In [22]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [23]:
text_data = []
for index, row in in_range_df.iterrows():
    tweet = row['tweet_clean']
    tokens = tokenize(tweet)
    tokens = [token for token in tokens if len(token) > 4]
    text_data.append(tokens)

In [24]:
from gensim import corpora
import pickle
import gensim
import pyLDAvis.gensim

In [25]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [26]:
NUM_TOPICS=10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=NUM_TOPICS, 
                                           id2word=dictionary, 
                                           passes=15
                                          )
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.105*"toronto" + 0.040*"ontario" + 0.018*"check" + 0.014*"mississauga"')
(1, '0.017*"little" + 0.014*"celebrate" + 0.011*"tomorrow" + 0.010*"season"')
(2, '0.017*"thanks" + 0.017*"great" + 0.015*"tonight" + 0.015*"first"')
(3, '0.022*"trump" + 0.014*"right" + 0.010*"wethenorth" + 0.010*"crazy"')
(4, '0.039*"general" + 0.037*"dispatch" + 0.031*"200319" + 0.024*"street"')
(5, '0.012*"going" + 0.011*"people" + 0.010*"please" + 0.010*"hahaha"')
(6, '0.046*"happy" + 0.022*"canada" + 0.018*"woman" + 0.013*"birthday"')
(7, '0.042*"would" + 0.033*"spring" + 0.028*"really" + 0.016*"think"')
(8, '0.029*"people" + 0.017*"world" + 0.014*"awesome" + 0.014*"medium"')
(9, '0.014*"thank" + 0.013*"could" + 0.011*"follow" + 0.011*"story"')


In [27]:
#pyLDAvis: visualizing topics
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### Topic Analysis for March 19-20, 2019

## Period 2

In [29]:
start_date = p2_start
end_date = p2_end
mask = (tweets_df['tweet_date'] > start_date) & (tweets_df['tweet_date'] < end_date)
in_range_df = tweets_df.loc[mask]
in_range_df.shape

(64017, 25)

In [30]:
text_data = []
for index, row in in_range_df.iterrows():
    tweet = row['tweet_clean']
    tokens = tokenize(tweet)
    text_data.append(tokens)

In [31]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [32]:
NUM_TOPICS=10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=NUM_TOPICS, 
                                           id2word=dictionary, 
                                           passes=15
                                          )
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.051*"ford" + 0.024*"purpleday" + 0.017*"news" + 0.014*"ufc"')
(1, '0.030*"are" + 0.024*"solid" + 0.023*"denis" + 0.023*"stef"')
(2, '0.029*"there" + 0.029*"na" + 0.029*"room" + 0.028*"eat"')
(3, '0.462*"user" + 0.020*"url" + 0.010*"thank" + 0.008*"patient"')
(4, '0.022*"become" + 0.019*"brother" + 0.019*"especially" + 0.018*"trumpisaliar"')
(5, '0.024*"’s" + 0.020*"wa" + 0.018*"n’t" + 0.014*"get"')
(6, '0.124*"url" + 0.024*"…" + 0.023*"toronto" + 0.018*"ontario"')
(7, '0.056*"url" + 0.017*"job" + 0.015*"might" + 0.015*"toronto"')
(8, '0.039*"url" + 0.037*"photo" + 0.027*"area" + 0.026*"general"')
(9, '0.089*"i" + 0.035*"m" + 0.031*"’m" + 0.019*"u"')


In [33]:
#pyLDAvis: visualizing topics
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Period 3

In [34]:
start_date = p3_start
end_date = p3_end
mask = (tweets_df['tweet_date'] > start_date) & (tweets_df['tweet_date'] < end_date)
in_range_df = tweets_df.loc[mask]
in_range_df.shape

(153273, 25)

In [None]:
text_data = []
for index, row in in_range_df.iterrows():
    tweet = row['tweet_clean']
    tokens = tokenize(tweet)
    text_data.append(tokens)

In [None]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [None]:
NUM_TOPICS=10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=NUM_TOPICS, 
                                           id2word=dictionary, 
                                           passes=15
                                          )
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.018*"e" + 0.015*"v" + 0.014*"true" + 0.014*"player"')
(1, '0.018*"url" + 0.011*"pay" + 0.011*"follow" + 0.010*"tax"')
(2, '0.583*"user" + 0.042*"url" + 0.011*"thanks" + 0.010*"thank"')
(3, '0.038*"area" + 0.036*"general" + 0.035*"edt" + 0.035*"dispatch"')
(4, '0.088*"i" + 0.033*"’m" + 0.033*"m" + 0.032*"“"')
(5, '0.014*"medical" + 0.013*"he" + 0.008*"final" + 0.008*"و"')
(6, '0.129*"url" + 0.022*"…" + 0.018*"toronto" + 0.013*"amp"')
(7, '0.033*"de" + 0.020*"le" + 0.020*"la" + 0.017*"que"')
(8, '0.029*"that" + 0.023*"s" + 0.012*"na" + 0.012*"cdnpoli"')
(9, '0.050*"user" + 0.017*"wa" + 0.017*"’s" + 0.014*"n’t"')


In [None]:
#pyLDAvis: visualizing topics
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)