# Twitter Topic Modeling: Toronto Tweets
By: Jennifer Johnson

Twitter data filtered for Toronto Only
Reference: https://towardsdatascience.com/topic-modeling-in-pythoon-with-nltk-and-gensim-4ef03213cd21

In [1]:
import pandas as pd
import re
import spacy
import nltk
import string
import ast
import numpy as np
import seaborn as sns

import missingno as msno

from datetime import datetime
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

sns.set(style="whitegrid") 

In [2]:
spacy.load('en')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [3]:
from spacy.lang.en import English
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from translate import Translator

parser = English()
translator = Translator(to_lang="English")

In [4]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

In [5]:
# Filter out stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))

In [6]:
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

In [7]:
def processTweet(tweet):
    # process the tweets
    translator = str.maketrans('', '', string.punctuation)

    #Convert to lower case
    tweet = tweet.lower()
    
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','',tweet)
    
    ## Add
    # Update negations (isn't to is not)
    tweet = neg_pattern.sub(lambda x: negations_dic[x.group()], tweet)
    
    # Remove all non-alpha characers
    tweet = re.sub("[^a-zA-Z]", " ", tweet)    
    ##
    
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #trim
    tweet = tweet.strip('\'"')
    
    #remove punctuation from text
    tweet = tweet.translate(translator)    
    
    tweet = tweet.split()
    tweet = [get_lemma(token) for token in tweet]
    tweet = [token for token in tweet if token not in en_stop]
    tweet = ' '.join(tweet)
    return tweet

In [8]:
# Load dataframe
tweets_df = pd.read_csv('D:/development/CSDA1050/playground/twitter_jj/twitter_tweets_new.csv')
tweets_df = tweets_df[tweets_df['place_country']=='Canada']
tweets_df.shape

(299111, 24)

In [9]:
# changing tweet_text to String column (returned error about 'float')
# when full dataset was used
tweets_df['tweet_text'] = tweets_df['tweet_text'].astype(str)
tweets_df['tweet_clean'] = tweets_df['tweet_text'].apply(processTweet)

In [10]:
import pytz

In [11]:
def getTZ(tweet_date):
    est = pytz.timezone('US/Eastern')
    
    tweet_date = pd.Timestamp(tweet_date).tz_localize('UTC')
    tweet_date = tweet_date.astimezone(est)
    
    return tweet_date

In [12]:
# changing tweet_date to DateTime column
tweets_df.tweet_date = pd.to_datetime(tweets_df['tweet_date'])
tweets_df['tweet_date'] = tweets_df['tweet_date'].apply(getTZ)

In [13]:
tweets_df.head()

Unnamed: 0,emojis,hashtags,id,language,latitude,longitude,mentions,place_bbcoordinates,place_bbtype,place_country,...,reply_to_status,reply_to_user,sensitive,sentiment,source,tweet_date,tweet_text,user_id,user_name,tweet_clean
0,[':red_heart:'],,1107369998148788224,en,,,,"[[[-79.583667, 43.550871], [-79.583667, 43.550...",Polygon,Canada,...,,,False,"{'neg': 0.0, 'neu': 0.506, 'pos': 0.494, 'comp...",Twitter for iPhone,2019-03-17 15:55:56.155000-04:00,Loving the vibes :red_heart:️ https://t.co/X8v...,908049944040361984,wikki_87,love vibes red heart
1,[],,1107370010182324229,und,,,"['Rosalestri', 'Cnyari']","[[[-80.248423, 43.050553], [-80.248423, 43.470...",Polygon,Canada,...,1.107354e+18,Rosalestri,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",Twitter for Android,2019-03-17 15:55:59.024000-04:00,@Rosalestri @Cnyari Olé,16801721,njsh2008,ol
4,"[':fearful_face:', ':fearful_face:', ':flushed...",,1107370005883023360,ko,,,['BTS_twt'],"[[[-95.155898, 41.676329], [-95.155898, 56.852...",Polygon,Canada,...,,BTS_twt,False,"{'neg': 0.187, 'neu': 0.51, 'pos': 0.304, 'com...",Twitter for iPhone,2019-03-17 15:55:57.999000-04:00,"@BTS_twt 김태형, 밴드에이드? :fearful_face::fearful_fa...",828139192035196928,BangtanBabeXO,fearful face fearful face wa cut hope hurt muc...
5,[],,1107370047134224384,en,,,['MazharAbbasGEO'],"[[[-79.810142, 43.373074], [-79.810142, 43.737...",Polygon,Canada,...,1.107348e+18,MazharAbbasGEO,False,"{'neg': 0.423, 'neu': 0.577, 'pos': 0.0, 'comp...",Twitter for Android,2019-03-17 15:56:07.834000-04:00,@MazharAbbasGEO WHAT ABOUT THIS SHAMELESS http...,127687121,ZA_chaudhry,shameless
6,[],"['torontocontractors', 'renovation', 'interior...",1107370085608538113,en,43.7166,-79.3407,,"[[[-79.639319, 43.403221], [-79.639319, 43.855...",Polygon,Canada,...,,,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",Instagram,2019-03-17 15:56:17.007000-04:00,"Before and after, big different....\n.\n.\n.\n...",349291874,905RENO,big different torontocontractors renovation in...


##### Topic Model Periods
Solid periods, days without interruption in streaming
- 2019-03-19 to 2019-03-20 (2 days)
- 2019-03-23 to 2019-03-26 (4 days)
- 2019-04-01 to 2019-04-10 (9 days)

In [14]:
p1_start = '03-19-2019 00:00:00'
p1_end = '03-20-2019 23:59:59'

p2_start = '03-23-2019 00:00:00'
p2_end = '03-26-2019 23:59:59'

p3_start = '04-01-2019 00:00:00'
p3_end = '04-10-2019 23:59:59'

In [15]:
start_date = p1_start
end_date = p1_end
mask = (tweets_df['tweet_date'] > start_date) & (tweets_df['tweet_date'] < end_date)
in_range_df = tweets_df.loc[mask]
in_range_df.shape

(32621, 25)

In [16]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [17]:
text_data = []
for index, row in in_range_df.iterrows():
    tweet = row['tweet_clean']
    tokens = tokenize(tweet)
    tokens = [token for token in tokens if len(token) > 4]
    text_data.append(tokens)

In [18]:
from gensim import corpora
import pickle
import gensim
import pyLDAvis.gensim



In [19]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [20]:
NUM_TOPICS=10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=NUM_TOPICS, 
                                           id2word=dictionary, 
                                           passes=15
                                          )
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.132*"heart" + 0.078*"smiling" + 0.049*"medium" + 0.048*"hands"')
(1, '0.019*"check" + 0.016*"point" + 0.012*"trump" + 0.012*"people"')
(2, '0.020*"canada" + 0.012*"great" + 0.011*"finally" + 0.010*"agree"')
(3, '0.015*"student" + 0.015*"school" + 0.011*"class" + 0.011*"hahaha"')
(4, '0.107*"tears" + 0.046*"crying" + 0.039*"loudly" + 0.018*"raising"')
(5, '0.032*"spring" + 0.026*"happy" + 0.023*"toronto" + 0.020*"tonight"')
(6, '0.026*"would" + 0.025*"people" + 0.019*"think" + 0.019*"really"')
(7, '0.024*"thanks" + 0.023*"right" + 0.019*"please" + 0.015*"always"')
(8, '0.049*"pumper" + 0.039*"toronto" + 0.037*"general" + 0.034*"dispatch"')
(9, '0.066*"toronto" + 0.044*"ontario" + 0.033*"rolling" + 0.033*"laugh"')


In [21]:
#pyLDAvis: visualizing topics
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### Topic Analysis for March 19-20, 2019

## Period 2

In [22]:
start_date = p2_start
end_date = p2_end
mask = (tweets_df['tweet_date'] > start_date) & (tweets_df['tweet_date'] < end_date)
in_range_df = tweets_df.loc[mask]
in_range_df.shape

(64017, 25)

In [23]:
text_data = []
for index, row in in_range_df.iterrows():
    tweet = row['tweet_clean']
    tokens = tokenize(tweet)
    text_data.append(tokens)

In [24]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [25]:
NUM_TOPICS=10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=NUM_TOPICS, 
                                           id2word=dictionary, 
                                           passes=15
                                          )
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.037*"skin" + 0.037*"tone" + 0.032*"hands" + 0.032*"medium"')
(1, '0.198*"camera" + 0.048*"n" + 0.026*"rolling" + 0.024*"laugh"')
(2, '0.036*"pumper" + 0.030*"area" + 0.029*"general" + 0.029*"edt"')
(3, '0.050*"crying" + 0.050*"face" + 0.046*"loudly" + 0.022*"two"')
(4, '0.035*"toronto" + 0.033*"photo" + 0.025*"ontario" + 0.020*"fire"')
(5, '0.021*"like" + 0.018*"wa" + 0.016*"right" + 0.014*"hear"')
(6, '0.143*"face" + 0.069*"joy" + 0.068*"tears" + 0.027*"patient"')
(7, '0.023*"wa" + 0.016*"get" + 0.016*"one" + 0.013*"see"')
(8, '0.024*"cap" + 0.017*"place" + 0.014*"room" + 0.013*"os"')
(9, '0.053*"heart" + 0.048*"face" + 0.035*"smiling" + 0.033*"eyes"')


In [26]:
#pyLDAvis: visualizing topics
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Period 3

In [27]:
start_date = p3_start
end_date = p3_end
mask = (tweets_df['tweet_date'] > start_date) & (tweets_df['tweet_date'] < end_date)
in_range_df = tweets_df.loc[mask]
in_range_df.shape

(153273, 25)

In [28]:
text_data = []
for index, row in in_range_df.iterrows():
    tweet = row['tweet_clean']
    tokens = tokenize(tweet)
    text_data.append(tokens)

In [29]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [30]:
NUM_TOPICS=10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=NUM_TOPICS, 
                                           id2word=dictionary, 
                                           passes=15
                                          )
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.023*"thanks" + 0.018*"job" + 0.013*"new" + 0.012*"th"')
(1, '0.030*"pumper" + 0.026*"laugh" + 0.025*"floor" + 0.024*"area"')
(2, '0.025*"two" + 0.023*"right" + 0.016*"na" + 0.016*"mark"')
(3, '0.023*"amp" + 0.014*"u" + 0.012*"canada" + 0.010*"student"')
(4, '0.236*"face" + 0.062*"tears" + 0.061*"joy" + 0.048*"smiling"')
(5, '0.019*"heavy" + 0.017*"i" + 0.016*"mississauga" + 0.015*"cool"')
(6, '0.020*"good" + 0.019*"love" + 0.019*"guy" + 0.014*"happy"')
(7, '0.023*"wa" + 0.019*"like" + 0.017*"get" + 0.014*"know"')
(8, '0.024*"toronto" + 0.017*"great" + 0.013*"amp" + 0.012*"ontario"')
(9, '0.077*"heart" + 0.058*"skin" + 0.057*"tone" + 0.049*"medium"')


In [31]:
#pyLDAvis: visualizing topics
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
