In [None]:
#title:Short text topic modeling
#author: Jing-Huei Huang
#date: May 26, 2022

In [None]:
%config Completer.use_jedi = False
import warnings
warnings.filterwarnings("ignore")#, category=DeprecationWarning)
#warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [None]:
import os
import csv
import nltk
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.max_rows = 999
import numpy as np
import glob
import re
import string
import seaborn as sbn
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import preprocessor as p
from ekphrasis.classes.segmenter import Segmenter
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
from datetime import datetime
import json
from pprintpp import pprint
from PIL import Image
%matplotlib inline

sbn.set(style='ticks', font_scale=1.5)

In [None]:
# Read tweets raw datasets - geotagged tweets - pre-COVID
dfgeo1_pre = pd.read_csv('../Tweet_preprocessing/bronx/bronx_geo_pre1_N5035.csv', header=5)
dfgeo2_pre = pd.read_csv('../Tweet_preprocessing/bronx/bronx_geo_pre2_N1273.csv', header=5)

#combine datasets
dfgeo_pre  = pd.concat([dfgeo1_pre, dfgeo2_pre], axis=0).reset_index()

# Read tweets raw dataset - tweets containing park names
dfkey_pre = pd.read_csv('../Tweet_preprocessing/bronx/bronx_kw_pre_N906.csv', header=5)


In [None]:
# Read tweets raw datasets - geotagged tweets - post-COVID
dfgeo1_post = pd.read_csv('../Tweet_preprocessing/bronx/bronx_geo_post1_N2775.csv', header=5)
dfgeo2_post = pd.read_csv('../Tweet_preprocessing/bronx/bronx_geo_post2_N611.csv', header=5)

#combine datasets
dfgeo_post  = pd.concat([dfgeo1_post, dfgeo2_post], axis=0).reset_index()

# Read tweets raw dataset - tweets containing park names
dfkey_post = pd.read_csv('../Tweet_preprocessing/bronx/bronx_kw_post_N944.csv', header=5)



In [None]:
# select columns

def colsel(df):
    """Select, rename and lightly normalise the tweet-export columns used downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export containing at least the columns listed in ``source_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``Date`` (ascending) with a fresh 0..n-1 index,
        the renamed columns, plus:
        * ``id``          - last path segment of ``Url``;
        * ``hashtag_seg`` - ``Hashtags`` split on ',' (NaN where ``Hashtags`` is null).
        Missing ``Twitter Retweet of`` values are replaced with 0.

    The input frame is left untouched.
    """
    source_cols = ['Url', 'Date', 'Gender', 'Account Type', 'Twitter Verified', 'Author', 'Twitter Author ID', 'Hashtags', 'Longitude', 'Latitude', 'City','Location Name', 'Sentiment', 'Twitter Retweet of', 'Full Text']
    renamed_cols = ['Url', 'Date', 'Gender', 'Account Type', 'Twitter Verified', 'Author', 'userID', 'Hashtags', 'Longitude', 'Latitude', 'City','Location Name', 'Sentiment', 'Twitter Retweet of', 'fulltext']

    # Work on a copy so the caller's frame does not silently get its 'Date'
    # column converted as a side effect of calling this function.
    out = df.copy()
    out['Date'] = pd.to_datetime(out['Date'])
    # drop=True: the old index is not needed (it was discarded by the column
    # selection below anyway), so do not materialise it as a column.
    out = out.sort_values(by='Date', ascending=True).reset_index(drop=True)
    # Explicit copy so the assignments below write to an independent frame
    # rather than to a slice of the sorted one.
    out = out[source_cols].copy()
    out.columns = renamed_cols

    out['id'] = out['Url'].apply(lambda url: url.split('/')[-1])
    out['Twitter Retweet of'] = out['Twitter Retweet of'].fillna(0)
    # Only rows that actually have hashtags are split; the rest stay NaN
    # through index alignment on assignment.
    has_tags = out['Hashtags'].notnull()
    out['hashtag_seg'] = out.loc[has_tags, 'Hashtags'].apply(lambda tags: tags.split(','))
    return out

dfkey_pre = colsel(dfkey_pre)
dfgeo_pre = colsel(dfgeo_pre)

dfkey_post = colsel(dfkey_post)
dfgeo_post = colsel(dfgeo_post)

In [None]:
print(dfkey_pre.shape)
print(dfgeo_pre.shape)

print(dfkey_post.shape)
print(dfgeo_post.shape)


In [None]:
print(len(dfkey_pre['userID'].unique()))
print(len(dfgeo_pre['userID'].unique()))

print(len(dfkey_post['userID'].unique()))
print(len(dfgeo_post['userID'].unique()))

## Next step: identify tweets geotagged in UG areas

In [None]:
# Read dbf files to extract overlapping tweets
from simpledbf import Dbf5

def arc(dfgeo, f, pp):
    """Keep only the tweets whose ``id`` is listed in a DBF file.

    dfgeo : DataFrame with at least ``id`` and ``userID`` columns.
    f     : path of the DBF file; its ``tweetID`` column is the allow-list.
    pp    : label printed in front of the summary lines.

    Prints the size of the retained subset and its number of distinct users,
    then returns that subset.
    """
    park_ids = Dbf5(f).to_dataframe()['tweetID']
    in_park = dfgeo[dfgeo['id'].isin(park_ids)]

    print(pp+' : identify tweets in UG areas.')
    print(in_park.shape, 'tweets in df')
    print(len(in_park['userID'].unique()), 'unique user ID in df')
    return in_park

dfgeo_pre = arc(dfgeo_pre, '../Tweet_preprocessing/bronx/XYbronx_geo_inpark_pre_N2766.dbf', 'preCOVID')
dfgeo_post = arc(dfgeo_post, '../Tweet_preprocessing/bronx/XYbronx_geo_inpark_post_N1133.dbf', 'postCOVID')

In [None]:
# remove duplicate tweets
dfpre  = pd.concat([dfkey_pre, dfgeo_pre], axis=0).reset_index()
dfpre = dfpre[~dfpre.duplicated('id')]

dfpost  = pd.concat([dfkey_post, dfgeo_post], axis=0).reset_index()
dfpost = dfpost[~dfpost.duplicated('id')]

dfkey = pd.concat([dfkey_pre, dfkey_post], axis=0).reset_index()
dfgeo = pd.concat([dfgeo_pre, dfgeo_post], axis=0).reset_index()

print(dfpre.shape)
print(dfpost.shape)

dfpre['COVID'] = 'pre'
dfpost['COVID'] = 'post'

dfall = pd.concat([dfpre, dfpost], axis=0).reset_index(drop=True)

In [None]:
# create variables
# inKeywords / inGeo: whether the tweet id appears in the keyword-search frame
# (dfkey) or in the geotag frame (dfgeo) built in the de-duplication cell.
dfall['inKeywords'] = dfall['id'].isin(dfkey['id'])
dfall['inGeo']      = dfall['id'].isin(dfgeo['id'])
# inLong: True when Longitude is anything other than 0.
# NOTE(review): a missing (NaN) Longitude also compares as != 0 and is therefore
# flagged True - confirm that the export encodes "no coordinates" as 0.
dfall['inLong']     = dfall['Longitude']!=0

In [None]:
# Select tweets that meet criteria - from individual accounts, excluded tweets from verified accounts and retweets
# `~` is the boolean-negation operator for a pandas mask; unary `-` only behaves
# like a logical NOT through a pandas special case for bool columns.
# NOTE(review): assumes 'Twitter Verified' was parsed as a bool column - confirm.
is_individual = dfall['Account Type'] == 'individual'
is_original = dfall['Twitter Retweet of'] == 0   # colsel() fills "not a retweet" with 0
dfall = dfall[is_individual & ~dfall['Twitter Verified'] & is_original]
dfall.shape

## 1st Cleaning tweets: remove bot/ad/scam tweets

In [None]:
# Add columns for num_vocab, num_userID, num_RedunTweet

In [None]:
dfall['original_fulltext'] = dfall['fulltext']

# remove junk tweets

In [None]:
tweetRemoveKeys = ['amazon ccbeauty','ccbeauty flash','focus foundation',
                   'gel liner','flash palette','coffee wetnwildbeauty',
                   'water outage','photo focus','hot water',
                   'wetnwildbeauty photo', 'residential tenant',
                   'bronx park east']
                   #'@ bronx park east',
                   #'bronx park east station',
                   #'near bronx park east',
                   #'bronx park east:',
                   #'approaching bronx park east',
                   #'at bronx park east'] 

# Drop every tweet whose original text contains one of the junk key phrases
# (case-insensitive), reporting what was removed after each phrase.
for tk in tweetRemoveKeys:
    # na=False: a tweet with missing text counts as "no match", so the mask is
    # always a clean boolean Series that can be used for indexing and negation.
    matched = dfall['original_fulltext'].str.contains(tk, flags=re.IGNORECASE, na=False)
    print('Current keywords to romove: %s' % tk)
    print('Keywords matched: %d tweets' % matched.sum())
    print('Matched tweet full text\n')
    print(dfall[matched]['original_fulltext'].values)
    # `~` (not unary `-`) is the boolean negation of a mask.
    dfall = dfall[~matched].reset_index(drop=True)
    print('Remained dfall size: %d tweets' % dfall.shape[0])
    print('*****************************\n')

In [None]:
# Code to get the number of redundant tweet per userID
# Also exclude 

dfall['num_RedunTweet_perUser'] = dfall.groupby(['userID', 'fulltext'])['id'].transform('count')

In [None]:
# code to get tweets posted by users who have more than 2 redundant tweets 
dfall['MoreThan2RedunTweets_perUser']= dfall['num_RedunTweet_perUser']>2
dfall['MoreThan2RedunTweets_perUser'].value_counts()

In [None]:
def _count_cutoff(counts_desc, num_users, fraction):
    """Return the tweet count found at rank ceil(num_users * fraction) of a descending list.

    The rank is clamped to the last element so that very small user sets do not
    index past the end of the list; ``None`` is returned for an empty list.
    """
    if not counts_desc:
        return None
    rank = int(np.ceil(num_users * fraction))
    return counts_desc[min(rank, len(counts_desc) - 1)]


def topUremove(df):
    """Flag tweets written by the most prolific users.

    Adds three columns to a copy of ``df`` and returns that copy:
    * ``numTweet_perUser`` - number of tweets (non-null ``id``) per ``userID``;
    * ``rank5remove``      - True when the user's count is >= the 5% cutoff;
    * ``rank1remove``      - True when the user's count is >= the 1% cutoff.

    The cutoff for a fraction ``p`` is the count at position ``ceil(n_users * p)``
    (0-based) in the per-user counts sorted in descending order.
    NOTE(review): the printed message says "greater than" while the flag uses
    ``>=``, and the 0-based position selects the user just after the top
    ``ceil(n * p)`` - confirm which of the two is the intended definition.
    """
    # Copy: callers pass a filtered slice of a larger frame, and the column
    # assignments below must not write back into (or warn about) that slice.
    df = df.copy()

    # Number of tweet per user in current (or remained) dataframe
    df['numTweet_perUser'] = df.groupby(['userID'])['id'].transform('count')

    num_users = len(df['userID'].unique())
    counts_desc = sorted(df.groupby('userID')['numTweet_perUser'].first().to_list(), reverse=True)

    for pct, fraction, flag in ((5, 0.05, 'rank5remove'), (1, 0.01, 'rank1remove')):
        cutoff = _count_cutoff(counts_desc, num_users, fraction)
        if cutoff is None:
            # No users at all: nothing can be flagged.
            df[flag] = False
            continue
        print('Top %d%% User removed when numTweet_perUser is greater than %d' % (pct, cutoff))
        df[flag] = df['numTweet_perUser'] >= cutoff
        print(df[flag].value_counts())

    return df

dfe_tmp = topUremove(dfall[dfall['COVID']=='pre'])
dfo_tmp = topUremove(dfall[dfall['COVID']=='post'])
dfall = pd.concat([dfe_tmp, dfo_tmp], axis=0).reset_index(drop=True)

In [None]:
# Add column that show number of words in fulltext 
dfall['num_vocab'] = dfall['fulltext'].apply(lambda x: len(x.split(' ')) if x else 0)
dfall['atleast_3_vocab']=dfall['num_vocab']>=3
dfall['atleast_3_vocab'].value_counts()

In [None]:
# select pre or post here
# Keep tweets that are not part of a >2-times repeated text by the same user,
# not from a top-1% user, have at least 3 space-separated words, and belong to
# the chosen period. `~` is used for boolean negation of the flag columns.
keep_mask = (
    ~dfall['MoreThan2RedunTweets_perUser']
    & ~dfall['rank1remove']
    & (dfall['num_vocab']>=3)
    & (dfall['COVID']=='pre')
)
dfselo = dfall[keep_mask]
dfselo.shape

In [None]:
#dfselo['Author'].value_counts()

In [None]:
# print wordcloud
tweet_ALL = " ".join(t for t in dfselo['fulltext'])
fig, ax = plt.subplots(1,1, figsize=(30,10))
#cloud_mask = np.array(Image.open("shapes\cloud1.png"))
wc = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(tweet_ALL)
ax.imshow(wc, interpolation='bilinear')

## 2nd Cleaning text content of tweets

# Create dataframe for a park

In [None]:
# create dataframe for a park
dfsel_park = dfselo.copy(deep=True)


In [None]:
dfsel_park.shape

In [None]:
# Tweet cleaning - extract hashtags
#dfsel_park['hashtag'] = dfsel_park['fulltext'].apply(lambda x: re.findall(r"#(\w+)", x))
# Part of ekphrasis package - segmenter and download twitter word stats
seg_tw = Segmenter(corpus="twitter")
# Apply each hashtag to segmenter and return to a new column

dfsel_park['Hashtags'] = dfsel_park['Hashtags'].fillna(' ')
dfsel_park['hashtag_seg'] = dfsel_park['Hashtags'].apply(lambda x: x.replace(' ','').split(','))

In [None]:
# Need validation if most of tweets don't have hashtags

(dfsel_park.astype(str)['hashtag_seg'] == '[\'\']').value_counts()

In [None]:
import nltk
from nltk import word_tokenize, FreqDist
fdist = FreqDist([e for l in dfsel_park['hashtag_seg'] for e in l])
fdist.most_common()

In [None]:
dffreq = pd.DataFrame.from_records(fdist.most_common(), columns=['word', 'freq'])
dffreq.to_csv('../output/pre/Bronx/Bronx_final_hashtagFreq.csv')

In [None]:
# print wordcloud
# Raw string: '\#' is not a valid escape sequence in a normal string literal.
# The set of stripped characters (backslash and '#') is unchanged.
tweet_ALL = " ".join([e.strip(r'\#').lower() for l in dfsel_park['hashtag_seg'] for e in l if len(e)>0])
fig, ax = plt.subplots(1,1, figsize=(20,5))
wc = WordCloud(max_font_size=80, max_words=100, background_color="white").generate(tweet_ALL)
ax.imshow(wc, interpolation='bilinear')
fig.savefig('../output/pre/Bronx/20210619_bronxPark_Hashtag_wordcloud.png', dpi=300)

In [None]:
# Tweet cleaning - tweet-preprocessor 0.6.0 
# Executed cleaning using default, which went through all options
#URL	p.OPT.URL
#Mention	p.OPT.MENTION
#Hashtag	p.OPT.HASHTAG
#Reserved Wordsp.OPT.RESERVED
#Emoji	p.OPT.EMOJI
#Smiley	p.OPT.SMILEY
#Number	p.OPT.NUMBER

dfsel_park['cleantext'] = dfsel_park['fulltext'].apply(lambda x: p.clean(x))

In [None]:
# Count how many tweets share the same cleaned text. The grouping key is
# 'cleantext' only (userID is not part of it), so despite the column name the
# count is taken across all users.
# Then keep only tweets whose cleaned text occurs exactly once: every copy of a
# repeated text is dropped, including its first occurrence.

dfsel_park['num_RedunCleanTweet_perUser'] = dfsel_park.groupby(['cleantext'])['id'].transform('count')
dfsel_park = dfsel_park[dfsel_park['num_RedunCleanTweet_perUser']<2]

In [None]:
# utc=True only marks/converts the timestamps as UTC; convert to US/Eastern so
# the column really holds Eastern time, as its name says (the last cell of the
# notebook treats 'Date' the same way: UTC -> US/Eastern).
dfsel_park['time_EST'] = pd.to_datetime(dfsel_park['Date'],  utc=True).dt.tz_convert('US/Eastern')

In [None]:
dfsel_park['only_date'] = dfsel_park['time_EST'].apply(lambda x: x.date())

In [None]:
dfsel_park['only_date']

In [None]:
fig, ax = plt.subplots(1,1, figsize=(15,4))
dftmp = dfsel_park.reset_index(drop=True)
#dftmp = dftmp.pivot_table(index=['year', 'month', 'date'], values='id', aggfunc='count')
idx = pd.date_range("2019-03-01", periods=300)
dftmp['only_date']
idnum = []
for d in idx:
    idnum.append((dftmp['only_date']==d.date()).sum())
    #print(d.date(), (dftmp['only_date']==d.date()).sum())
ax.bar(idx, idnum)

In [None]:
fig, ax = plt.subplots(1,1, figsize=(15,4))
keyword = 'bronx'
dftmp = dfsel_park[dfsel_park['fulltext'].str.contains(keyword, flags=re.I)].reset_index(drop=True)
#dftmp = dftmp.pivot_table(index=['year', 'month', 'date'], values='id', aggfunc='count')
idx = pd.date_range("2019-03-01", periods=153)
idnum = []
for d in idx:
    idnum.append((dftmp['only_date']==d.date()).sum())
    #print(d.date(), (dftmp['only_date']==d.date()).sum())
ax.bar(idx, idnum)

In [None]:
# [Optional step]
# Concatenate cleantext and hashtag_seg
# Maybe later

In [None]:
# manually remove high frequency keywords that may skew the result
# Change to park name before replace
# dfsel_park['cleantext'] = dfsel_park['cleantext'].str.replace('prospect park', '', flags=re.IGNORECASE)
#dfsel['fulltext'] = dfsel['fulltext'].str.replace('prospectpark', '', flags=re.IGNORECASE)

In [None]:
# NLTK module

import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
from nltk.tokenize import TweetTokenizer

In [None]:
# Remove digits
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so '[0-9]+' would otherwise never match a digit.
dfsel_park['cleantext'] = dfsel_park['cleantext'].str.replace('[0-9]+', '', regex=True)
# all text to lower case
dfsel_park['cleantext'] = dfsel_park['cleantext'].str.lower()

In [None]:
def remove_punc(text):
    """Return ``text`` with everything except ASCII letters and whitespace removed.

    ASCII punctuation and digits are not letters or whitespace, so one
    character-class substitution drops them together with any other symbol
    or non-English character.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text)

In [None]:
dfsel_park['cleantext'] = dfsel_park['cleantext'].apply(lambda x: remove_punc(x))
#dfsel['hashtag_seg'] = dfsel['hashtag_seg'].apply(lambda x: remove_punc(x))

## Make n-gram with nltk package

In [None]:
sum(dfsel_park['cleantext'].str.contains('childrens zoo',flags=re.IGNORECASE))

In [None]:
#dfsel_park[dfsel_park['cleantext'].str.contains('bronx zoo holiday',flags=re.IGNORECASE)]['fulltext'].values

In [None]:
# Build bigram collocations from the whitespace-tokenised clean text and score them by PMI.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents([doc.split() for doc in dfsel_park['cleantext']])
# Frequency filter with a threshold of 5 (the value passed below)
finder.apply_freq_filter(5)
bigram_scores = finder.score_ngrams(bigram_measures.pmi)

In [None]:
# Build trigram collocations from the whitespace-tokenised clean text and score them by PMI.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.TrigramCollocationFinder.from_documents([doc.split() for doc in dfsel_park['cleantext']])
# Frequency filter with a threshold of 5 (the value passed below)
finder.apply_freq_filter(5)
trigram_scores = finder.score_ngrams(trigram_measures.pmi)

In [None]:
bigram_pmi = pd.DataFrame(bigram_scores)
bigram_pmi.columns = ['bigram', 'pmi']
bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)

trigram_pmi = pd.DataFrame(trigram_scores)
trigram_pmi.columns = ['trigram', 'pmi']
trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)

In [None]:
#bigram_pmi

In [None]:
stop_word_list = set(stopwords.words('english'))
# Filter for bigrams with only noun-type structures
def bigram_filter(bigram):
    """Return True when ``bigram`` (a pair of tokens) looks like a usable noun phrase.

    A bigram is rejected when
    * its first tag is neither JJ nor NN and its second tag is not NN,
    * either token is in ``stop_word_list``, or
    * one of its tokens is exactly 'n', 't' or 'PRON'.
    """
    tagged = nltk.pos_tag(bigram)
    first_pos, second_pos = tagged[0][1], tagged[1][1]

    noun_like = first_pos in ('JJ', 'NN') or second_pos == 'NN'
    has_stopword = bigram[0] in stop_word_list or bigram[1] in stop_word_list
    has_fragment = 'n' in bigram or 't' in bigram or 'PRON' in bigram

    return noun_like and not has_stopword and not has_fragment

# Filter for trigrams with only noun-type structures
def trigram_filter(trigram):
    """Return True when ``trigram`` (three tokens) looks like a usable noun phrase.

    A trigram is rejected when
    * neither its first nor its second tag is JJ or NN (the third tag is not inspected),
    * any of its three tokens is in ``stop_word_list``, or
    * one of its tokens is exactly 'n', 't' or 'PRON'.
    """
    tagged = nltk.pos_tag(trigram)
    head_tags = ('JJ', 'NN')

    noun_like = tagged[0][1] in head_tags or tagged[1][1] in head_tags
    has_stopword = (
        trigram[0] in stop_word_list
        or trigram[-1] in stop_word_list
        or trigram[1] in stop_word_list
    )
    has_fragment = 'n' in trigram or 't' in trigram or 'PRON' in trigram

    return noun_like and not has_stopword and not has_fragment

In [None]:
# Can set pmi threshold to whatever makes sense - eyeball through and select threshold where n-grams stop making sense
# choose top 500 ngrams in this case ranked by PMI that have noun like structures
# (rows must pass the POS/stop-word filter AND have pmi > 5; the frames were
# sorted by descending pmi in an earlier cell, so [:500] keeps the highest-scoring rows)
filtered_bigram = bigram_pmi[bigram_pmi.apply(lambda bigram:\
                                              bigram_filter(bigram['bigram'])\
                                              and bigram.pmi > 5, axis = 1)][:500]

filtered_trigram = trigram_pmi[trigram_pmi.apply(lambda trigram: \
                                                 trigram_filter(trigram['trigram'])\
                                                 and trigram.pmi > 5, axis = 1)][:500]


# Join the token tuples into space-separated strings, keeping a bigram when at
# least one of its tokens is longer than 2 characters.
bigrams = [' '.join(x) for x in filtered_bigram.bigram.values if len(x[0]) > 2 or len(x[1]) > 2]
# NOTE(review): `and` binds tighter than `or`, so this condition reads as
# len(x[0]) > 2 or (len(x[1]) > 2 and len(x[2]) > 2) - confirm that is the intended grouping.
trigrams = [' '.join(x) for x in filtered_trigram.trigram.values if len(x[0]) > 2 or len(x[1]) > 2 and len(x[2]) > 2]

## Curate ngram words

In [None]:
bigrams

In [None]:
#bigram_del = ['open air', 'cant wait', 'th stprospect', 'join us', 
#              'bandshell whats', 'th annual', 'ive seen', 'past saturday',
#              'july th', 'last night', 'th street', 'feel like',
#              'dont know', 'im gonna', 'last week', 'last weekend',
#              'look like', 'im going']
#
#bigrams = [e for e in bigrams if e not in bigram_del]

In [None]:
# Curated bigrams after word analyses

bigrams = [
 'nybg',
 'bronx park',
 'bronx river',
 'bronx zoo',
 'van cortlandt',
 'orchid show', # redundant
 'cherry blossoms',
 'brady playground',
 'pelham bay',
 'dinosaur safari',
 'living art',
 'sea lion',
 'last night',
 'happy birthday',
 'holiday light',
 'holiday train',
 'mothers day',
 'train show',
 #'orchid show', # redundant
 'family fun',
 'new york',
 'botanical garden',
 'rock garden',
 'rose garden',
 'york city',
 'east bronx'
]

# make these singular: q trains, water fountains, bike lanes, new years

In [None]:
bigrams

In [None]:
trigrams

In [None]:
# Curated trigrams after word analyses
# All entries must be lower case: the n-gram matching is a case-sensitive
# substring test against 'cleantext', which has been lower-cased earlier, so a
# capitalised entry could never match.
# NOTE(review): 'botanical garden' is also in the curated bigram list, and
# 'new york botanical garden' is a four-word phrase - confirm both belong here.
trigrams = [
 'summer end cityscape',
 'roberto burle mar',
 'rockefeller rose garden',
 'holiday train show',
 'bronx park east',
 'new york botanical garden',
 'botanical garden',
 'new york city',
 'ny botanical garden'
]

In [None]:
trigrams

In [None]:
# Create n-grams. updated on 0618.2021 
def create_ngram(x, trigram_list=None, bigram_list=None):
    """Return the curated n-grams found in text ``x`` as underscore-joined tokens.

    Parameters
    ----------
    x : str
        Text to search. Matching is a plain, case-sensitive substring test.
    trigram_list, bigram_list : list of str, optional
        Phrases to look for. When omitted, the notebook-level ``trigrams`` and
        ``bigrams`` lists are used, as before.

    Returns
    -------
    list of str
        First every matching trigram (in list order), then every matching
        bigram that is not itself contained in one of the matched trigrams, so
        e.g. 'train show' is not emitted next to 'holiday_train_show'.
    """
    if trigram_list is None:
        trigram_list = trigrams
    if bigram_list is None:
        bigram_list = bigrams

    matched_trigrams = [gram for gram in trigram_list if gram in x]
    ngram = ['_'.join(gram.split()) for gram in matched_trigrams]

    for gram in bigram_list:
        # Skip bigrams already covered by a longer phrase that matched.
        covered = any(gram in longer for longer in matched_trigrams)
        if not covered and gram in x:
            ngram.append('_'.join(gram.split()))
    return ngram

In [None]:
# Concatenate n-grams
def replace_ngram(x, trigram_list=None, bigram_list=None):
    """Return ``x`` with every curated n-gram phrase deleted.

    Trigrams are removed first, then bigrams; each phrase is replaced by the
    empty string, so the surrounding spaces are left in place.

    ``trigram_list`` / ``bigram_list`` default to the notebook-level
    ``trigrams`` and ``bigrams`` lists when omitted.
    """
    if trigram_list is None:
        trigram_list = trigrams
    if bigram_list is None:
        bigram_list = bigrams

    for phrases in (trigram_list, bigram_list):
        for gram in phrases:
            x = x.replace(gram, '')
    return x

In [None]:
#dfsel_park['ngrams'].values

## Move bigram/trigram to another column
## Then delete ngram words in tweets cleantext

In [None]:
dfsel_park['ngrams'] = dfsel_park['cleantext'].map(lambda x: create_ngram(x))
dfsel_park['cleantext'] = dfsel_park['cleantext'].map(lambda x: replace_ngram(x))

In [None]:
# Lemmatizer using nltk
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = TweetTokenizer()
def lemmatize_text(text):
    """Tokenise ``text`` with ``w_tokenizer`` and return the list of lemmatised tokens."""
    lemmas = []
    for tok in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(tok))
    return lemmas

dfsel_park['cleantext'] = dfsel_park['cleantext'].apply(lambda x: lemmatize_text(x))
#dfsel['hashtag_seg'] = dfsel['hashtag_seg'].apply(lambda x: lemmatize_text(x))

In [None]:
#remove non-English words
nltk.download('words')
words = set(nltk.corpus.words.words())
def remove_nonEnglish(token):
    """Filter a token list, dropping alphabetic tokens that are not in ``words``.

    Tokens that are not purely alphabetic are always kept; alphabetic tokens
    are kept only when their lower-cased form is in the ``words`` set.
    """
    kept = []
    for w in token:
        if not w.isalpha() or w.lower() in words:
            kept.append(w)
    return kept

In [None]:
dfsel_park['cleantext'] = dfsel_park['cleantext'].apply(lambda x: remove_nonEnglish(x))
#dfsel['hashtag_seg'] = dfsel['hashtag_seg'].apply(lambda x: remove_nonEnglish(x))

In [None]:
def removeSmallVocab(token):
    """Keep only the tokens whose length is between 3 and 15 characters inclusive."""
    return [w for w in token if 2 < len(w) < 16]

In [None]:
#dfsel_park['cleantext'] = dfsel_park['cleantext'].apply(lambda x: removeSmallVocab(x))

In [None]:
import spacy
nlp = spacy.load('en_core_web_lg')

# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@', 'en', 'de', 'wa', 'dont', 'cant', 's', 'nt']

# Final stop-word set = nltk English list + 'th' + spaCy defaults + custom list.
# The union starts from the nltk-based set: assigning the spaCy union on its own
# would throw away the nltk stop words and the manually added 'th'.
stop_words = set(stopwords.words('english'))
stop_words.add('th')
stop_words = stop_words.union(nlp.Defaults.stop_words, custom_stopwords)

dfsel_park['cleantext'] = dfsel_park['cleantext'].apply(lambda x: [t for t in x if t not in stop_words])
#dfsel['hashtag_seg'] = dfsel['hashtag_seg'].apply(lambda x: [t for t in x if t not in stop_words])

In [None]:
# Lemmatizer using spacy
dfsel_park['clean_text_trigrams'] = dfsel_park['cleantext'].copy()
allowed_tags=['NOUN', 'VERB']
nlp = spacy.load('en_core_web_lg')
dfsel_park['clean_text_trigrams'] = dfsel_park['clean_text_trigrams'].apply(lambda x: [token.lemma_ for token in nlp(' '.join(x)) if token.pos_ in allowed_tags])

In [None]:
# remove smallvocab
def removeSmallVocab(token):
    """Return the tokens of ``token`` that are longer than 2 and shorter than 16 characters."""
    kept = []
    for w in token:
        if 3 <= len(w) <= 15:
            kept.append(w)
    return kept

In [None]:
dfsel_park['clean_text_trigrams'] = dfsel_park['clean_text_trigrams'].apply(lambda x: removeSmallVocab(x))

## Merge single words and bigram/trigrams

In [None]:
dfsel_park['clean_text_trigrams'] = dfsel_park['clean_text_trigrams'] + dfsel_park['ngrams']

In [None]:
# update column that show number of words in fulltext 
dfsel_park['num_vocab'] = dfsel_park['clean_text_trigrams'].apply(lambda x: len(x) if x else 0)
# remove empty rows and words length less than 2
dfsel_park = dfsel_park[(dfsel_park['num_vocab']>=2)]

In [None]:
dfsel_park['clean_text_trigrams'].apply(lambda x: len(x)).describe()

In [None]:
tweet_ALL = " ".join(t for t in dfsel_park['clean_text_trigrams'].apply(lambda x: ' '.join(map(str, x))))

fig, ax = plt.subplots(1,1, figsize=(30,30))
# makes the circle using numpy
x, y = np.ogrid[:300, :300]
#mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
#mask = 255 * mask.astype(int)
cloud_mask = np.array(Image.open('shapes/cloud1.png'))
wc = WordCloud(max_font_size=200, max_words=100, background_color="white", 
              mask=cloud_mask, contour_width=3).generate(tweet_ALL)

ax.imshow(wc, interpolation='bilinear')

In [None]:
fdist = FreqDist([e for l in dfsel_park['ngrams'] for e in l])
fdist.most_common()
#len(fdist.keys())

In [None]:
#dfsel_park['hashtag_seg'] 
#fdist_hashtag = FreqDist([e for l in dfsel_park['hashtag'] for e in l])
#fdist_hashtag.most_common()

# Copy ready to use dataframe into a dataframe named dfsel_park_all

# Select regions of survey (NYC, by borough, by park)

In [None]:
# Cut low frequency words


freq_cut = 3 # np.ceil(dfsel_park.shape[0]*0.0001)
# Compute the frequency table here instead of relying on an `fdist` variable
# left over from an earlier display cell. That earlier table was built from the
# 'ngrams' column, so the same source is used to keep the results unchanged.
# NOTE(review): because only n-gram tokens are counted, single words are never
# added to cut_words - confirm whether the cut was meant to use the frequencies
# of all tokens in 'clean_text_trigrams' instead.
ngram_freq = FreqDist(e for l in dfsel_park['ngrams'] for e in l)
cut_words = [v for v, freq in ngram_freq.most_common() if freq<=freq_cut]
cut_word_set = set(cut_words)  # O(1) membership test inside the per-token filter
#dfboro['clean_text_trigrams_cutlowfreq'] = dfboro['clean_text_trigrams'].apply(lambda x: [e for e in x if e not in cut_words])        
dfsel_park['clean_text_trigrams_cutlowfreq'] = dfsel_park['clean_text_trigrams'].apply(lambda x: [e for e in x if e not in cut_word_set])

In [None]:
fdist = FreqDist([e for l in dfsel_park['clean_text_trigrams_cutlowfreq'] for e in l])
#fdist.most_common()
len(fdist.keys())

In [None]:
dfsel_park.shape

In [None]:
# update column that show number of words in fulltext 
dfsel_park['num_vocab'] = dfsel_park['clean_text_trigrams_cutlowfreq'].apply(lambda x: len(x) if x else 0)
# remove empty rows and words length less than 2
dfsel_park = dfsel_park[(dfsel_park['num_vocab']>=2)]

In [None]:
dfsel_park['clean_text_trigrams_cutlowfreq'].apply(lambda x: len(x)).describe()

In [None]:
dfsel_park['num_vocab'].value_counts()

In [None]:
dfsel_park.shape

In [None]:
#dfsel_park['clean_text_trigrams_cutlowfreq']

In [None]:
fdist = FreqDist([e for l in dfsel_park['clean_text_trigrams_cutlowfreq'] for e in l])
plt.hist(fdist.values(), bins=30, range=(0,30))

In [None]:
tweet_ALL = " ".join(t for t in dfsel_park['clean_text_trigrams_cutlowfreq'].apply(lambda x: ' '.join(map(str, x))))
fig, ax = plt.subplots(1,1, figsize=(30,10))
wc = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(tweet_ALL)
ax.imshow(wc, interpolation='bilinear')

In [None]:
fdist = FreqDist([e for l in dfsel_park['clean_text_trigrams_cutlowfreq'] for e in l])
fdist.most_common()
#len(fdist.keys())

In [None]:
dffreq = pd.DataFrame.from_records(fdist.most_common(), columns=['word', 'freq'])
dffreq.to_csv('../output/pre/Bronx/bronx_pre_wordfrequency.csv')

## Make decision on which subset of tweets for modeling

In [None]:
dfsel_park['inLong'] = dfsel_park['Longitude']!=0

In [None]:
dfsel_park[['inKeywords', 'inGeo', 'inLong']].value_counts()

In [None]:
# save current dfsel_park to a ori
dfsel_park_static = dfsel_park.copy(deep=True)


In [None]:
dfsel_park = dfsel_park_static[dfsel_park_static['inGeo']|dfsel_park_static['inKeywords']]
dfsel_park = dfsel_park.reset_index(drop=True)
dfsel_park.shape

In [None]:
 #dfsel_park_static['Longitude']

In [None]:
tweet_ALL = " ".join(t for t in dfsel_park['clean_text_trigrams_cutlowfreq'].apply(lambda x: ' '.join(map(str, x))))
fig, ax = plt.subplots(1,1, figsize=(30,10))
wc = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(tweet_ALL)
ax.imshow(wc, interpolation='bilinear')

In [None]:
fdist = FreqDist([e for l in dfsel_park['clean_text_trigrams_cutlowfreq'] for e in l])
len(fdist.most_common())


In [None]:
# Keep oritinal clean text final
dfsel_park['clean_text_trigrams_cutlowfreq_ori'] = dfsel_park['clean_text_trigrams_cutlowfreq']

In [None]:
remove_words = ['bronx_park','day','time','today','year','let','come','park',
                'find','take','way','thing',
                'new_york','new_york_city','york_city','yesterday','tomorrow','doe'] #add'new yor'bc it could result in not informative topics

dfsel_park['clean_text_trigrams_cutlowfreq'] = dfsel_park['clean_text_trigrams_cutlowfreq'].apply(lambda x: [e for e in x if e not in remove_words])


In [None]:
fdist = FreqDist([e for l in dfsel_park['clean_text_trigrams_cutlowfreq'] for e in l])

#dffreq = pd.DataFrame.from_records(fdist.most_common(), columns=['word', 'freq'])
#dffreq.to_csv('../../Tweet_TM_result/STTM_model/20210424_Prospect_final_wordFreq.csv')

In [None]:
# fdist = FreqDist([e for l in dfsel_park['hashtag_seg'] for e in l])
# fdist.most_common()

In [None]:
# update column that show number of words in fulltext 
dfsel_park['num_vocab'] = dfsel_park['clean_text_trigrams_cutlowfreq'].apply(lambda x: len(x) if x else 0)
# remove empty rows
dfsel_park = dfsel_park[(dfsel_park['num_vocab']>=2)]

In [None]:
dfsel_park['clean_text_trigrams_cutlowfreq'].apply(lambda x: len(x)).describe()

# Topic modeling

In [None]:
#Base and Cleaning 
import json
import requests
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
# NOTE(review): this rebinds the name `stopwords` to a plain set. Earlier cells
# use `stopwords` as the nltk corpus reader (`stopwords.words('english')`), so
# re-running any of those cells after this one fails until the nltk import cell
# is executed again.
stopwords = set(STOPWORDS)
# Silence all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")#, category=DeprecationWarning)

In [None]:
id2word = Dictionary(dfsel_park['clean_text_trigrams_cutlowfreq'])
print(len(id2word))
corpus = [id2word.doc2bow(d) for d in dfsel_park['clean_text_trigrams_cutlowfreq']]

In [None]:
import scipy.sparse as ss
from corextopic import corextopic as ct

# Topic modeling STTM

In [None]:
import sys

In [None]:
dfsel_park.shape

In [None]:
# sys.path.append('../../pyCode/gits/gsdmm/')
# sys.path.append('../../pyCode/gits/gsdmm/gsdmm/')

In [None]:
sys.path.append('../../chapter3/gits/gsdmm/')
sys.path.append('../../chapter3/gits/gsdmm/gsdmm/')

In [None]:
from gsdmm import MovieGroupProcess

In [None]:
# ks = list(range(100,650,50))
# kstable = []
# for k in ks:
#     mgp = MovieGroupProcess(K=k, alpha=0.1, beta=0.1, n_iters=20)
#     l_id2word = set(list(id2word.values()))
#     y = mgp.fit(dfsel_park['clean_text_trigrams_cutlowfreq'].to_list(), len(l_id2word))
#     with open('../output/pre/Bronx/bronx_kw_%s.model' % k, 'wb') as f:
#         pickle.dump(mgp, f)
#         f.close()
#     clus = (np.array(mgp.cluster_doc_count)>0).sum()
#     kstable.append({'k': k,
#                     'clusters': clus})

In [None]:
# `kstable` is only produced by the K-sweep loop in the previous cell, which is
# currently commented out. Guard the plot so a fresh "Run All" does not stop
# here with a NameError.
if 'kstable' in globals() and len(kstable) > 0:
    dfks = pd.DataFrame(kstable)
    fig, ax = plt.subplots(1,1, figsize=(6,4))
    ax.plot(dfks['k'], dfks['clusters'], '.-', markersize=10)
    ax.set_xlabel('K')
    ax.set_ylabel('Number of cluster found by GDSMM')
    plt.savefig('../output/pre/Bronx/bronx_pre_STTM_modeltest_0101',bbox_inches ="tight",pad_inches = 1)
else:
    print('kstable is not defined - run the K-sweep cell first to draw the cluster-count plot.')

In [None]:
# Fit the final model on the token lists and persist it with pickle.
# NOTE(review): no random seed is set anywhere above, so repeated runs will
# presumably give different cluster assignments - consider seeding before fit.
mgp = MovieGroupProcess(K=300,alpha=0.1, beta=0.1, n_iters=100) #updated parameters
# Distinct vocabulary terms; the size of this set is passed to fit() as the vocabulary size.
l_id2word = set(id2word.values())
y = mgp.fit(dfsel_park['clean_text_trigrams_cutlowfreq'].to_list(), len(l_id2word))
# The with-block closes the file on exit, so no explicit close() is needed.
with open('../output/pre/Bronx/bronx_pre_0101_k300', 'wb') as f:
    pickle.dump(mgp, f)

In [None]:
doc_count = np.array(mgp.cluster_doc_count)
nt = (doc_count>0).sum()
print('Number of topic that has tweets:', nt)
print('Number of documents per topic :', doc_count)
print('*'*20)
# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-nt:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)
# Show the top 5 words in term frequency for each cluster 
#top_words(mgp.cluster_word_distribution, top_index, 5)

In [None]:
(doc_count>0).sum()

In [None]:
from itertools import islice
def topic_words(cwd, ti, wc=5):
    """Collect the highest-valued words of the requested topics.

    Parameters
    ----------
    cwd : sequence (or mapping) of dict
        ``cwd[t]`` maps each word of topic ``t`` to a numeric value.
    ti : iterable
        Topic indices to report, in output order.
    wc : int, default 5
        Maximum number of words returned per topic. This argument used to be
        ignored in favour of a hard-coded 20.

    Returns
    -------
    list of dict
        One ``{'fulltext': word, 'topic_prob': value}`` record per word, topics
        in the order of ``ti`` and words by descending value (ties keep the
        dictionary's insertion order).
    """
    twords = []
    for t in ti:
        ranked = sorted(cwd[t].items(), key=lambda item: item[1], reverse=True)
        for word, value in ranked[:wc]:
            twords.append({'fulltext': word,
                           'topic_prob': value})
    return twords

In [None]:
# Ask the fitted model for the best cluster of every document and attach the
# (cluster label, score) pair to the tweet frame, row by row.
docs = dfsel_park['clean_text_trigrams_cutlowfreq'].tolist()

clu_assign = []
for doc in docs:
    best = mgp.choose_best_label(doc)
    clu_assign.append(dict(topic_number=best[0], topic_prob=best[1]))

dfclu = pd.DataFrame(clu_assign)
# Reset the tweet index so both frames line up positionally for the column-wise concat.
dfsel_ppp = pd.concat([dfsel_park.reset_index(drop=True), dfclu], axis=1)

In [None]:
with pd.ExcelWriter('../output/pre/Bronx/bronx_pre_sttmmodel_topic_0629.xlsx', engine='xlsxwriter') as writer:
    for k, g in dfsel_ppp.groupby('topic_number'):
        gtmp = g.sort_values(by='topic_prob', ascending=False).reset_index(drop=True)
        twords = topic_words(mgp.cluster_word_distribution, [k,], 10)
        dftw = pd.DataFrame(twords)
        #print(dftw.head())
        dftw = pd.concat([dftw, gtmp[['original_fulltext', 'topic_prob']]], axis=0)
        print(dftw.shape)
        dftw.to_excel(writer, sheet_name='Topic%s'%k)

In [None]:
dfsel_ppp.to_csv('../output/pre/Bronx/tweets_STTMresult_bronx_pre_0629.csv')

In [None]:
dfsel_ppp['topic_number'].value_counts()


In [None]:
# print wordcloud

tweet_ALL = " ".join([e for l in dfsel_ppp[dfsel_ppp['topic_number']==43]['clean_text_trigrams_cutlowfreq'] for e in l])
fig, ax = plt.subplots(1,1, figsize=(30,10))
wc = WordCloud(width=2400, height=1200, max_words=400, background_color="white").generate(tweet_ALL) #max_font_size=60
ax.imshow(wc, interpolation='none')
fig.savefig('test_wordcloud.png', dpi=500)

In [None]:
#pd.pivot_table(data=dfsel_ppp, index='topic_number', columns=['inKeywords', 'inGeo', 'inLong'], values='id', aggfunc='count')

In [None]:
#dfsel_ppp.columns

In [None]:
dfsel_ppp['categorize'] = np.random.choice(range(0,5), dfsel_ppp.shape[0])

In [None]:
# Interpret 'Date' as UTC and express it in US/Eastern.
# to_datetime(..., utc=True) accepts both naive and already tz-aware values, so
# unlike tz_localize('utc') this cell can be re-run without raising
# "Already tz-aware".
dfsel_ppp['Date'] = pd.to_datetime(dfsel_ppp['Date'], utc=True).dt.tz_convert('US/Eastern')

In [None]:
dfsel_ppp['categorize'].value_counts()