In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import gensim
import re
import nltk
from tqdm import tqdm

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
# Configure Bokeh so that its figures are rendered inline in the notebook.
output_notebook()

In [None]:
# Read the raw text file, trimming trailing whitespace from every line.
with open('volvo_nlp.txt') as file:
    lines = list(map(str.rstrip, file))

# Keep only the non-empty lines.
lines2 = list(filter(None, lines))

import re

# Replace the mis-decoded apostrophe sequence with a plain apostrophe.
lines3 = [re.sub('â€™', '\'', raw_line) for raw_line in lines2]

# Show the cleaned lines as the cell output.
lines3

In [None]:
# One-column frame holding the cleaned lines, one row per line.
tweets_df = pd.DataFrame({'Text': pd.Series(lines3)})
tweets_df.shape

In [None]:
# pip install clean_tokenizer

In [None]:
import os

In [None]:
# List the entries of the current working directory (presumably a sanity check
# that the input file and the helper folders are reachable from here).
os.listdir()

In [None]:
# Import the custom module used to tokenize and clean the tweet dataset.
# The path below is relative, so it resolves against the kernel's current
# working directory; the import fails if the notebook is started elsewhere.
import sys
sys.path.insert(0, 'topic_modelling/notebooks/')
from clean_tokenizer import tokenize_tweets, clean_tweet



### Frequency of tweets daily

### Preprocessing: Dictionary and tokenizing

In [None]:
import nltk
nltk.download('wordnet')

# Make sure every entry is a plain string before any text processing.
tweets_df['Text'] = tweets_df['Text'].apply(str)

# Expand the abbreviation "govt" wherever it occurs as a whole word.
# Series.replace with a plain dict only matches cells whose ENTIRE value equals
# the key, so it never touched a tweet that merely contained the word; a
# word-bounded regex substitution performs the intended in-text expansion.
tweets_df['Text'] = (
    tweets_df['Text']
    .str.replace(r'\bgovt\b', 'government', regex=True)
    .str.replace(r'\bGovt\b', 'Government', regex=True)
)

# Clean every tweet with the project helper.
# According to the notes kept with this notebook, clean_tweet will:
# - remove retweet and @user information
# - remove web links
# - remove hashtags
# - remove audio/video tags or labels
# - lower case the tweet
# - strip punctuation
# - remove double spacing
# - remove numbers
# - apply lemmatization and tokenization (within the lemmatize function, stop
#   words are removed and words with 3 or fewer characters are dropped)
# - form bigrams
tweets_df['clean_text_new'] = tweets_df['Text'].apply(clean_tweet, bigrams=True)

# Split the cleaned text into tokens. str.split() with no argument splits on
# runs of whitespace and never yields empty strings, whereas splitting on a
# single whitespace character produced '' tokens for empty or double-spaced text.
tweets_df['clean_tokens'] = tweets_df['clean_text_new'].apply(lambda text: text.split())

# Additional custom stopwords to remove on top of what clean_tweet already drops.
stop = ["pm","t.co","http","https","amp","t","t.c","c","rt", "pl", "s", "p", "like", "im","new", "day", "days","year", "ur", "ve", "la", "ive", "cos", "guys",
        "didnt", "time", "people", "dont", "today", "thing", "week", "months", "post","yesterday", "man", "wont", "uk",
        "st", "lets", "don", "feel", "gonna","isnt", "pls", "share", "wait", "wanna", "na", "back", "means",
        "lah","due", "sa", "ingat", "just", "will", "can", "now", "get", "go", "us",
        "can", "one", "even", "just", "ada", "ke", "got", "going", "last", "etc", "kaypo", "still", "say", "know",
        "situation", "need", "want", "take", "come", "look",
        "think", 'actually', 'especially', 'later', 'guess', 'note', 'dear', 'road', 'start', 'stop', 'things', 'give',
        'try', 'tell', 'shit', 'maybe', 'keep', 'right']

# `stop` stays a list for any other cell that uses it; the set is only a faster
# membership lookup for the filter below.
stop_lookup = set(stop)
tweets_df['clean_tokens_final'] = tweets_df['clean_tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_lookup])
tweets_df.head()

In [None]:
# Flatten the per-tweet token lists into one long list of terms.
all_words = []
for token_list in tweets_df['clean_tokens_final']:
    all_words.extend(token_list)

# Frequency distribution over all terms.
fdist = nltk.FreqDist(all_words)

# Number of unique terms.
len(fdist)

In [None]:
#create dataframe of terms with their respective frequency
# Turn the term -> count mapping into a two-column table sorted by frequency.
# Note: this rebinds `fdist` from the frequency distribution to a DataFrame, so
# the cell that builds the distribution must be re-run before re-running this one.
term_counts = pd.DataFrame.from_dict(fdist, orient='index')
term_counts = term_counts.reset_index()
term_counts.columns = ['word', 'freq']
fdist = term_counts.sort_values(by='freq', ascending=False)

In [None]:
# Keep a term only if it is longer than one character, is not made up purely of
# digits, and appears at least 2 times.
words = fdist['word']
longer_than_one = words.str.len() > 1
not_a_number = ~words.str.isdigit()
frequent_enough = fdist['freq'] >= 2

fdist = fdist[longer_than_one & not_a_number & frequent_enough]

In [None]:
# Preview the most frequent terms that survived the filters.
fdist.head()

In [None]:
# Display the filtered term-frequency table.
fdist

In [None]:
# Number of terms kept in the vocabulary after filtering.
len(fdist)

In [None]:
# Vocabulary lookup: every retained term maps to itself, in the order of the
# frequency table. Built in one pass over the column instead of one positional
# row lookup per term.
dict_word = {word: word for word in fdist['word']}

# Set copy of the custom stopword list for constant-time membership tests.
stop_lookup = set(stop)

# Keep only tokens that are in the vocabulary and are not custom stopwords.
# Applying on the single column always yields a Series of lists, which a
# row-wise DataFrame.apply does not guarantee, and one pass covers both the
# vocabulary mapping and the stopword removal.
tweets_df['cleaned'] = tweets_df['clean_tokens_final'].apply(
    lambda tokens: [dict_word[token] for token in tokens
                    if token in dict_word and token not in stop_lookup])

# Hashtag analysis

In [None]:
def get_hashtags(text):
    """Return the hashtags found in ``text``.

    The text is split on whitespace and every resulting token that begins
    with '#' is returned, in order of appearance. Tokens are returned as-is,
    including the leading '#' and any trailing punctuation.
    """
    return [token for token in text.split() if token.startswith('#')]

In [None]:
# Extract the hashtags of every tweet from its raw text.
texts = list(tweets_df['Text'])
tweets_df['hashtags'] = list(map(get_hashtags, texts))

In [None]:
from collections import Counter

# Count every hashtag occurrence across all tweets.
hashtag_counts = Counter(tag for tags in tweets_df['hashtags'] for tag in tags)

# Build the table with explicit column names so that a corpus without any
# hashtag yields an empty table instead of failing with a column-length mismatch.
all_hashtags = pd.DataFrame(list(hashtag_counts.items()), columns=['hashtag', 'num'])

# Share of each hashtag among all hashtag occurrences, in percent.
all_hashtags['sum'] = all_hashtags['num'].sum()
all_hashtags['pct'] = 100 * all_hashtags['num'] / all_hashtags['sum']

# Ten most used hashtags.
all_hashtags = all_hashtags.sort_values('pct', ascending=False).head(10)
all_hashtags

In [None]:
# Display the working frame, which by now also carries the 'hashtags' column.
tweets_df

In [None]:
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Build the gensim dictionary and bag-of-words corpus from the cleaned tokens.
texts = list(tweets_df['cleaned'])

id2word = gensim.corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))

# Candidate topic counts: 5, 7, ..., 29. The best one is chosen from the
# coherence scores collected below.
num_topics = np.arange(5, 30 + 1, 2)
coherences = []
models = []

# Settings shared by every LDA run; only the number of topics varies.
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    random_state=100,
    chunksize=6000,
    passes=20,
    alpha='auto',
    eta='auto',
    eval_every=None,
)

for num_topic in num_topics:
    lda_model = gensim.models.ldamodel.LdaModel(num_topics=num_topic, **lda_settings)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=id2word, coherence='c_v')
    # Keep the score and the fitted model at matching positions.
    coherences.append(coherence_model_lda.get_coherence())
    models.append(lda_model)

In [None]:
# Plot the coherence score obtained for each candidate number of topics.
sns.reset_orig()
sns.set(font_scale=2)

plotData = pd.DataFrame({
    'Number of topics': num_topics,
    'Coherence score': coherences,
})

f, ax = plt.subplots(figsize=(16, 10))
sns.pointplot(x='Number of topics', y='Coherence score', data=plotData, ax=ax)
ax.set_title('Topic Coherence')

In [None]:
import os

In [None]:
# Import module from gsdmm repository
import sys
sys.path.insert(0, 'topic_modelling/gsdmm/')
from gsdmm import MovieGroupProcess

import pickle
import os

# Documents as lists of cleaned tokens, plus the size of their joint vocabulary.
docs = tweets_df['cleaned'].tolist()
vocab = {term for doc in docs for term in doc}
n_terms = len(vocab)

In [None]:
import random

# Seed both random number generators before fitting. Seeding only the
# standard-library generator is not enough if the sampler draws from NumPy's
# global generator, as the upstream GSDMM implementation does via
# numpy.random.multinomial — TODO confirm against the vendored gsdmm copy.
random.seed(1000)
np.random.seed(1000)

# Fit GSDMM with at most K=21 clusters; `y` receives whatever fit() returns
# (presumably one cluster label per document — verify against gsdmm).
mgp = MovieGroupProcess(K=21, alpha=0.1, beta=0.1, n_iters=40)
y = mgp.fit(docs, n_terms)

In [None]:
# Save GSDMM model as pickle file

import pickle

# The context manager closes the file even if pickling raises part-way through.
with open("submit_model7.pkl", "wb") as filehandler:
    pickle.dump(mgp, filehandler)

In [None]:
# Load GSDMM model from saved pickle file

import pickle

# NOTE: unpickling executes code embedded in the file; only load pickle files
# produced by this notebook or another trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('submit_model7.pkl', 'rb') as f:
    mgp = pickle.load(f)

In [None]:
# Print the ten most frequent words of every cluster in the loaded model.
import operator

# Iterate over the model's own per-cluster word counts instead of a hard-coded
# cluster count, so the loop stays correct if K is changed when fitting.
# Assumes cluster_word_distribution holds one word -> count mapping per cluster.
for i, word_counts in enumerate(mgp.cluster_word_distribution):
    print('Cluster ' + str(i))
    print(sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)[:10])
    print('*' * 15)

In [None]:
# Total of the per-cluster document counts (presumably equals the number of
# documents the model was fitted on — verify).
sum(mgp.cluster_doc_count)

In [None]:
# Label every document with the first element returned by choose_best_label
# (used as the zero-based topic id below).
topics = [mgp.choose_best_label(doc)[0] for doc in docs]

tweets_df['sttm_topic_from_zero'] = topics

In [None]:
# Shift the topic ids by +1 so that they start from 1 instead of 0.
tweets_df['sttm_topic'] = tweets_df['sttm_topic_from_zero'] + 1

In [None]:
# Preview the frame with the newly added topic columns.
tweets_df.head()

In [None]:
# (rows, columns) of the working frame.
tweets_df.shape

In [None]:
# Per-topic table: one row per topic, with the non-null count of every column.
sttm_dist = tweets_df.groupby('sttm_topic').count().reset_index()

# Number of tweets assigned to each topic. The share of tweets per topic must be
# computed from these counts; using the positional row index (0, 1, 2, ...)
# produces percentages that still add up to 100 but reflect only row position,
# not how many tweets each topic has.
topic_sizes = tweets_df.groupby('sttm_topic').size()
sttm_dist['num'] = sttm_dist['sttm_topic'].map(topic_sizes)

sttm_dist['sum'] = sttm_dist['num'].sum()
sttm_dist['pct'] = 100 * sttm_dist['num'] / sttm_dist['sum']

In [None]:
# Bar chart of the percentage of tweets per GSDMM topic.
# NOTE: this import rebinds the name `figure`, hiding the Bokeh `figure`
# imported at the top of the notebook.
from matplotlib.pyplot import figure
figure(figsize=(30, 15), dpi=80, facecolor='w', edgecolor='k')
plt.rcParams['font.size'] = 25

topic_ids = sttm_dist['sttm_topic']
plt.bar(topic_ids, sttm_dist['pct'], align='center', alpha=0.5, color='#66023C');
plt.xlabel('Topic');
# One tick per topic id, from the smallest to the largest id.
plt.xticks(np.arange(topic_ids.min(), topic_ids.max() + 1, 1));
plt.ylabel('Percentage of Tweets');
plt.title('Distribution of Topics in the Tweets (GSDMM)');

In [None]:
# Topics with the largest 'pct' values first.
sttm_dist.sort_values(by=['pct'], ascending=False).head()


In [None]:
# Sum of the 'pct' column; expected to be 100 for a percentage breakdown.
sttm_dist['pct'].sum()


In [None]:
# Tweets assigned to topic 20 (one-based topic id).
in_topic_20 = tweets_df['sttm_topic'].eq(20)
df_filtered = tweets_df.loc[in_topic_20]
df_filtered