# Tweets analyzer.

#### This notebook analyzes tweets sent by individuals to a company. Please note:
##### 1. The main goal of this notebook is to analyze questions and reviews left by users for the company. Therefore, to get more relevant data, I have excluded tweets with attachments and only included tweets which are not replies to other tweets or retweets.

#### 2. The data has been scraped directly from Twitter using Octoparse 8. This data covers the period from 01/01/2021 to 31/12/2021.

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install spacy-langdetect
!pip install contractions
!pip install gensim
!pip install pyLDAvis

import pandas as pd
import regex as re
import plotly.express as px
import numpy as np
import datetime
import spacy
import nltk
import contractions
import matplotlib.pyplot as plt
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from gensim.models import CoherenceModel
from gensim.models import TfidfModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pyLDAvis.enable_notebook()

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)


2022-03-02 12:39:14.185939: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-03-02 12:39:14.186009: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Load the scraped tweets export into a DataFrame.
df = pd.read_csv('/content/kplc_twitter_2021.csv')

In [None]:
def convertCountToInt(count):
    """Convert a displayed engagement count into an integer.

    Accepts plain numbers (``12``, ``12.0``) as well as abbreviated or
    formatted strings such as ``"1.2K"``, ``"3m"`` or ``"1,234"``.

    Args:
        count: An int/float, or a string holding a number that may contain
            thousands separators and an optional ``k``/``m`` suffix
            (case-insensitive).

    Returns:
        The count as an ``int``. Blank strings are treated as 0.

    Raises:
        ValueError: If a string cannot be parsed as a number.
    """
    if not isinstance(count, str):
        return int(count)

    text = count.strip().replace(',', '').lower()
    if not text:
        return 0

    multiplier = 1
    if text[-1] == 'k':
        multiplier = 1000
    elif text[-1] == 'm':
        multiplier = 1000000
    if multiplier != 1:
        text = text[:-1]

    # round() rather than int(): float noise such as
    # 1.005 * 1000 == 1004.9999999999999 must not truncate to 1004.
    return int(round(float(text) * multiplier))

def cleanTweet1(tweet):
    """Return *tweet* with every newline character replaced by one space."""
    newline_matcher = re.compile(r'\n')
    return newline_matcher.sub(' ', tweet)

In [None]:
#converting comments, reshare and likes to int
# NOTE(review): fillna(0) is applied to the whole frame, not only to the count
# columns, so a missing value in any other column (e.g. the tweet text) also
# becomes 0.
df.fillna(0,inplace=True)

intColumns = ['comments','reshare','likes']

# Each count column is converted value by value with convertCountToInt.
for column in intColumns:
    df[column] = df[column].apply(convertCountToInt);
    


In [None]:
#we'll remove next line tags to cleanup the data
# cleanTweet1 swaps each newline for a space; the result goes into a new
# column so the raw tweet text is kept.
df['tweet_cleaned'] = df['tweet'].apply(cleanTweet1)

#checking whether there are any null columns
# NOTE(review): NaNs were already replaced with 0 by the earlier fillna(0),
# so this check can no longer reveal values that were missing in the CSV.
df.isnull().sum()

In [None]:
#we'll remove the name column since we don't need it for the analysis
df.drop(['Name'],axis=1,inplace = True)

# Preview 10 random rows; the fixed seed keeps the preview reproducible.
df.sample(10, random_state=10)

In [None]:
# Split the timestamp into analysis-friendly parts.
# The 'Time' column is parsed once; it is then overwritten with the
# time-of-day component, while month and day go into their own columns.
parsed_times = pd.DatetimeIndex(df['Time'])
df['Month'] = parsed_times.month
df['Day'] = parsed_times.day
df['Time'] = parsed_times.time

df.sample(10, random_state=7)

In [None]:


# Build the spaCy pipeline used for language detection.
# spaCy v3 removed the 'en' shortcut and only accepts pipeline components by
# their registered factory name, so the model is loaded by its full package
# name (the one downloaded at the top of the notebook) and the detector is
# registered as a factory before being added.
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector


def _create_language_detector(nlp, name):
    """Factory spaCy calls to instantiate the language detector component."""
    return LanguageDetector()


# Guarded so that re-running the cell does not register the factory twice.
if not Language.has_factory('language_detector'):
    Language.factory('language_detector', func=_create_language_detector)

LanguageClassifier = spacy.load('en_core_web_sm')
LanguageClassifier.add_pipe('language_detector', last=True)

In [None]:
#since we have tweets in both swahili and english, we will first categorize them into the respective language.
# Only the detector's 'sw' code is labelled Swahili; every other detected
# language code falls into the English bucket.
df['language'] = df['tweet'].apply(lambda tweet: 'Swahili' if LanguageClassifier(tweet)._.language['language']== 'sw' else 'English')

In [None]:
# Preview 10 random rows, now including the language column (fixed seed).
df.sample(10, random_state=7)

In [None]:
# Number of tweets per detected language (Kiswahili vs English).
language_counts = df['language'].value_counts()

fig = px.bar(
    df,
    y=language_counts.values,
    x=language_counts.index,
    title="Counts of tweets by language",
    labels={"x": "Language", "y": "Number of tweets"},
)
fig.show()


In [None]:
# Character length of every tweet, summarised as a box plot.

df['tweetLength'] = df['tweet'].apply(len)
fig = px.box(df, y='tweetLength', title="Average tweet length")
fig.show()

In [None]:
# Distribution of the tweets over the months of the year.

month_counts = df['Month'].value_counts()

fig = px.bar(
    df,
    y=month_counts.values,
    x=month_counts.index,
    title="Counts of tweets by Month",
    labels={"x": "Month", "y": "Number of tweets"},
)
fig.show()

In [None]:
# Distribution of the tweets by hour of the day.

# 'Time' holds datetime.time objects at this point, so .hour is available.
df['hour'] = df['Time'].apply(lambda time: time.hour)

hour_counts = df['hour'].value_counts()

fig = px.bar(df,
             y=hour_counts.values,
             x=hour_counts.index,
             # The title names the hour so it matches the x axis of this chart.
             title="Counts of tweets by Hour",
             labels={"x": "Hour", "y": "Number of tweets"},
             )
fig.show()

#### From the initial EDA we have the following info:
#### 1. The tweets are multilingual, with the larger share being in English and the smaller share in Kiswahili.
#### 2. The average length of a tweet is 148 characters which tells us that people don't write long tweets for this company
#### 3. The first quarter of the year saw relatively more tweets than the other quarters.
#### 4. Most people tweet in the morning hours (6am - 9am) and in the afternoon (2pm - 6pm)

#### We will proceed to analyze the English reviews since there are more of them than the Swahili ones and we also have more resources for English than for Swahili.

In [None]:
#We'll first of all get our subset of the English reviews

# .copy() makes df_english independent of df, so the in-place drop below
# leaves the original frame untouched.
df_english = df[df['language'] == 'English'].copy()
# Engagement counts, time of day and the language label are dropped from
# the English subset.
df_english.drop(['comments','reshare','likes','Time','language'], axis = 1, inplace= True)
df_english.sample(10, random_state=9)

In [None]:
#Let's now clean the data to do a more in-depth analysis

#removing contractions, special characters
def cleanTweet2(tweet):
  """Lower-case a tweet, expand contractions and strip special characters.

  Args:
    tweet: Tweet text to clean.

  Returns:
    The lower-cased text with contractions expanded word by word and every
    character that is neither a word character nor whitespace removed.
  """
  tweet = tweet.lower()
  # Splitting on whitespace and re-joining puts exactly one space between words.
  tweet = ' '.join(contractions.fix(word) for word in tweet.split())
  # Raw string: in a normal literal '\w' and '\s' are invalid escape
  # sequences. '\d' is omitted because '\w' already covers digits.
  tweet = re.sub(r'[^\w\s]+', '', tweet)
  return tweet


# Second cleaning pass, applied to the newline-free text of the English subset.
df_english['tweet_cleaned2'] = df_english['tweet_cleaned'].apply(cleanTweet2)

In [None]:
#iteratively going through the most common words to remove those that don't provide us with meaning.
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(dataframe, ngram, n=None):
    """Return the most frequent n-grams of a text collection.

    Args:
        dataframe: Iterable of text documents (a pandas Series of strings
            at every call site in this notebook).
        ngram: n-gram size, used as both bounds of ``ngram_range``.
        n: Number of rows to keep; ``None`` keeps every n-gram.

    Returns:
        A DataFrame with columns ``words`` and ``count``, sorted by count
        in descending order.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram))
    term_matrix = vectorizer.fit(dataframe).transform(dataframe)
    # 1 x vocabulary matrix holding the corpus-wide total of every n-gram.
    totals = term_matrix.sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n], columns=['words', 'count'])


In [None]:
#checking bigrams to see which stop words bring meaning to statements
# Shows the 50 most frequent bigrams before any stop-word removal.

fig =px.bar(get_top_n_words(df_english['tweet_cleaned2'],2,50),x='words',y='count')
fig.show()

In [None]:
# We'll remove the twitter mentions for Kenya Power (and greetings) since they
# skew the results, and take "no" out of the stop words since it carries
# meaning in our statements (e.g. "no power", "meter no").

from nltk.corpus import stopwords
stop=set(stopwords.words('english'))
stop.update(['kenyapower_care','kenyapower','kenyapoweralert','hey','hi','hello'])
# Without this, "no" would be filtered out together with the other English
# stop words; discard() is a no-op if the word is not in the set.
stop.discard('no')

In [None]:
# Remove stop words: tokenize each tweet, keep the tokens that are not in
# `stop`, and join the survivors back into a single space-separated string.
from nltk.tokenize import word_tokenize

df_english['tweet_cleaned2'] = df_english['tweet_cleaned2'].apply(
    lambda text: ' '.join(token for token in word_tokenize(text) if token not in stop)
)

# Re-plot the top 50 bigrams now that the stop words are gone.
fig = px.bar(get_top_n_words(df_english['tweet_cleaned2'], 2, 50), x='words', y='count')
fig.show()


In [None]:
# Top 50 trigrams of the stop-word-free text.
fig =px.bar(get_top_n_words(df_english['tweet_cleaned2'],3,50),x='words',y='count')
fig.show()

In [None]:
# Lemmatization, step 1: tokenize again and attach part-of-speech tags.
df_english['tweet_cleaned2'] = df_english['tweet_cleaned2'].apply(word_tokenize)

# Each token list is tagged with nltk.tag.pos_tag; the result is stored in
# its own column.
df_english['pos_tags'] = df_english['tweet_cleaned2'].apply(nltk.tag.pos_tag)

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet POS constant.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

# Pair every token with the WordNet POS constant derived from its tag.
df_english['wordnet_pos'] = df_english['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

In [None]:
from nltk.stem import WordNetLemmatizer

# Lemmatization, final step: lemmatize each token using the WordNet POS
# stored next to it.
wnl = WordNetLemmatizer()
df_english['lemmatized'] = df_english['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

In [None]:
# Join the lemma lists back into space-separated strings so the n-gram
# helper can consume them.
df_english['lemmatized_joined'] = [' '.join(map(str, l)) for l in df_english['lemmatized']]

# Top 50 bigrams after lemmatization.
fig =px.bar(get_top_n_words(df_english['lemmatized_joined'],2,50),x='words',y='count')
fig.show()

In [None]:
#we need to replace similar words which have been written differently

# Order matters: the single-word abbreviations are normalised first so that
# the "<word> no" expansions that follow also catch inputs such as "mtr no"
# or "ac no".
replacementKey = {
    'metre':' meter ',
    'mtr':' meter ',
    'acc': ' account ',
    'ac': ' account ',
    'meter no':' meter number ',
    'metre no':' meter number ',
    'acc no':' account number ',
    'account no':' account number '}

def replaceWord(tweet):
  """Normalise spelling variants of "meter"/"account (number)" in a tweet.

  Args:
    tweet: Space-separated, lower-cased tweet text.

  Returns:
    The text with every whole-word occurrence of a key of ``replacementKey``
    replaced by its canonical form. Words at the start or end of the text
    and back-to-back occurrences are handled as well.
  """
  for word,word2 in replacementKey.items():
    # \b matches at word boundaries without consuming the neighbouring
    # whitespace, so adjacent matches and matches at the string edges work.
    # The padding spaces of the replacement are stripped because the
    # original spacing is left in place.
    tweet = re.sub(r'\b' + re.escape(word) + r'\b', word2.strip(), tweet)
  return tweet

# Normalise the spelling variants in place, then re-plot the top 50 bigrams.
df_english['lemmatized_joined'] = df_english['lemmatized_joined'].apply(replaceWord)

fig =px.bar(get_top_n_words(df_english['lemmatized_joined'],2,50),x='words',y='count')
fig.show()

In [None]:
#let's see a word cloud of the most common terms
from wordcloud import WordCloud, ImageColorGenerator

# Concatenate every lemmatized tweet into one string for the cloud.
text = df_english['lemmatized_joined'].tolist() 


text = ' '.join(text)


wordcloud = WordCloud(collocations=True,width=800, height=400).generate(text)

# Render the generated cloud as an image without axes.
plt.figure( figsize=(20,10) )
plt.imshow(wordcloud, interpolation='bilInear')
plt.axis('off')
plt.show()

### From the word cloud we realize that some of the words are repeated many times, which is worth investigating to see whether it is a sign of dirty data.

In [None]:
#checking the most common words
# Top 50 single words of the lemmatized text.

fig =px.bar(get_top_n_words(df_english['lemmatized_joined'],1,50),x='words',y='count')
fig.show()

In [None]:
# Drop repetitive tweets.
# For each of the 150 most common words: when the tweets containing the word
# come from fewer than 10% distinct handles, keep only the first such tweet
# per handle and mark the rest for removal.
mostCommon = get_top_n_words(df_english['lemmatized_joined'],1,150)
indexes = []
for word in mostCommon['words']:
  # Plain substring test; regex=False so the word is never read as a pattern.
  sentences = df_english[df_english['lemmatized_joined'].str.contains(word, regex=False)]
  if sentences.empty:
    # Nothing matched: skip, which also avoids dividing by zero below.
    continue
  if(sentences['handle'].nunique()/len(sentences) * 100 < 10):
    # duplicated() flags every row after a handle's first occurrence.
    # Handles are compared for exact equality, so '@bob' is not confused
    # with '@bobby', and the frame is not rescanned once per user.
    repeats = sentences['handle'].duplicated()
    indexes.extend(sentences.index[repeats].tolist())
# De-duplicate the collected labels while keeping their order.
indexes = list(dict.fromkeys(indexes))
df_english.drop(indexes,inplace=True)


In [None]:
# Top 50 single words again, now that the repetitive tweets were dropped.
fig =px.bar(get_top_n_words(df_english['lemmatized_joined'],1,50),x='words',y='count')
fig.show()

In [None]:
# Rebuild the word cloud from the de-duplicated tweets.
text = df_english['lemmatized_joined'].tolist() 


text = ' '.join(text)


wordcloud = WordCloud(collocations=True,width=800, height=400).generate(text)

plt.figure( figsize=(20,10) )
plt.imshow(wordcloud, interpolation='bilInear')
plt.axis('off')
plt.show()

### Let's do some topic modeling on this data to check whether we can categorize the tweets.

### We will start by trying LSA technique to see if we get sensible topics

In [None]:
# TF-IDF features over uni- to tri-grams, capped at 100 features.
vectorizer = TfidfVectorizer(
    max_df=0.8,
    ngram_range=(1,3),
    stop_words='english',
    max_features=100
    )

# Document-term matrix of the lemmatized English tweets.
vectors = vectorizer.fit_transform(df_english['lemmatized_joined'])

In [None]:
# Two-component truncated SVD (LSA) fitted on the TF-IDF matrix.
# NOTE(review): no random_state is set, so results may differ between runs -
# confirm whether reproducibility matters here.
lsa = TruncatedSVD(n_components=2,n_iter=1000)

lsa.fit(vectors)

In [None]:
# Print the 15 highest-weighted vocabulary terms of every LSA component.
vocabulary = vectorizer.get_feature_names_out()

for topic_number, weights in enumerate(lsa.components_):
    ranked_terms = sorted(zip(vocabulary, weights), key=lambda pair: pair[1], reverse=True)
    print(f"Topic {topic_number}")
    for term, _weight in ranked_terms[:15]:
        print(term)
    print(" ")

In [None]:
# Cluster the TF-IDF vectors into two groups.
# NOTE(review): no random_state is set, so cluster assignment may differ
# between runs.
cluster_model = KMeans(n_clusters=2, max_iter=100,n_init=4)

cluster_model.fit(vectors)

clusters = cluster_model.predict(vectors)

# Project the (densified) TF-IDF vectors onto two principal components
# for plotting.
pca = PCA(n_components=2)

scatter_plots = pca.fit_transform(vectors.toarray())

# Legend labels indexed by cluster id; with n_clusters=2 only the first two
# entries are used.
colors = ["topic 1","topic 2","topic 3"]

x_axis = [o[0] for o in scatter_plots]
y_axis = [o[1] for o in scatter_plots]

fig = px.scatter(x= x_axis, y = y_axis, color = [colors[d] for d in clusters])
fig.show()

In [None]:
# argsort is ascending, so reversing each row ranks the vocabulary ids of
# every cluster centre from highest to lowest weight.
clustersProb = cluster_model.cluster_centers_.argsort()[:,::-1]

# Print the 15 top-ranked terms of the first two clusters.
for index_cluster, ranked_term_ids in enumerate(clustersProb[:2]):
    print(f"Topic {index_cluster}")
    for ind in ranked_term_ids[:15]:
        print(vocabulary[ind])
    print(" ")

### From LSA we can clearly tell that we have 2 main categories for tweet queries:

1. Power outage.

2. Token Enquiries.

### Lets now do an analysis using LDA to see if we are going to get more insightful data. 

In [None]:

# Tokenize the lemmatized tweets for gensim.
textTokenized = df_english['lemmatized_joined'].apply(word_tokenize)
# Build the bigram model (only bigrams are modelled in this cell)
bigram = gensim.models.Phrases(textTokenized, min_count=5, threshold=100) 

# Faster way to get a sentence clubbed as a bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)

def make_bigrams(texts):
    """Run every tokenized document in ``texts`` through ``bigram_mod``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

# Apply the bigram model to every tokenized tweet.
textP = make_bigrams(textTokenized)

# Token <-> id mapping built from the bigram-merged documents.
id2word = corpora.Dictionary(textP)

# Bag-of-words encoding: one list of (token id, count) pairs per document.
corpus = [id2word.doc2bow(text) for text in textP]

tfidf = TfidfModel(corpus,id2word)

# Filter every bag of words by TF-IDF weight.
# NOTE(review): despite its name, max_df is used below as a minimum TF-IDF
# score, not as a document frequency.
max_df = 0.03
words = []
words_missing = []
for i in range (0, len(corpus)):
  bow = corpus[i]
  low_value_words = []
  tfidf_ids = [id for id, value in tfidf[bow]]
  bow_ids = [id for id,value in bow]
  # ids whose TF-IDF weight in this document is below the threshold
  low_value_words = [id for id, value in tfidf[bow] if value < max_df]
  # NOTE(review): words_missing still holds the previous document's value
  # here (it is recomputed below); this only affects the `words` log, which
  # is not read again in this part of the notebook.
  drops = low_value_words + words_missing
  for item in drops:
    words.append(id2word[item])
  # ids present in the bag of words but absent from the TF-IDF output
  words_missing = [id for id in bow_ids if id not in tfidf_ids]
  # keep only the (id, count) pairs that were not flagged above
  new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing]
  corpus[i] = new_bow


In [None]:
# Train a two-topic LDA model on the TF-IDF-filtered corpus; random_state is
# fixed at 100.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=2, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Prepare the pyLDAvis view of the fitted topics; leaving `vis` as the last
# expression displays it (pyLDAvis.enable_notebook() was called at import time).
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus,id2word,mds="mmds", R=30)
vis

In [None]:
# Compute Perplexity
# log_perplexity returns gensim's per-word likelihood bound, a log-scale
# value (perplexity = 2 ** -bound), so values closer to zero are better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score
# The reference texts must use the same tokenization the dictionary was built
# from (textP, which contains the merged bigrams). With the un-merged
# textTokenized, bigram topic terms would never occur in the reference texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=textP, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)