In [None]:
import pandas as pd
import numpy as np 
from textblob import TextBlob
from wordcloud import WordCloud
import re
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat
from matplotlib import gridspec

In [None]:
# Read the four tweet CSV files into one DataFrame each.
hc, bo, jb, kh = (
    pd.read_csv(csv_name)
    for csv_name in (
        'dataHC6_tweets.csv',
        'dataBO3_tweets.csv',
        'dataJB3_tweets.csv',
        'dataKH2_tweets.csv',
    )
)

In [None]:
# Remove fully duplicated rows from each source frame (duplicates are only
# detected within a frame, not across frames).
hc, bo, jb, kh = (frame.drop_duplicates() for frame in (hc, bo, jb, kh))

In [None]:
# Stack the four source frames into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# source frame contributes its own 0-based labels, so row labels repeat and
# label-based lookups / printed row numbers become ambiguous.
# pd.concat already returns a new object, so no extra .copy() is needed.
df = pd.concat([hc, bo, jb, kh], ignore_index=True)
df.head(5)

In [None]:
# Number of rows in the combined table.
df.shape[0]

In [None]:
def cleanTxt(text):
    """Strip Twitter markup from a tweet and return the remaining text.

    Removes, in this order: @mentions, '#' characters (the hashtag word itself
    is kept), the retweet marker 'RT' plus the whitespace after it, and
    http/https links. Surrounding whitespace is left untouched.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the markup listed above removed.
    """
    # Handles may contain underscores; without '_' in the class a mention such
    # as '@john_doe' would leave '_doe' behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # \b keeps the pattern from eating the tail of ordinary words that end in
    # 'RT' (e.g. 'SMART people' must not become 'SMApeople').
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)

    return text

In [None]:
def cleanNum(num):
    """Convert a count written as text into a float.

    Thousands separators (',') are removed. If the text contains the '萬'
    marker, its first occurrence is removed and the remaining number is
    multiplied by 10000.

    Parameters
    ----------
    num : str
        Textual count, e.g. '1,234' or '1.5萬'.

    Returns
    -------
    float
        The numeric value; text that float() cannot parse raises ValueError.
    """
    without_commas = re.sub(',', '', num)
    if '萬' not in without_commas:
        return float(without_commas)
    # Only the first marker is dropped, so a second one still fails in float().
    mantissa = without_commas.replace('萬', '', 1)
    return float(float(mantissa) * 10000)

In [None]:
def cleanDate(datentime):
    """Return the part of a timestamp string that precedes the first 'T'.

    If the string contains no 'T', it is returned unchanged.
    """
    date_part, _, _ = datentime.partition("T")
    return date_part

In [None]:
# Tweet text with mentions, '#', retweet markers and links removed.
df['Tweet_clean'] = df['Tweet'].astype(str).apply(cleanTxt)


def _count_or_zero(raw):
    """Parse one raw engagement count, mapping a missing cell to 0.0."""
    # The missing-value check must happen BEFORE the str() cast: once NaN has
    # been turned into the text 'nan', a later .fillna(0) has nothing to fill.
    return 0.0 if pd.isna(raw) else cleanNum(str(raw))


# Same treatment for all three engagement columns.
for _col in ('Comments', 'Likes', 'Retweets'):
    df[_col + '_clean'] = df[_col].map(_count_or_zero)

# Keep only the date part of the timestamp and convert it to datetime64.
df['date_clean'] = pd.to_datetime(df['Timestamp'].apply(cleanDate))

In [None]:
# Replace any remaining NaN engagement counts with 0.
for count_col in ('Comments_clean', 'Likes_clean', 'Retweets_clean'):
    df[count_col] = df[count_col].fillna(0)

In [None]:
def getSubjectivity(text):
    """Return the TextBlob sentiment subjectivity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def getPolarity(text):
    """Return the TextBlob sentiment polarity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Score every cleaned tweet; each score is stored in its own column.
cleaned_tweets = df['Tweet_clean']
df['Subjectivity'] = cleaned_tweets.map(getSubjectivity)
df['Polarity'] = cleaned_tweets.map(getPolarity)

In [None]:
# Preview the first ten rows, now including the cleaned and sentiment columns.
df.iloc[:10]

In [None]:
# Plot a word cloud of all cleaned tweets.
# Tweets are joined with a space: joining on the empty string would glue the
# last word of one tweet to the first word of the next and create bogus tokens.
allWords = ' '.join(df['Tweet_clean'].astype(str))
wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=120).generate(allWords)

plt.imshow(wordCloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Map a polarity score to a negative / neutral / positive label.

def getAnalysis(score):
    """Return 'negative' for score < 0, 'neutral' for score == 0, else 'positive'."""
    if score < 0:
        label = 'negative'
    elif score == 0:
        label = 'neutral'
    else:
        # Everything that is neither below nor equal to zero lands here.
        label = 'positive'
    return label

# Label every tweet from its polarity score.
df['Analysis'] = df['Polarity'].map(getAnalysis)

In [None]:
# Preview the first five rows with the new 'Analysis' column.
df.head()

In [None]:
# Show the raw text of the ten tweets with the lowest polarity.
most_negative = df.sort_values(by='Polarity')['Tweet'].head(10)
print(most_negative)

In [None]:
# NOTE(review): this prints the same ten lowest-polarity raw tweets as the
# preceding cell -- confirm whether a different column or ordering was intended.
print(df.sort_values(by='Polarity').head(10)['Tweet'])

In [None]:
# Cleaned text of the ten tweets with the highest polarity.
df.sort_values(by='Polarity', ascending=False)['Tweet_clean'].head(10)

In [None]:
# plt.figure(figsize=(8,6))
# plt.scatter(x=df['Polarity'], y=df['Subjectivity'])
# plt.title('Sentiment Analysis')
# plt.xlabel('Polarity')
# plt.ylabel('Subjectivity')


In [None]:
# Report the share of tweets carrying each sentiment label.
# value_counts(normalize=True) computes all three shares in one pass and copes
# with an empty frame (no division by zero).
sentiment_pct = df['Analysis'].value_counts(normalize=True) * 100
for label in ('positive', 'neutral', 'negative'):
    # .get(...) falls back to 0.0 for a label that never occurs in the data.
    print(f'Percentage of {label} tweets: {round(sentiment_pct.get(label, 0.0), 3)} %')


In [None]:

# 2 x 3 grid: a short top row of box plots above a taller row of density plots,
# one column per engagement metric (likes, retweets, comments).
fig = plt.figure(constrained_layout=True, figsize=(10, 5))
widths = [1, 1, 1]
heights = [1, 3]
spec5 = fig.add_gridspec(
    ncols=3,
    nrows=2,
    width_ratios=widths,
    height_ratios=heights,
)

# Top row: box plots with outliers hidden. Each plot is drawn on the axes
# created for its grid slot, passed explicitly via ax=.
ax0 = sns.boxplot(x='Likes_clean', data=df, showfliers=False,
                  ax=fig.add_subplot(spec5[0, 0]))
ax1 = sns.boxplot(x='Retweets_clean', data=df, showfliers=False,
                  ax=fig.add_subplot(spec5[0, 1]))
ax2 = sns.boxplot(x='Comments_clean', data=df, showfliers=False,
                  ax=fig.add_subplot(spec5[0, 2]))

# Bottom row: kernel density estimates of the same three columns.
ax3 = sns.kdeplot(x='Likes_clean', data=df, ax=fig.add_subplot(spec5[1, 0]))
ax4 = sns.kdeplot(x='Retweets_clean', data=df, ax=fig.add_subplot(spec5[1, 1]))
ax5 = sns.kdeplot(x='Comments_clean', data=df, ax=fig.add_subplot(spec5[1, 2]))


Testing whether the mean numbers of Likes, Retweets and Comments are the same for positive and non-positive tweets

In [None]:
# Split the tweets into positive and non-positive (neutral + negative) groups
# so their mean likes, retweets and comments can be compared.

is_positive = df['Analysis'] == 'positive'
pos_t = df.loc[is_positive].copy()
nonpos_t = df.loc[~is_positive].copy()

In [None]:
# Two-sample t-test per engagement metric: positive vs. non-positive tweets.
# Each result is printed explicitly; a notebook cell only displays its last
# expression, so bare calls would silently drop the first two results.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance test);
# consider equal_var=False if the two groups' variances differ -- confirm.
for metric in ('Likes_clean', 'Retweets_clean', 'Comments_clean'):
    result = stat.ttest_ind(pos_t[metric], nonpos_t[metric])
    print(f'{metric}: t = {result.statistic:.4f}, p-value = {result.pvalue:.4g}')

In [None]:
# Print the group means side by side for each engagement metric.
# The column name is derived from the label, so the "Likes" line can no longer
# report the Retweets means by mistake.
for metric in ('Likes', 'Retweets', 'Comments'):
    column = metric + '_clean'
    print(
        f"Comparing the mean of {metric}: Positive tweets: ", pos_t[column].mean(),
        "Non positive tweets: ", nonpos_t[column].mean(), "\n",
    )