# Sentiment Analysis on Covid-19 vaccines in Ireland
## Analysis on tweets
### Import Libraries

In [145]:
# Data Manipulation
import pandas as pd
import numpy as np
import statsmodels.api as sm
from dateutil.relativedelta import relativedelta

# Visualization
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Own modules
from layer_data_access import tweet_data

In [146]:
def count_length(tweet):
    """Return how many characters ``tweet`` has once plain spaces are removed.

    Only the ASCII space character is discarded; tabs, newlines and any
    other whitespace still count towards the result.
    """
    pieces = tweet.split(' ')
    return len(''.join(pieces))

def show_tweets_sentiment(tweets_df, date_df, cond, title, rule = 'D'):
    """Draw the positive / negative / neutral tweet counts over time as three lines.

    Parameters
    ----------
    tweets_df, date_df, cond, rule
        Passed straight through to ``get_distribution_labels``, which returns
        the overall frame followed by one frame per sentiment label.
    title : str
        Text appended to the ``"Sentiment over time "`` figure title.
    rule : str, default 'D'
        With ``'D'`` the initial x-range is the six months before the latest
        date, a range slider is shown and an invisible trace carrying the
        ``description`` column is added so it appears in the unified hover box.
        With any other value the whole date range is shown, without slider or
        description trace.

    Notes
    -----
    Reads the module-level ``my_color`` list (``rgba(r, g, b`` prefixes) and
    completes each entry with an alpha value. The figure is displayed with
    ``fig.show()``; nothing is returned.
    """
    overall_df, positive_df, negative_df, neutral_df = get_distribution_labels(
        tweets_df, date_df, cond, rule
    )

    # Daily plots get the six-month window, the slider and the description trace.
    daily = rule == 'D'
    max_display = overall_df.created_at.max()
    if daily:
        min_display = overall_df.created_at.max() - relativedelta(months=6)
    else:
        min_display = overall_df.created_at.min()

    fig = go.Figure(
        layout=go.Layout(
            dragmode='pan',
            xaxis={'range': [min_display, max_display]},
        )
    )

    # (legend name, per-label frame, complete rgba colour string)
    sentiment_traces = (
        ('Positive', positive_df, my_color[2] + ',0.7)'),
        ('Negative', negative_df, my_color[1] + ',0.8)'),
        ('Neutral', neutral_df, my_color[0] + ',0.9)'),
    )
    for trace_name, frame, rgba in sentiment_traces:
        fig.add_trace(
            go.Scatter(
                x=frame.created_at,
                y=frame.tweet_id,
                mode='lines',
                name=trace_name,
                line={'color': rgba},
            )
        )

    if daily:
        # mode='none' draws nothing; the trace exists only to feed the hover text.
        fig.add_trace(
            go.Scatter(
                x=overall_df.created_at,
                y=overall_df.tweet_id,
                mode='none',
                name="",
                text=overall_df['description'],
                hovertemplate='Event: ' + '<b>%{text}</b>',
            )
        )

    title_cfg = {
        'text': "Sentiment over time " + title,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    legend_cfg = {
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "right",
        'x': 1,
    }
    fig.update_layout(
        template='plotly_white',
        hovermode="x unified",
        title=title_cfg,
        legend=legend_cfg,
        xaxis_title="",
        yaxis_title="Tweets",
    )

    fig.update_xaxes(rangeslider_visible=daily)
    fig.show()


def pie_chart(tweets_df, cond, title):
    """Show the share of each sentiment label as a pie chart.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Tweets frame; forwarded to ``get_distribution``.
    cond : boolean mask or None
        Optional row filter; forwarded to ``get_distribution``.
    title : str
        Text appended to the ``"Sentiment distribution "`` figure title.

    Notes
    -----
    Colours are tied to the label *value* rather than to the slice position,
    so a subset that lacks one of the labels still shows negative in red,
    neutral in yellow and positive in green. The figure is displayed with
    ``fig.show()``; nothing is returned.
    """
    # Same three colours as before, now keyed by label instead of by position.
    label_colors = {
        'negative': '#CE1B28',
        'neutral': '#E1C027',
        'positive': '#0CCE49',
    }

    df = get_distribution(tweets_df, cond)
    fig = px.pie(
        df,
        values='tweet_id',
        names='label',
        color='label',
        color_discrete_map=label_colors,
    )

    fig.update_traces(textinfo='label+percent',
                    textfont_size=15,
                    hovertemplate = None,
                    hoverinfo='skip',
                    showlegend = False,
                    )

    fig.update_layout(template = 'plotly_white',
                    title={
                            'text': "Sentiment distribution " + title,
                            'x':0.5,
                            'xanchor': 'center',
                            'yanchor': 'top'
                            },
                    )

    fig.show()


def split_text(a_string, n):
    """Cut ``a_string`` into consecutive ``n``-character pieces joined by ``<br>``.

    Each piece is stripped of surrounding whitespace before joining. The cut
    positions are fixed offsets (0, n, 2n, ...), so a cut can land inside a
    word. An empty string yields an empty string.
    """
    starts = range(0, len(a_string), n)
    return "<br>".join(a_string[start:start + n].strip() for start in starts)


def number_tweets(tweets_df, date_df, title, rule = 'D'):
    """Plot the number of tweets per period as a single line.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Needs the columns ``created_at`` and ``tweet_id``.
    date_df : pandas.DataFrame
        Needs the columns ``created_at`` and ``description``; only used when
        ``rule == 'D'``.
    title : str
        Second line of the figure title.
    rule : str, default 'D'
        ``'D'`` counts tweets per calendar date, attaches the description of
        the same date to the hover text, opens on the last six months and
        shows a range slider. Any other value is handed to
        ``DataFrame.resample`` and plotted over the full range.

    Notes
    -----
    Neither input frame is modified: all date conversions happen on local
    copies. The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Work on a private two-column copy so the caller's ``created_at`` dtype
    # is never changed behind their back.
    counts_src = tweets_df[['created_at', 'tweet_id']].copy()
    counts_src['created_at'] = pd.to_datetime(counts_src['created_at'])

    if rule == 'D':
        counts_src['created_at'] = counts_src['created_at'].dt.date
        daily_counts = counts_src.groupby('created_at').count().reset_index()

        # One description per date (the last one wins), wrapped for the hover box.
        events = date_df.copy()
        events['created_at'] = pd.to_datetime(events['created_at']).dt.date
        events = events.drop_duplicates(subset=['created_at'], keep='last').copy()
        events['description'] = events['description'].apply(split_text, n=50)

        tweets_time_df = pd.merge(daily_counts, events, on='created_at', how='left')
        tweets_time_df['description'] = tweets_time_df['description'].fillna("No found")

        max_display = tweets_time_df.created_at.max()
        min_display = max_display - relativedelta(months=6)

        layout = go.Layout(
            dragmode='pan',
            xaxis=dict(
                range=[min_display, max_display]
            )
        )
        fig = go.Figure(layout = layout)
        fig.add_trace(go.Scatter(x=tweets_time_df.created_at, y=tweets_time_df.tweet_id,
                            mode='lines',
                            name = "",
                            text = tweets_time_df['description'],
                            hovertemplate = 'Total: %{y}<br>Event: '+ '<b>%{text}</b>'))
        fig.update_xaxes(rangeslider_visible=True)

    else:
        tweets_time_df = (
            counts_src
            .set_index('created_at')
            .resample(rule)
            .count()
            .reset_index()
        )
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=tweets_time_df.created_at, y=tweets_time_df.tweet_id,
                            mode='lines',
                            name = ""))
        fig.update_xaxes(rangeslider_visible=False)

    fig.update_layout(
        template = 'plotly_white',
        hovermode="x unified",
        title={
            'text': "Number of tweets over the time<br>" + title,
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        legend={
                'orientation':"h",
                'yanchor':"bottom",
                'y': 1.02,
                'xanchor':"right",
                'x':1},
        xaxis_title="",
        yaxis_title="Tweets",)

    fig.show()

def seven_days(tweets_df, cond, year = 2021):
    """Print the weekly per-label tweet counts for the first and last week of ``year``.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Needs the columns ``created_at``, ``label`` and ``tweet_id``.
    cond : boolean mask or None
        Optional row filter applied to ``tweets_df`` before counting.
    year : int, default 2021
        Calendar year whose first and last weekly rows are printed. Weeks are
        the bins produced by ``resample('W')``; a row belongs to ``year`` when
        the bin's ``created_at`` stamp falls in that year.

    Notes
    -----
    Both rows are taken from the ``year`` subset, so the "Last week" row can
    never come from a later year. When the data holds no week in ``year`` a
    short notice is printed instead of raising ``IndexError``.
    """
    source = tweets_df if cond is None else tweets_df[cond]

    # Tweets per (date, label), then one column per label on a weekly grid.
    series = (
        source[['created_at', 'label', 'tweet_id']]
        .groupby(['created_at', 'label'])
        .count()
        .reset_index()
    )
    series['created_at'] = pd.to_datetime(series['created_at'])
    series = (
        series
        .set_index('created_at')
        .pivot(columns='label', values='tweet_id')
        .fillna(0)
        .resample(rule='W')
        .sum()
        .reset_index()
    )

    in_year = series[series.created_at.dt.year == year]
    if in_year.empty:
        print(f"No weekly data for {year}")
        return

    print(f"First week of {year}:")
    print(in_year.iloc[0, :])

    print(f"\nLast week of {year}:")
    print(in_year.iloc[-1, :])

def retunr_tweet(d):
    """Return ``d`` unchanged when it contains no comma, otherwise an empty string."""
    pieces = d.split(',')
    return '' if len(pieces) != 1 else d


def get_distribution(tweets_df, cond):
    """Count tweets per sentiment label.

    ``cond`` is an optional boolean mask applied to ``tweets_df`` first; pass
    ``None`` to use every row. The result is a new frame with the columns
    ``label`` and ``tweet_id``, where ``tweet_id`` holds the number of
    non-null tweet ids for that label.
    """
    selected = tweets_df if cond is None else tweets_df[cond]
    per_label = selected[['label', 'tweet_id']].groupby(['label']).count()
    return per_label.reset_index()


def get_distribution_labels(tweets_df, date_df, cond, rule = 'D'):
    """Build the per-period tweet counts, overall and per sentiment label.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Needs the columns ``created_at``, ``label`` and ``tweet_id``.
    date_df : pandas.DataFrame
        Needs the columns ``created_at`` and ``description``; only used when
        ``rule == 'D'``. It is not modified.
    cond : boolean mask or None
        Optional row filter applied to ``tweets_df``.
    rule : str, default 'D'
        ``'D'`` keeps the per-date counts and attaches the description of the
        same date to the overall frame (``"No found"`` where there is none).
        Any other value is handed to ``DataFrame.resample`` and the counts are
        summed per bin.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(overall, positive, negative, neutral)``. Every frame has the columns
        ``created_at`` and ``tweet_id`` (the tweet count); for ``rule == 'D'``
        the overall frame also carries the columns of ``date_df``.
    """
    selected = tweets_df if cond is None else tweets_df[cond]

    # Number of tweets for every (date, label) pair.
    df = (
        selected[['created_at', 'label', 'tweet_id']]
        .groupby(['created_at', 'label'])
        .count()
        .reset_index()
    )
    per_label = {
        label: df.loc[df.label == label, ['created_at', 'tweet_id']].copy()
        for label in ('positive', 'negative', 'neutral')
    }

    if rule == 'D':
        # Normalise to calendar dates *before* de-duplicating, so two rows for
        # the same day with different times cannot both survive and duplicate
        # rows in the merge below.
        events = date_df.copy()
        events['created_at'] = pd.to_datetime(events['created_at']).dt.date
        events = events.drop_duplicates(subset=['created_at'], keep='last').copy()
        events['description'] = events['description'].apply(split_text, n=50)

        # ``df`` already holds counts per label, so the daily total is their
        # sum; counting the rows would only give the number of labels seen
        # that day (1 to 3).
        totals = df[['created_at', 'tweet_id']].groupby('created_at').sum().reset_index()
        totals['created_at'] = pd.to_datetime(totals['created_at']).dt.date

        tweets_time_df = pd.merge(totals, events, on='created_at', how='left')
        tweets_time_df['description'] = tweets_time_df['description'].fillna("No found")

    else:
        def _resample_counts(frame):
            # Sum the counts per ``rule`` bin on a proper DatetimeIndex.
            stamped = frame.assign(created_at=pd.to_datetime(frame['created_at']))
            return stamped.set_index('created_at').resample(rule).sum().reset_index()

        # Leave the string ``label`` column out of the overall resample: summing
        # it is meaningless and its handling differs between pandas versions.
        tweets_time_df = _resample_counts(df[['created_at', 'tweet_id']])
        per_label = {label: _resample_counts(frame) for label, frame in per_label.items()}

    return tweets_time_df, per_label['positive'], per_label['negative'], per_label['neutral']


In [147]:
# Global variables
# Unterminated ``rgba(r, g, b`` prefixes: show_tweets_sentiment appends
# ``,<alpha>)`` to each one to build the line colour.
# Index 0 is used for the Neutral trace, 1 for Negative and 2 for Positive.
my_color = ['rgba(225, 192, 39', 'rgba(206, 27, 40', 'rgba(12, 206, 73']

In [148]:
# Load the tweets frame and the dated-descriptions frame through the project's
# data-access layer (presumably a remote store - verify in layer_data_access).
access_data = tweet_data.tweet_data_remote()
tweets_df = access_data.get_tweets()
date_df = access_data.get_dates()
# The accessor is not needed once both frames are in memory.
del access_data

In [149]:
# (rows, columns) of the tweets frame as loaded
tweets_df.shape

(155950, 11)

In [150]:
# Give the dates frame the column names the plotting helpers expect
date_df.columns = ['id', 'created_at', 'description']

# Remove time from created_at (keep the calendar date only)
tweets_df.created_at = pd.to_datetime(tweets_df.created_at).dt.date
date_df.created_at = pd.to_datetime(date_df.created_at).dt.date

# Keep only the rows flagged active == 1.
# NOTE(review): presumably inactive rows are the tweets left with no characters,
# which would otherwise be labelled neutral for lack of sentiment words - confirm
# how the `active` flag is set upstream.
cond = tweets_df['active'] == 1 # True for the rows that are kept
tweets_df = tweets_df[cond].copy()

# Preview both frames
display(tweets_df.head(3))
display(date_df.head(3))

Unnamed: 0,tweet_id,created_at,label_id,label,author,conversation_id,batch_name,keywords,keywords_pharma,tweet_type,active
0,1216683812287057920,2020-01-13,2,negative,other,1216683812287057920,from_gov,"vaccinated, virus",,post,1
1,1216683996068876289,2020-01-13,2,negative,hpscireland,1216683996068876289,from_gov,virus,,retweeted,1
2,1217877514149879808,2020-01-16,1,positive,other,1217877514149879808,covid_vaccine,,,quoted,1


Unnamed: 0,id,created_at,description
0,1,2020-01-27,The National Public Health Emergency Team (NPH...
1,2,2020-02-05,The Coronavirus Expert Advisory Group—a subgro...
2,3,2020-02-29,The first confirmed case in the Republic of Ir...


In [151]:
# (rows, columns) after keeping only the active tweets
tweets_df.shape

(141886, 11)

In [152]:
# Daily tweet counts on a gap-free calendar index (days without tweets become 0),
# then the seasonal decomposition of that series
series = (
    tweets_df[['created_at', 'tweet_id']]
    .groupby(['created_at'])
    .count()
    .reset_index()
    .assign(created_at=lambda counts: pd.to_datetime(counts.created_at))
    .set_index('created_at')
    .asfreq(freq='D', how='end', fill_value=0)
)

decomposed = sm.tsa.seasonal_decompose(series.tweet_id)

In [153]:
# Zoom in on the seasonal component from July 2021 onwards
seasonal_tail = decomposed.seasonal.loc['2021-07':]
fig = px.line(
    seasonal_tail,
    title='Daily COVID-19 vaccines tweets - seasonal component',
)
fig.show()

In [154]:
# Tweet volume at daily, weekly, monthly and quarterly resolution
# (same call each time, only the title and the resample rule change).
volume_views = [
    ("Daily tweets", 'D'),
    ("Weekly tweets", 'W'),
    ("Monthly tweets", 'M'),
    ("Quarterly tweets", 'Q'),
]
for volume_title, volume_rule in volume_views:
    number_tweets(tweets_df, date_df, volume_title, volume_rule)

In [166]:
# Sentiment lines for all tweets at daily, weekly, monthly and quarterly resolution
sentiment_views = [
    ("<br>Daily tweets", 'D'),
    ("<br>Weekly tweets", 'W'),
    ("<br>Monthly tweets", 'M'),
    ("<br>Quarterly tweets", 'Q'),
]
for sentiment_title, sentiment_rule in sentiment_views:
    show_tweets_sentiment(tweets_df, date_df, None, sentiment_title, rule = sentiment_rule)

# pie chart
pie_chart(tweets_df, None, "")

# Seven days
seven_days(tweets_df, None)

First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                     1454
neutral                      1316
positive                     1755
Name: 50, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                     1854
neutral                      1229
positive                     1301
Name: 82, dtype: object


In [167]:
# Media accounts: rows whose batch name begins with "from_media"
cond = tweets_df['batch_name'].str.startswith('from_media')
show_tweets_sentiment(
    tweets_df, date_df, cond,
    "<br>Weekly tweets from media usernames",
    rule='W',
)
pie_chart(tweets_df, cond, "<br>Tweets from media usernames")

# First and last weekly counts for this subset, then its size
seven_days(tweets_df, cond)
print(f"total: {cond.sum()}")

First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                       12
neutral                        47
positive                       20
Name: 49, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                       37
neutral                        42
positive                       27
Name: 81, dtype: object
total: 3779


In [168]:
# Government accounts: rows whose batch name begins with "from_gov"
cond = tweets_df['batch_name'].str.startswith('from_gov')
show_tweets_sentiment(
    tweets_df, date_df, cond,
    "<br>Weekly tweets from government usernames",
    rule='W',
)
pie_chart(tweets_df, cond, "<br>Tweets from government usernames")

# First and last weekly counts for this subset, then its size
seven_days(tweets_df, cond)

print(f"total: {cond.sum()}")

First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                        1
neutral                         5
positive                        9
Name: 50, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                       16
neutral                        20
positive                       24
Name: 82, dtype: object
total: 1109


In [169]:
# Political-party accounts: rows whose batch name begins with "from_politic"
cond = tweets_df.batch_name.str.startswith('from_politic')
show_tweets_sentiment(tweets_df, date_df, cond, "<br>Weekly tweets from political party usernames", rule = 'W')
pie_chart(tweets_df, cond, "<br>Tweets from political party usernames")

# Seven days
seven_days(tweets_df, cond)

print("total:", cond.sum())

First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                        1
neutral                         3
positive                        4
Name: 42, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                        2
neutral                         3
positive                        2
Name: 74, dtype: object
total: 339


In [170]:
# Health-department accounts: rows whose batch name begins with "from_health"
cond = tweets_df.batch_name.str.startswith('from_health')
show_tweets_sentiment(tweets_df, date_df, cond, "<br>Weekly tweets from health department usernames", rule = 'W')
pie_chart(tweets_df, cond, "<br>Tweets from health department usernames")

# Seven days
seven_days(tweets_df, cond)
print("total:", cond.sum())

First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                        0
neutral                         0
positive                        1
Name: 35, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                        5
neutral                         1
positive                        4
Name: 67, dtype: object
total: 70


In [179]:
# AstraZeneca (also matched through its Vaxzevria / Oxford / AZD1222 names)
# na=False: rows without pharma keywords count as "no match", so every mask is
# plain boolean instead of relying on how `|` / `&` treat missing values.
cond1 = tweets_df.keywords_pharma.str.contains('vaxzevria', na=False)
cond2 = tweets_df.keywords_pharma.str.contains('oxford', na=False)
cond3 = tweets_df.keywords_pharma.str.contains('astra', na=False)
cond4 = tweets_df.keywords_pharma.str.contains('azd1222', na=False)
# Conversation starters: the tweet id equals its conversation id
cond5 = tweets_df.tweet_id == tweets_df.conversation_id

# Every tweet belonging to a conversation whose starter mentions the vaccine
cond = tweets_df.conversation_id.isin(tweets_df[(cond1 | cond2 | cond3 | cond4) & cond5].conversation_id.unique())

print("Total tweets", cond.sum())

show_tweets_sentiment(tweets_df, date_df, cond, "<br>Weekly tweets about AstraZeneca vaccine", rule = 'W')
pie_chart(tweets_df, cond, "<br>Tweets about AstraZeneca vaccine")

# Seven days
seven_days(tweets_df, cond)

Total tweets 7354


First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                        9
neutral                         6
positive                       35
Name: 42, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                        3
neutral                         0
positive                        2
Name: 74, dtype: object


In [183]:
# Pfizer (also matched through its BioNTech / BNT162 / Comirnaty names)
# na=False: rows without pharma keywords count as "no match", so every mask is
# plain boolean instead of relying on how `|` / `&` treat missing values.
cond1 = tweets_df.keywords_pharma.str.contains('pfizer', na=False)
cond2 = tweets_df.keywords_pharma.str.contains('biontech', na=False)
cond3 = tweets_df.keywords_pharma.str.contains('bnt162', na=False)
cond4 = tweets_df.keywords_pharma.str.contains('comirnaty', na=False)
# Conversation starters: the tweet id equals its conversation id
cond5 = tweets_df.tweet_id == tweets_df.conversation_id
# Every tweet belonging to a conversation whose starter mentions the vaccine
cond = tweets_df.conversation_id.isin(tweets_df[(cond1 | cond2 | cond3 | cond4) & cond5].conversation_id.unique())
print("Total tweets", cond.sum())

show_tweets_sentiment(tweets_df, date_df, cond, "<br>Weekly tweets about Pfizer vaccine", rule = 'W')
pie_chart(tweets_df, cond, "<br>Tweets about Pfizer vaccine")

# Seven days
seven_days(tweets_df, cond)

Total tweets 4671


First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                       49
neutral                        58
positive                       64
Name: 41, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                       13
neutral                         9
positive                       14
Name: 73, dtype: object


In [184]:
# Moderna
# na=False: rows without pharma keywords count as "no match", so the mask is
# plain boolean instead of relying on how `&` treats missing values.
cond1 = tweets_df.keywords_pharma.str.contains('moderna', na=False)
# Conversation starters: the tweet id equals its conversation id
cond2 = tweets_df.tweet_id == tweets_df.conversation_id
# Every tweet belonging to a conversation whose starter mentions the vaccine
cond = tweets_df.conversation_id.isin(tweets_df[cond1 & cond2].conversation_id.unique())
print("Total tweets", cond.sum())

show_tweets_sentiment(tweets_df, date_df, cond, "<br>Weekly tweets about Moderna vaccine", rule = 'W')
pie_chart(tweets_df, cond, "<br>Tweets about Moderna vaccine")

# Seven days
seven_days(tweets_df, cond)

Total tweets 1361


First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                        9
neutral                         9
positive                       12
Name: 39, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                        1
neutral                         3
positive                        2
Name: 71, dtype: object


In [182]:
# Johnson & Johnson (also matched through its Janssen / J&J names)
# na=False: rows without pharma keywords count as "no match", so every mask is
# plain boolean instead of relying on how `|` / `&` treat missing values.
cond1 = tweets_df.keywords_pharma.str.contains('johnson', na=False)
cond2 = tweets_df.keywords_pharma.str.contains('janssen', na=False)
cond3 = tweets_df.keywords_pharma.str.contains('j&j', na=False)
# Conversation starters: the tweet id equals its conversation id
cond5 = tweets_df.tweet_id == tweets_df.conversation_id
# Every tweet belonging to a conversation whose starter mentions the vaccine
cond = tweets_df.conversation_id.isin(tweets_df[(cond1 | cond2 | cond3) & cond5].conversation_id.unique())
print("Total tweets", cond.sum())

show_tweets_sentiment(tweets_df, date_df, cond, "<br>Weekly tweets about J&J vaccine", rule = 'W')
pie_chart(tweets_df, cond, "<br>Tweets about J&J vaccine")

# Seven days
seven_days(tweets_df, cond)

Total tweets 3644


First week of 2021:
label
created_at    2021-01-03 00:00:00
negative                        1
neutral                         1
positive                        2
Name: 47, dtype: object

Last week of 2021:
label
created_at    2021-08-15 00:00:00
negative                        4
neutral                         1
positive                        5
Name: 79, dtype: object
