In [1]:
import pandas as pd
import ijson
import glob
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_theme(style="ticks", color_codes=True)

In [64]:
def read_json(path):
    data ={"tweet_id":[], "time":[],"text":[],"city":[],"coordinates":[],"react_count":[],"quote_count":[],"reply_count":[],
           "retweet_count":[],"favorite_count":[]}

    read_files = glob.glob(path)
    for file in read_files:
        with open(str(file), 'r') as f:
            objects = ijson.items(f, 'results.item')
            for obj in objects:
                for row in obj:
                    if row['text'].startswith("RT") == False:
                        data['tweet_id'].append(row['id'])
                        data['time'].append(row['created_at'])
                        if "extended_tweet" in row:
                            data['text'].append(row["extended_tweet"]['full_text'])
                        else:
                            data['text'].append(row["text"])
                        data['city'].append(row['place']['name'])
                        data['coordinates'].append(row['coordinates'])
                        data['quote_count'].append(row['quote_count'])
                        data['reply_count'].append(row['reply_count'])
                        data['retweet_count'].append(row['retweet_count'])
                        data['favorite_count'].append(row['favorite_count'])
                        data['react_count'].append(row['quote_count']+row['reply_count']+row['retweet_count']+row['favorite_count'])
    data = pd.DataFrame(data)
    data['time'] = pd.to_datetime(data['time']).dt.date
    return data

In [73]:
from SentimentAnalysis import *
from TextAnalytics import *
from pyspark.sql.types import *

def to_word(text):
    return text.split(" ")

strip_non_ascii_udf = udf(strip_non_ascii, StringType())
fix_abbreviation_udf = udf(fix_abbreviation, StringType())
remove_features_udf = udf(remove_features, StringType())
sentiment_analysis_udf = udf(sentiment_analysis , FloatType())
sentiment_udf = udf(lambda x: condition(x), StringType())

check_blanks_udf = udf(check_blanks, StringType())
check_lang_udf = udf(check_lang, StringType())
remove_stops_udf = udf(remove_stops, StringType())
tag_and_remove_udf = udf(tag_and_remove, StringType())
lemmatize_udf = udf(lemmatize, StringType())
to_word_udf = udf(to_word,ArrayType(StringType()))

In [66]:
def spark_df(data):
    spark = SparkSession.builder.appName('NLP').getOrCreate()
    df = spark.createDataFrame(data)
    return df

In [72]:
def top_react_tweet(df):
    pd = df.select('react_count','text',month('time').alias('month')).toPandas()
    return pd
top_react_tweet(df)

Unnamed: 0,react_count,text,month
0,1,"@PhillipAdams_1 As my golfer Dad would say, ‘ ...",6
1,0,@DavidLeyonhjelm @JoshFrydenberg @political_al...,6
2,3,@in_a_knot @rrogerramjet @SallyRMelb @peacockp...,6
3,0,Here is post no. 6 to second part of the What...,6
4,0,@Len_hayes what a legend... next one is indoor...,6
...,...,...,...
14995,1,@jordanlperkins I think those four words are t...,3
14996,0,@JTanganga99 Happy birthday mate! Keep playing...,3
14997,3,@mattyperic @DamianTBerry @ziggylisk Chicken S...,3
14998,2,"@_DovahGaming_ @Mythical Where's this ""biologi...",3


In [5]:
def sentiment_analysis(df):
    
    df = df.withColumn('text_non_asci',strip_non_ascii_udf(df['text']))
    df = df.withColumn('fixed_abbrev',fix_abbreviation_udf(df['text_non_asci']))
    df = df.withColumn('removed',remove_features_udf(df['fixed_abbrev']))
    df  = df.withColumn("sentiment_score", sentiment_analysis_udf( df['removed'] ))
    df  = df.withColumn("sentiment", sentiment_udf( df['sentiment_score'] ))
    return df

In [6]:
# SA_results = df.select('text','time','sentiment_score','sentiment')

In [7]:
import itertools
def covid_SA_date(df):
    covid_relate = df.withColumn('covid_content',when((col("removed").like('%covid%') | col("removed").like('%coronavirus%')),"Covid-related").otherwise("Not covid-related"))
    # filterWords_byMonth = filterWords.select('text','time',month('time').alias('month'),'sentiment_score','sentiment')
    covid_relate_date = covid_relate.select('text','time',month('time').alias('month'),'sentiment','covid_content')
    return covid_relate_date

def covid_SA_month(covid_relate_date):
    covid_relate  = covid_relate_date.groupBy('month','sentiment','covid_content').count().orderBy('month','covid_content')
    return covid_relate

In [8]:
def pd_summary(covid_relate_date,covid_relate):
    summary = covid_relate.toPandas()
    freq = covid_relate_date.groupBy('month','covid_content').count().orderBy('month','covid_content').toPandas()
    summary['total']=list(itertools.chain.from_iterable(itertools.repeat(x, 3) for x in freq['count']))
    summary['percentage'] = summary['count']/summary['total']*100
    summary['percentage'] = summary['percentage'].round(decimals=1)
    summary['month'] = summary['month'].replace({3:"March",5:"May",6:"June",8:"August",11:"November"})
    return summary

In [9]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [10]:
def covid_charts(summary, month):
    data = summary[summary['month']==month]
    covid = data[data['covid_content']=='Covid-related']
    title = "Tweets from "+month+" 2020"
    
    layout = go.Layout(
                    legend=dict(
                    orientation="h")
                    )
    
    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[0.4, 0.6],
        row_heights=[0.4],
        specs=[[{"type": "pie"}, {"type": "bar"}]],
        subplot_titles=("Tweet topics","Sentiment of tweets related to Covid-19"),
        horizontal_spacing = .2)
    
    fig.add_trace(
        go.Pie(labels=data['covid_content'], values=data['count'],hole=.3,texttemplate = "%{label}: %{value} <br>(%{percent})", textposition='outside'),
        row=1, col=1
    )
    fig.add_trace(
        go.Bar(y=covid["count"],x=covid["sentiment"],showlegend=False,texttemplate='%{y}', textposition='outside'),
        row=1, col=2
    )
    
    
    fig.update_layout(
        legend=dict(
        yanchor="top",
        y=0,
        xanchor="left",
        x=0),
        title={
            'text':title,
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
    )
    fig.update_xaxes(title_text="Sentiment", row=1, col=2)
    fig.update_yaxes(title_text="Number of tweets", row=1, col=2)
    fig.show()

In [11]:
def monthly_covid(covid_relate):
    month_count_covid = covid_relate.filter(covid_relate.covid_content=="Covid-related").groupBy('month').sum('count').orderBy('month')
    month_covid = month_count_covid.toPandas().rename(columns={'month':'Month','sum(count)':'Number of tweets'})
    month_covid['Month'] = month_covid['Month'].replace({3:"March",5:"May",6:"June",8:"August",11:"November"})
    return month_covid

In [12]:
def covid_overall_chart(summary,month_covid):
    fig = px.bar(summary, x="sentiment", y="percentage",color="sentiment", facet_row="covid_content", facet_col="month",
                      category_orders = {"sentiment":["positive","neutral","negative"]},
                labels={
                     "percentage %": "Percentage %",
                     "sentiment": "Sentiment",
                     "month": "Month"
                 }            )
    
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    
    fig['layout']['xaxis']['title']['text']=''
    fig['layout']['xaxis2']['title']['text']=''
    fig['layout']['xaxis4']['title']['text']=''
    fig['layout']['xaxis5']['title']['text']=''
    fig.update_layout(
        title={
                'text':"Sentiment of tweets from key months of Covid-19 in 2020",
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},
        )
    fig.update_xaxes(matches='x')
    fig.update_yaxes(matches=None,ticksuffix="%")
    fig.update_traces(texttemplate='%{y}', textposition='inside')
    
    pie = px.pie(month_covid,values='Number of tweets', names='Month',hole=.3)
    pie.update_layout(
        title={
                'text':"Distribution of Covid-related tweets",
                'y':0.95,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'})
    pie.update_traces(textinfo='percent+label', textposition='inside')
    
    pie.show()
    fig.show()

In [13]:
def work_relate(df):
    remote = df.withColumn('work_related',when((col("removed").like('%career%') | col("removed").like('%job%') | col("removed").like('%work%')),"Yes").otherwise("No"))
    remote = remote.select('text','time',month('time').alias('month'),'sentiment','work_related')
    remote.groupBy('month','sentiment','work_related').count().orderBy('month','work_related','sentiment').show()
work_relate(SA_results)

NameError: name 'SA_results' is not defined

In [None]:
def covid_overall_chart(summary,month_covid):
    fig = px.bar(summary, x="sentiment", y="percentage",color="sentiment", facet_row="covid_content", facet_col="month",
                      category_orders = {"sentiment":["positive","neutral","negative"]},
                labels={
                     "percentage %": "Percentage %",
                     "sentiment": "Sentiment",
                     "month": "Month"
                 }            )
    
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    
    fig['layout']['xaxis']['title']['text']=''
    fig['layout']['xaxis2']['title']['text']=''
    fig['layout']['xaxis4']['title']['text']=''
    fig['layout']['xaxis5']['title']['text']=''
    fig.update_layout(
        title={
                'text':"Sentiment of tweets from key months of Covid-19 in 2020",
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},
        )
    fig.update_xaxes(matches='x')
    fig.update_yaxes(matches=None,ticksuffix="%")
    fig.update_traces(texttemplate='%{y}', textposition='inside')
    
    pie = px.pie(month_covid,values='Number of tweets', names='Month',hole=.3)
    pie.update_layout(
        title={
                'text':"Distribution of Covid-related tweets",
                'y':0.95,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'})
    pie.update_traces(textinfo='percent+label', textposition='inside')
    
    pie.show()
    fig.show()

In [None]:
mean_sentiment = SA_results.select('time',dayofmonth('time').alias('day'),month('time').alias('month'),'sentiment_score','sentiment').groupBy('time','month','day').agg(avg(col("sentiment_score"))).orderBy('time')
mean_sentiment = mean_sentiment.toPandas()
mean_sentiment['month'] = mean_sentiment['month'].replace({3:"March",5:"May",6:"June",8:"August",11:"November"})

In [None]:
mean_sentiment.groupby('month').agg(['min', 'max'])
mean_sentiment.groupby('month').mean('avg(sentiment_score)')

In [None]:
import plotly.express as px
line_graph = px.line(mean_sentiment,x='day',y='avg(sentiment_score)',facet_row="month",
                     labels={'day':"Day of month", 'avg(sentiment_score)':"Average sentiment_score"})
line_graph.update_xaxes(matches=None)
line_graph.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
line_graph['layout']['yaxis']['title']['text']=''
line_graph['layout']['yaxis2']['title']['text']=''
line_graph['layout']['yaxis4']['title']['text']=''
line_graph['layout']['yaxis5']['title']['text']=''
line_graph.update_yaxes(matches='y')
line_graph.update_layout(
        title={
                'text':"Average sentiment scores over the key months of Covid-19 in 2020",
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'}
        )

line_graph.add_vrect(
    x0=16, x1=25,
    fillcolor="yellow", opacity=0.5,
    layer="below", line_width=0,row=5,col=1,
    annotation_text="Stage 1", annotation_position="top left"
)

line_graph.add_vrect(
    x0=25, x1=30,
    fillcolor="orange", opacity=0.5,
    layer="below", line_width=0,row=5,col=1,
    annotation_text="Stage 2", annotation_position="top left"
)

line_graph.add_vrect(
    x0=30, x1=31,
    fillcolor="salmon", opacity=0.5,
    layer="below", line_width=0,row=5,col=1,
    annotation_text="Stage 3", annotation_position="top left"
)

line_graph.add_vrect(
    x0=1, x1=11,
    fillcolor="salmon", opacity=0.5,
    layer="below", line_width=0,row=4,col=1,
    annotation_text="Stage 3", annotation_position="top left"
)

line_graph.add_vrect(
    x0=11, x1=31,
    fillcolor="lightsalmon", opacity=0.5,
    layer="below", line_width=0,row=4,col=1,
    annotation_text="Stage 2.5 - Ease of resctrictions", annotation_position="bottom left"
)
line_graph.add_vrect(
    x0=1, x1=31,
    fillcolor="lightsalmon", opacity=0.5,
    layer="below", line_width=0,row=3,col=1,
    annotation_text="Stage 2.5 - Ease of resctrictions", annotation_position="bottom left"
)
line_graph.add_vrect(
    x0=1, x1=31,
    fillcolor="salmon", opacity=0.5,
    layer="below", line_width=0,row=2,col=1,
    annotation_text="Stage 3 - Pandemic outbreak", annotation_position="top left"
)

line_graph.add_vrect(
    x0=1, x1=30,
    fillcolor="lightgreen", opacity=0.5,
    layer="below", line_width=0,row=1,col=1,
    annotation_text="Lift of restrictions", annotation_position="bottom left"
)
line_graph

In [None]:
if __name__ == "__main__":
    data = read_json("2020historical/*.json")
    SA_results = sentiment_analysis(data)
    covid_date_results = covid_SA_date(SA_results)
    covid_month_results = covid_SA_month(covid_date_results)
    covid_summary= pd_summary(covid_date_results,covid_month_results)
    monthy_summary = monthly_covid(covid_month_results)
    covid_charts(covid_summary,'March')
    covid_overall_chart(covid_summary,monthy_summary)