In [None]:
import pandas as pd
import ijson
import glob
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_theme(style="ticks", color_codes=True)

from pyspark.sql import SparkSession
from SentimentAnalysis import *
from TextAnalytics import *
from pyspark.sql.types import *

from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier
from pyspark.ml.clustering import LDA
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.feature import CountVectorizer

from pyspark.ml.feature import StopWordsRemover,Tokenizer, RegexTokenizer, CountVectorizer, IDF
from pyspark.sql.functions import rank,udf, col, size, explode, regexp_replace, trim, lower, lit
from pyspark.sql.functions import avg, collect_list, date_sub, dayofmonth, month, next_day, weekofyear, when
from pyspark.sql.types import *
# from pyspark.ml.clustering import LDA
import pyLDAvis

from pyspark.sql.window import Window
# from pyspark.sql.functions import rank, col
pd.set_option('display.max_colwidth', None)


from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

import plotly
import itertools
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn import preprocessing

from shapely import geometry
import geopandas as gpd
from shapely.geometry import Point, Polygon
import folium
from geopandas import sjoin
import json
import geojson
import warnings
def fxn():
    """Emit a single ``DeprecationWarning`` carrying the message "deprecated"."""
    warnings.warn("deprecated", category=DeprecationWarning)


# Trigger the sample warning once inside a temporary context where every
# warning is ignored; the filter state is restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

# From here on, ignore all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
def read_json(path):
    """Load the non-retweet tweets found in JSON dump files into a DataFrame.

    Every file matching ``path`` is streamed with ``ijson`` using the prefix
    ``results.item``; each yielded item is iterated as a collection of tweet
    dictionaries. Tweets whose ``text`` starts with ``"RT"`` are skipped.

    Parameters
    ----------
    path : str
        Glob pattern selecting the JSON files to read.

    Returns
    -------
    pandas.DataFrame
        One row per kept tweet with the columns ``tweet_id``, ``time``
        (``datetime.date``), ``text`` (the extended full text when present),
        ``city`` (``None`` when the tweet has no place), ``coordinates``,
        ``react_count`` (sum of the four engagement counters),
        ``quote_count``, ``reply_count``, ``retweet_count`` and
        ``favorite_count``. The frame is empty, but keeps these columns, when
        nothing matches.
    """
    columns = ["tweet_id", "time", "text", "city", "coordinates", "react_count",
               "quote_count", "reply_count", "retweet_count", "favorite_count"]
    records = []

    for file in glob.glob(path):
        # ijson parses bytes: binary mode avoids its deprecated text-mode
        # path and any platform-dependent default text encoding.
        with open(str(file), 'rb') as f:
            for obj in ijson.items(f, 'results.item'):
                for row in obj:
                    # NOTE(review): this also drops original tweets that merely
                    # begin with the letters "RT" - kept as in the original.
                    if row['text'].startswith("RT"):
                        continue

                    if "extended_tweet" in row:
                        text = row["extended_tweet"]['full_text']
                    else:
                        text = row["text"]

                    # `place` can be null; the original raised TypeError on
                    # row['place']['name'] and aborted the whole load.
                    place = row.get('place') or {}

                    records.append({
                        "tweet_id": row['id'],
                        "time": row['created_at'],
                        "text": text,
                        "city": place.get('name'),
                        "coordinates": row['coordinates'],
                        "react_count": (row['quote_count'] + row['reply_count']
                                        + row['retweet_count'] + row['favorite_count']),
                        "quote_count": row['quote_count'],
                        "reply_count": row['reply_count'],
                        "retweet_count": row['retweet_count'],
                        "favorite_count": row['favorite_count'],
                    })

    # Build the frame once from the collected records; `columns` fixes the
    # column order and keeps the schema even when no tweet was kept.
    data = pd.DataFrame(records, columns=columns)
    data['time'] = pd.to_datetime(data['time']).dt.date
    return data

In [None]:
def spark_df(data):
    """Convert ``data`` into a Spark DataFrame.

    Obtains (or creates) the ``SparkSession`` whose application name is
    ``'NLP'`` and passes ``data`` to its ``createDataFrame``.
    """
    session = SparkSession.builder.appName('NLP').getOrCreate()
    return session.createDataFrame(data)

In [None]:
def to_word(text):
    """Split a text into its whitespace-separated word tokens.

    Splitting on runs of whitespace (rather than on each single space) keeps
    empty-string tokens out of the result when the text has repeated, leading
    or trailing spaces.

    Parameters
    ----------
    text : str or None
        Text to tokenise.

    Returns
    -------
    list of str
        The tokens in order; an empty list for ``None`` or an empty string.
    """
    if not text:
        return []
    return text.split()

# Wrap the text-processing helpers as Spark UDFs so they can be applied to
# DataFrame columns. The second argument is the declared return type.
# NOTE(review): the wrapped helpers (strip_non_ascii, fix_abbreviation, ...)
# are not defined in this notebook; presumably they come from the star imports
# of SentimentAnalysis / TextAnalytics - confirm.
strip_non_ascii_udf = udf(strip_non_ascii, StringType())
fix_abbreviation_udf = udf(fix_abbreviation, StringType())
remove_features_udf = udf(remove_features, StringType())
# Score is declared as a float; sentiment_udf turns a score into a string via `condition`.
sentiment_analysis_udf = udf(sentiment_analysis , FloatType())
sentiment_udf = udf(lambda x: condition(x), StringType())

# check_blanks is declared as returning a string, which is why callers compare
# its result against the text "False" rather than a boolean.
check_blanks_udf = udf(check_blanks, StringType())
check_lang_udf = udf(check_lang, StringType())
remove_stops_udf = udf(remove_stops, StringType())
tag_and_remove_udf = udf(tag_and_remove, StringType())
lemmatize_udf = udf(lemmatize, StringType())
# Tokeniser defined in this notebook; yields an array of strings per row.
to_word_udf = udf(to_word,ArrayType(StringType()))

In [None]:
class Topic_analysis():
    """Text cleaning, LDA topic modelling and word clouds for a tweet DataFrame.

    ``__init__`` runs the UDF cleaning chain on the ``text`` column of the
    Spark DataFrame it receives and stores the result in ``self.clean_data``;
    every other method reads from that cleaned frame.

    Attributes
    ----------
    clean_data : Spark DataFrame
        The input columns plus ``lemm_text``, ``is_blank``, ``word`` and
        ``uid``, restricted to rows whose ``is_blank`` equals ``"False"``.
    data : Spark DataFrame
        The DataFrame passed to the constructor, untouched.
    stopwords : list of str
        Extra words excluded from the word clouds.
    """

    # Month name -> calendar month number accepted by the word-cloud methods.
    _MONTH_NUMBERS = {"January": 1, "February": 2, "March": 3, "April": 4,
                      "May": 5, "June": 6, "July": 7, "August": 8,
                      "September": 9, "October": 10, "November": 11,
                      "December": 12}

    # Image file used as the mask of every rendered word cloud.
    _MASK_IMAGE = 'VIC_shape.jpeg'

    def __init__(self,rawdata):
        """Clean ``rawdata['text']`` step by step and keep the non-blank rows.

        Parameters
        ----------
        rawdata : Spark DataFrame
            Must contain a ``text`` column (and ``time`` / ``react_count`` etc.
            for the methods that use them).
        """
        raw_cols = rawdata.columns

        # Each step reads the column produced by the previous one; only the
        # original columns and the newest intermediate column are carried on.
        cleaning_steps = [
            ('non_asci', strip_non_ascii_udf),
            ('fixed_abbrev', fix_abbreviation_udf),
            ('stop_texts', remove_stops_udf),
            ('removed', remove_features_udf),
            ('tagged_text', tag_and_remove_udf),
            ('lemm_text', lemmatize_udf),
        ]
        data = rawdata
        source_col = 'text'
        for target_col, step_udf in cleaning_steps:
            data = data.withColumn(target_col, step_udf(data[source_col])) \
                       .select(raw_cols + [target_col])
            source_col = target_col

        data = data.withColumn("is_blank", check_blanks_udf(data["lemm_text"]))
        data = data.withColumn("word", to_word_udf(data["lemm_text"]))

        # The id is assigned before blank rows are dropped, as in the original.
        data = data.withColumn("uid", monotonically_increasing_id())
        # check_blanks_udf is declared with a string return type, hence the
        # comparison with the text "False".
        data = data.filter(data["is_blank"] == "False")

        self.clean_data = data

        self.data = rawdata

        # Same words as before with the repeated entries removed.
        self.stopwords = ['love','week','way','watch','fuck','use','be','want','good','great','thanks','amp',
                          'see','go','think','people','today','say','get','time','day','look','make','know',
                          'need','thank','come','do','new','take','thing','year','many']

    @staticmethod
    def _indices_to_terms_udf(vocabulary):
        """Return a UDF mapping an array of term indices to ``vocabulary`` words."""
        def indices_to_terms(xs):
            return [vocabulary[int(x)] for x in xs]
        return udf(indices_to_terms, ArrayType(StringType()))

    @staticmethod
    def _format_data_to_pyldavis(df_filtered, count_vectorizer, transformed, lda_model):
        """Collect the model outputs into the dictionary layout used for pyLDAvis.

        Returns a dict with the keys ``topic_term_dists``, ``doc_topic_dists``,
        ``doc_lengths``, ``vocab`` and ``term_frequency``.
        """
        counted = df_filtered.select((explode(df_filtered.word)).alias("words")).groupby("words").count()
        counts_by_word = {r['words']: r['count'] for r in counted.collect()}
        # Term frequencies follow the vocabulary order of the fitted vectorizer.
        term_frequency = [counts_by_word[w] for w in count_vectorizer.vocabulary]

        # NOTE(review): the topics matrix is used as returned by the model,
        # without normalisation - confirm this is what pyLDAvis expects.
        return {'topic_term_dists': np.array(lda_model.topicsMatrix().toArray()).T,
                'doc_topic_dists': np.array([x.toArray() for x in transformed.select(["topicDistribution"]).toPandas()['topicDistribution']]),
                'doc_lengths': [r[0] for r in df_filtered.select(size(df_filtered.word)).collect()],
                'vocab': count_vectorizer.vocabulary,
                'term_frequency': term_frequency}

    @staticmethod
    def _filter_bad_docs(data):
        """Drop documents whose topic distribution is not a valid distribution.

        A document is kept only when its topic weights sum to 1 within
        floating-point tolerance; rows that sum to 0, to any other value, or
        that contain NaN are discarded together with their document length.
        The original compared the sum with ``1`` exactly, which also rejected
        valid rows that differ from 1 only by rounding error.

        ``data`` is updated in place and returned.
        """
        kept_dists = []
        kept_lengths = []
        for dist, length in zip(data['doc_topic_dists'], data['doc_lengths']):
            # A NaN anywhere makes the sum NaN, which is never close to 1.
            if np.isclose(np.sum(dist), 1.0):
                kept_dists.append(dist)
                kept_lengths.append(length)

        data['doc_topic_dists'] = kept_dists
        data['doc_lengths'] = kept_lengths
        return data

    def lda(self):
        """Fit a 6-topic LDA model on TF-IDF features of ``clean_data['word']``.

        Returns
        -------
        dict
            The pyLDAvis-style dictionary built by
            ``_format_data_to_pyldavis`` after ``_filter_bad_docs``.

        Side effects
        ------------
        ``self.lda_topics`` is set to the model's ``describeTopics()`` frame
        with an extra ``topics_words`` column holding the vocabulary words of
        each topic (the original computed this table and discarded it).
        """
        # Term counts limited to the 1000 most frequent words, then IDF weighting.
        cv = CountVectorizer(inputCol="word", outputCol="rawFeatures", vocabSize = 1000)
        cvmodel = cv.fit(self.clean_data)
        featurizedData = cvmodel.transform(self.clean_data)

        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData) # TFIDF

        lda = LDA(k=6, seed=123, optimizer="em", featuresCol="features")
        ldamodel = lda.fit(rescaledData)

        self.lda_topics = ldamodel.describeTopics().withColumn(
            "topics_words", self._indices_to_terms_udf(cvmodel.vocabulary)("termIndices"))

        transformed = ldamodel.transform(rescaledData)
        formatted = self._format_data_to_pyldavis(self.clean_data, cvmodel, transformed, ldamodel)
        return self._filter_bad_docs(formatted)

    def top_react_tweet(self):
        """Return, per month, the tweets ranked in the top 10 by ``react_count``.

        ``rank()`` is used, so tweets tied on ``react_count`` share a rank and a
        month can contribute more than ten rows.

        Returns
        -------
        pandas.DataFrame
            Columns ``text``, ``word``, the reaction counters, ``month`` and
            ``rank``, ordered by month and then rank.
        """
        df = self.clean_data.select('*',month('time').alias('month'))
        window = Window.partitionBy(df['month']).orderBy(df['react_count'].desc())
        ranked = df.select('*', rank().over(window).alias('rank')) \
                   .filter(col('rank') <= 10) \
                   .orderBy('month', 'rank')
        return ranked.select('text','word','react_count','quote_count','reply_count',
                             'retweet_count','favorite_count','month','rank').toPandas()

    def _monthly_words(self, input_month):
        """Return all ``lemm_text`` values of one month joined by single spaces.

        Raises ``KeyError`` for an unknown month name and ``IndexError`` when
        the data holds no tweet for that month.
        """
        month_number = self._MONTH_NUMBERS[input_month]
        monthly = self.clean_data.select(month('time').alias('month'),'lemm_text')
        words_by_month = monthly.groupby('month').agg(collect_list('lemm_text').alias("words"))

        def tweets_string(text):
            return ' '.join(text)

        tweets_string_udf = udf(tweets_string, StringType())
        words_by_month = words_by_month.withColumn('all_words',tweets_string_udf(words_by_month['words']))

        return words_by_month.filter(words_by_month.month==month_number) \
                             .select('all_words').collect()[0].all_words

    def _save_word_cloud(self, input_month, colormap, file_template):
        """Render the word cloud of ``input_month`` and save it as an image.

        ``file_template`` receives the month name through ``str.format``.
        """
        # Load the mask first so a missing image fails before the Spark job runs.
        mask = np.array(Image.open(self._MASK_IMAGE))
        monthly_word = self._monthly_words(input_month)

        cloud = WordCloud(mask=mask,
                          stopwords=self.stopwords,
                          background_color='white',
                          colormap=colormap,
                          ).generate_from_text(monthly_word)
        file_name = file_template.format(input_month)
        plt.figure(figsize=(18,12))
        plt.imshow(cloud)
        plt.axis('off')
        plt.title(input_month,fontsize=20)
        plt.savefig(file_name)

    def word_cloud_fall(self,input_month):
        """Return the joined text of ``input_month``.

        Unlike the other ``word_cloud_*`` methods, this one renders nothing and
        writes no file; it only returns the text a cloud would be built from.
        """
        return self._monthly_words(input_month)

    def word_cloud_spring(self,input_month):
        """Save the ``tab20``-coloured word cloud of ``input_month`` under ``Chart_HTML/``."""
        self._save_word_cloud(input_month, 'tab20', 'Chart_HTML/wordCloud_{}.png')

    def word_cloud_winter(self,input_month):
        """Save the ``ocean``-coloured word cloud of ``input_month`` under ``Chart_HTML/``."""
        self._save_word_cloud(input_month, 'ocean', 'Chart_HTML/wordCloud_{}.png')

    def word_cloud_summer(self,input_month):
        """Save the ``Set1``-coloured word cloud of ``input_month``.

        NOTE(review): this variant writes to ``Chart_HTML/WordCloud/`` while
        spring and winter write to ``Chart_HTML/`` - kept as found; confirm
        which location is intended.
        """
        self._save_word_cloud(input_month, 'Set1', 'Chart_HTML/WordCloud/wordCloud_{}.png')

In [None]:
class Sentiment:
    """Sentiment scoring of tweets and covid-related summary charts.

    Attributes
    ----------
    SA_results : Spark DataFrame
        Input rows plus ``text_non_asci``, ``fixed_abbrev``, ``removed``,
        ``sentiment_score`` and ``sentiment``.
    covid_relate_date : Spark DataFrame
        ``text``, ``time``, ``month``, ``sentiment`` and ``covid_content`` per tweet.
    covid_relate : Spark DataFrame
        Tweet counts per ``month`` / ``sentiment`` / ``covid_content``.
    summary : pandas.DataFrame
        ``covid_relate`` with ``total`` and ``percentage`` columns and month names.
    month_covid : pandas.DataFrame
        Number of covid-related tweets per month (``Month``, ``Number of tweets``).
    """

    # Calendar month number -> display name used in the pandas summaries.
    _MONTH_NAMES = {3:"March",4:"April",5:"May",6:"June",7:"July",8:"August",
                    9:"September",10:"October",11:"November",12:"December"}

    def __init__(self,df):
        """Score every tweet of ``df`` and pre-compute the monthly summaries.

        Parameters
        ----------
        df : Spark DataFrame
            Must contain the ``text`` and ``time`` columns.
        """
        df = df.withColumn('text_non_asci',strip_non_ascii_udf(df['text']))
        df = df.withColumn('fixed_abbrev',fix_abbreviation_udf(df['text_non_asci']))
        df = df.withColumn('removed',remove_features_udf(df['fixed_abbrev']))
        df = df.withColumn("sentiment_score", sentiment_analysis_udf( df['removed'] ))
        df = df.withColumn("sentiment", sentiment_udf( df['sentiment_score'] ))
        self.SA_results = df

        # Match on the lower-cased text so "COVID" / "Coronavirus" are caught
        # too; this changes nothing if `removed` is already lower case.
        cleaned = lower(col("removed"))
        is_covid = cleaned.like('%covid%') | cleaned.like('%coronavirus%')
        covid_relate = df.withColumn('covid_content',when(is_covid,"Covid-related").otherwise("Not covid-related"))
        self.covid_relate_date = covid_relate.select('text','time',month('time').alias('month'),'sentiment','covid_content')
        self.covid_relate  = self.covid_relate_date.groupBy('month','sentiment','covid_content').count().orderBy('month','covid_content')

        self.summary = self._summarise(self.covid_relate.toPandas())

        # Filter by column name: the aggregated frame is not the frame the
        # original column object (`covid_relate.covid_content`) belonged to.
        month_count_covid = self.covid_relate.filter(col('covid_content')=="Covid-related").groupBy('month').sum('count').orderBy('month')
        month_covid = month_count_covid.toPandas().rename(columns={'month':'Month','sum(count)':'Number of tweets'})
        month_covid['Month'] = month_covid['Month'].replace(self._MONTH_NAMES)
        self.month_covid = month_covid

    @classmethod
    def _summarise(cls, counts):
        """Add per-group totals and percentages to the sentiment counts.

        Parameters
        ----------
        counts : pandas.DataFrame
            Columns ``month``, ``sentiment``, ``covid_content`` and ``count``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``counts`` with ``total`` (tweets in the same month and
            ``covid_content`` group), ``percentage`` (share of that total,
            rounded to one decimal) and month numbers replaced by names.

        The total is derived from the rows themselves, so it stays correct
        when a group is missing one of the sentiment classes; the original
        repeated each group total exactly three times and relied on row order.
        """
        summary = counts.copy()
        summary['total'] = summary.groupby(['month','covid_content'])['count'].transform('sum')
        summary['percentage'] = (summary['count']/summary['total']*100).round(decimals=1)
        summary['month'] = summary['month'].replace(cls._MONTH_NAMES)
        return summary

    def sentiment_results(self):
        """Return the scored Spark DataFrame (``SA_results``)."""
        return self.SA_results

    def print_summary(self):
        """Return the pandas summary table (``summary``)."""
        return self.summary

    def by_city(self):
        """Return the average ``sentiment_score`` per ``city`` and ``month``.

        The result is a Spark DataFrame ordered by city and month.
        """
        city_score = self.SA_results.select('city','time',month('time').alias('month'),'sentiment_score') \
                                    .groupBy('city','month').agg(avg(col("sentiment_score"))) \
                                    .orderBy('city','month')
        return city_score

    def covid_overall_chart(self):
        """Write the monthly sentiment bar chart and the covid-tweet pie chart.

        Two plotly figures are serialised to
        ``graph_json/monthly_covidTweets_Bar.json`` and
        ``graph_json/monthly_dist_pie.json``. Nothing is returned.
        """
        fig = px.bar(self.summary, x="covid_content", y="percentage",color="sentiment",barmode='group',
                     animation_frame="month", animation_group="covid_content",
                     color_discrete_sequence=['rgb(255,217,47)', '#3283FE','rgb(179,179,179)'],
                     category_orders = {"sentiment":["positive","neutral","negative"]},
                     # The key must be the column name; "percentage %" matched no column.
                     labels={
                         "percentage": "Percentage %",
                         "sentiment": "Sentiment",
                         "month": "Month"})
        fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

        # Blank every axis title; the y label is drawn as a rotated annotation below.
        fig.update_yaxes(title_text='')
        fig.update_xaxes(title_text='')
        fig.update_layout(
            title={
                    'text':"Sentiment of tweets in 2020",
                    'y':0.98,
                    'x':0.5,
                    'xanchor': 'center',
                    'yanchor': 'top'},
            autosize=False,
            width=650,
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            ),
            template="plotly_white",
            annotations = list(fig.layout.annotations) +
                [go.layout.Annotation(
                        x=-0.15,
                        y=0.5,
                        font=dict(
                            size=14
                        ),
                        showarrow=False,
                        text="Percentage",
                        textangle=-90,
                        xref="paper",
                        yref="paper"
                    )
                ]
            )
        fig.update_xaxes(matches='x')
        fig.update_yaxes(matches=None,ticksuffix="%")
        fig.update_traces(texttemplate='%{y}', textposition='inside')

        fig.write_json("graph_json/monthly_covidTweets_Bar.json")

        # Text shown in the hole of the donut chart.
        annotations = [dict(xref='paper', yref='paper',
                            x=0.5, y=0.5,
                            xanchor= 'center',
                            yanchor='middle',
                            text= '<b>COVID-19</b>',
                            font=dict(family="Arial", size=30),
                            showarrow=False,
                            )]

        pie = go.Figure(data=[go.Pie(labels=self.month_covid['Month'], values=self.month_covid['Number of tweets'], hole=.4,
                                     direction ='clockwise', sort=False,
                                     marker=dict(colors=px.colors.sequential.haline,
                                                 line=dict(color='white',width=2)),
                                     )])

        pie.update_traces(
            textposition='inside',
            textinfo='value+percent',
            showlegend=True,
        )

        pie.update_layout(
            title={
                    'text':"Distribution of Covid-related tweets over the pandemic period",
                    'y':0.95,
                    'x':0.5,
                    'xanchor': 'center',
                    'yanchor': 'top'},
            annotations=annotations,
            autosize=False,
            width=700,
            height=700,
        )
        pie.write_json("graph_json/monthly_dist_pie.json")




In [None]:
class sentiment_impact:
    """Relate tweet sentiment to market indices and Victorian covid case counts.

    Attributes
    ----------
    ASX, SP500, AUD_index : pandas.DataFrame
        Index data read from ``ASX200.json``, ``SP500.json`` and
        ``AUD_index.json`` with a numeric ``Price`` column.
    raw_SA : pandas.DataFrame
        ``time``, ``sentiment_score`` and ``sentiment`` of every tweet.
    mean_sentiment : pandas.DataFrame
        Daily average sentiment score plus its standardised value ``scaled_mean``.
    by_week : pandas.DataFrame
        Every tweet with ``week_of_year``, ``week_strt_day`` and ``scaled_score``.
    summary : pandas.DataFrame
        Daily confirmed / recovered / death series merged on ``Date`` with the
        derived ``VIC_active`` column.
    """

    # Calendar month number -> display name (October and December were missing).
    _MONTH_NAMES = {3:"March",4:"April",5:"May",6:"June",7:"July",8:"August",
                    9:"September",10:"October",11:"November",12:"December"}

    # Location of the global time-series CSV files read in __init__.
    _SERIES_URL = ("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
                   "csse_covid_19_data/csse_covid_19_time_series/")

    def __init__(self,SA_results):
        """Load the reference data and aggregate the sentiment results.

        Parameters
        ----------
        SA_results : Spark DataFrame
            Must contain ``time``, ``sentiment_score`` and ``sentiment``.

        Reads three local JSON files and downloads three CSV files.
        """
        self.ASX = pd.read_json('ASX200.json')
        self.ASX['Price'] = pd.to_numeric(self.ASX['Price'].str.replace(",", ""))

        self.SP500 = pd.read_json('SP500.json')
        self.SP500['Price'] = pd.to_numeric(self.SP500['Price'].str.replace(",", ""))

        self.AUD_index = pd.read_json('AUD_index.json')
        # Multiplied by 100; the series is plotted as "Scaled AUD Index".
        self.AUD_index['Price'] = pd.to_numeric(self.AUD_index['Price'])*100

        self.raw_SA = SA_results.select('time','sentiment_score','sentiment').toPandas()

        mean_sentiment = SA_results.select('time',dayofmonth('time').alias('day'),month('time').alias('month'),'sentiment_score','sentiment') \
                                   .groupBy('time','month','day').agg(avg(col("sentiment_score"))) \
                                   .orderBy('time') \
                                   .toPandas()
        mean_sentiment['month'] = mean_sentiment['month'].replace(self._MONTH_NAMES)
        mean_sentiment['scaled_mean'] = preprocessing.scale(mean_sentiment['avg(sentiment_score)'])
        self.mean_sentiment = mean_sentiment

        # NOTE(review): week_strt_day is the day before the first Monday after
        # `time`; despite the name this looks like the last day of a
        # Monday-based week rather than its first - confirm the intent.
        self.by_week =SA_results.withColumn('week_of_year', weekofyear('time')) \
                                .withColumn("week_strt_day",date_sub(next_day(col("time"),"monday"),1)) \
                                .toPandas()
        self.by_week['scaled_score'] = preprocessing.scale(self.by_week['sentiment_score'])

        confirmed_df = self._load_vic_series(self._SERIES_URL + "time_series_covid19_confirmed_global.csv", 'VIC_confirmed')
        death_df = self._load_vic_series(self._SERIES_URL + "time_series_covid19_deaths_global.csv", 'VIC_death')
        recover_df = self._load_vic_series(self._SERIES_URL + "time_series_covid19_recovered_global.csv", 'VIC_recover')

        summary = pd.merge(pd.merge(confirmed_df,recover_df,on="Date"),death_df,on="Date")
        summary['VIC_active'] = summary['VIC_confirmed']-summary['VIC_death']-summary['VIC_recover']
        self.summary = summary

    @staticmethod
    def _load_vic_series(source, value_name):
        """Read one time-series CSV and reshape its Australian rows by date.

        Parameters
        ----------
        source : str or file-like
            Anything ``pandas.read_csv`` accepts.
        value_name : str
            Name given to the ``Victoria`` column in the result.

        Returns
        -------
        pandas.DataFrame
            One row per date (``Date`` as ``datetime.date``) and one column per
            ``Province/State`` of Australia, with ``Victoria`` renamed.
        """
        frame = pd.read_csv(source)
        # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
        # which pandas 2.0 no longer accepts.
        frame = frame[frame['Country/Region']=="Australia"].drop(columns=['Country/Region','Lat','Long'])
        frame = frame.set_index("Province/State").transpose().reset_index()
        frame = frame.rename(columns = {'index':'Date','Victoria':value_name})
        frame['Date'] = pd.to_datetime(frame['Date']).dt.date
        return frame

    def overall(self):
        """Write the covid-case chart and the sentiment / stock-market chart.

        The figures are serialised to ``graph_json/covidCases_bar.json`` and
        ``graph_json/stockMarket_line.json``. Nothing is returned.
        """
        bar = make_subplots()
        bar.add_trace(go.Bar(x=self.summary['Date'],y=self.summary['VIC_active'],name='Active cases'))
        # The cumulative series start hidden and can be toggled from the legend.
        for column, label in (('VIC_death', 'Death cases'),
                              ('VIC_confirmed', 'Confirmed cases'),
                              ('VIC_recover', 'Recover cases')):
            bar.add_trace(go.Scatter(x=self.summary['Date'], y=self.summary[column],
                                     mode='lines',
                                     name=label,visible="legendonly"))
        bar.update_layout(
            template="plotly_white",
            title={
                'text': "Covid-19 Cases in VIC",
                'y':0.9,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'})

        bar.update_xaxes(
            rangeslider_visible=True,
            rangeselector=dict(
                buttons=list([
                    dict(count=1, label="1 month", step="month", stepmode="backward"),
                    dict(count=6, label="6 months", step="month", stepmode="backward"),
                    dict(count=1, label="1 year", step="year", stepmode="backward"),
                    dict(step="all")
                ])
            ),
        )

        bar.write_json("graph_json/covidCases_bar.json")

        fig = make_subplots(specs=[[{"secondary_y": True}]])
        # xaxis2 is a second x axis overlaid on the first and drawn at the top;
        # the date-based traces are attached to it further down.
        fig.update_layout(xaxis2= {'anchor': 'y', 'overlaying': 'x', 'side': 'top'},
                          template="plotly_white",
                          title={
                              'text':"Trends of sentiment scores and Australian stock market",
                              'y':0.98,
                              'x':0.5,
                              'xanchor': 'center',
                              'yanchor': 'top'},
                          legend=dict(
                              orientation="h",yanchor="bottom",y=1.1),
                          yaxis_range=[-9,11])

        fig.add_trace(go.Box(x=self.by_week.week_of_year,
                             y=self.by_week.scaled_score,
                             boxpoints=False, # no data points
                             marker_color='rgb(9,56,125)',
                             line_color='rgb(9,56,125)',
                             name='Distribution of sentiment scores'),secondary_y=False)

        fig.add_trace(go.Scatter(x=self.mean_sentiment.time, y=self.mean_sentiment['scaled_mean'],
                                 mode='lines',
                                 name='Average sentiment score'),secondary_y=False)

        for frame, label in ((self.ASX, 'ASX 200'),
                             (self.SP500, 'SP500'),
                             (self.AUD_index, 'Scaled AUD Index')):
            fig.add_trace(go.Scatter(x=frame.Date, y=frame['Price'],
                                     mode='lines',
                                     name=label,visible="legendonly"),secondary_y=True)

        # Every trace except the weekly box plot (index 0) uses the date axis.
        for trace in fig.data[1:5]:
            trace.update(xaxis='x2')

        fig.update_layout(yaxis_title ="Standardised sentiment score",width=1000, height=475,template="plotly_white",)

        # One tick per week instead of one entry per tweet, which bloated the
        # serialised figure without changing the labels.
        week_ticks = self.by_week[['week_of_year','week_strt_day']] \
                         .drop_duplicates('week_of_year') \
                         .sort_values('week_of_year')
        fig.update_xaxes(dict(
                    tickmode = 'array',
                    tickvals = week_ticks['week_of_year'],
                    ticktext = week_ticks['week_strt_day']
                ), row=1, col=1)
        fig.update_yaxes(title_text ="Stock market index",
                         secondary_y = True)
        fig.write_json("graph_json/stockMarket_line.json")

In [None]:
class map:
    """Regional geography for Victoria plus the charts derived from it.

    The constructor loads the geographic inputs (GeoJSON lookups, town
    coordinates, Victorian boundaries and the 2020 age projections). The
    public methods build Plotly figures; most of them write the figure as
    JSON under ``graph_json/``.

    NOTE: the class name shadows the built-in ``map``. It is kept because the
    notebook instantiates the class under this name.
    """

    # Slice order of the age pie chart; must match the labels of ``_age_class``.
    _AGE_CLASSES = ['Children: 0-14', 'Teenager: 15-19', 'Adult: 20-39',
                    'Middle-aged: 40-59', 'Elderly: above 60']

    def __init__(self, city_mean_score):
        """Load every static input used by the chart methods.

        Args:
            city_mean_score: object exposing ``toPandas()``. The resulting
                frame is expected to carry the columns ``city``, ``month`` and
                ``avg(sentiment_score)`` (these are the ones the map methods
                merge, group and rename).
        """
        with open('city_mean.geojson') as geojson_file:
            self.cities_id = json.load(geojson_file)

        with open('regional_geo.geojson') as geojson_file:
            self.region_id = json.load(geojson_file)

        self.city_mean_score = city_mean_score.toPandas()

        # Town coordinates are stored as raw coordinate pairs; turn them into points.
        townGeo = pd.read_json("2020Mean_cityGeo.json")
        townGeo['geometry'] = townGeo['geometry'].apply(
            lambda coordinates: geometry.Point(coordinates))
        self.townGeo = gpd.GeoDataFrame(townGeo, geometry=townGeo['geometry'])
        self.townGeo.crs = {'init': 'epsg:4326'}

        self.age_regional = gpd.read_file("age_regional.geojson")

        VIC = gpd.read_file('australia_administrative_victoria_boundary/australia_administrative_victoria_boundary.shp')[['name','geometry']]
        VIC.crs = {'init': 'epsg:4326'}
        # The first row of the shapefile is dropped; the remaining rows are the
        # polygons the charts group by.
        self.VIC = VIC.drop(0)

        self.VIC_regional = gpd.read_file('australia_administrative_boundaries_level6_counties_polygon/australia_administrative_boundaries_level6_counties_polygon.shp')[['name','geometry']]
        self.VIC_regional.crs = {'init': 'epsg:4326'}
        region_join = sjoin(self.VIC_regional, self.VIC, "inner")
        self.region_join = region_join.rename(columns={'name_left':'city','name_right':'suburb'})

        self.age_dis = self._load_age_distribution("peopleAge_distribution2020.json")

    @staticmethod
    def _clean_lga_name(name):
        """Drop a trailing parenthesised qualifier from an LGA name."""
        return name.split('(')[0].strip(" ")

    @staticmethod
    def _clean_age_column(column):
        """Strip the leading underscore and the ``_yrs_proj`` marker from an age column."""
        if column.startswith("_"):
            return column[1:].replace("_yrs_proj", "")
        return column

    @staticmethod
    def _clean_week_column(column):
        """Turn ``wk_end_YYYY_MM_DD`` into ``YYYY-MM-DD``; leave other columns alone."""
        if "wk_end_" in column:
            return column.replace("wk_end_", "").replace("_", "-")
        return column

    @staticmethod
    def _age_class(group):
        """Map a five-year age band (e.g. ``'25-29'``) to its coarse age class."""
        if group in ('0-4', '5-9', '10-14'):
            return 'Children: 0-14'
        if group == '15-19':
            return 'Teenager: 15-19'
        if group in ('20-24', '25-29', '30-34', '35-39'):
            return 'Adult: 20-39'
        if group in ('40-44', '45-49', '50-54', '55-59'):
            return 'Middle-aged: 40-59'
        return 'Elderly: above 60'

    @staticmethod
    def _group_mean(frame, keys):
        """Mean of the numeric columns per group, with the keys back as columns.

        ``numeric_only=True`` is explicit because pandas >= 2.0 raises on
        string columns instead of silently dropping them. Groups come back
        sorted by ``keys`` (the ``groupby`` default).
        """
        return frame.groupby(by=keys).mean(numeric_only=True).reset_index()

    @classmethod
    def _load_age_distribution(cls, path):
        """Read the age projection JSON and reshape it to one row per LGA and age band."""
        age_dis = pd.read_json(path)
        age_dis['lga_name'] = age_dis['lga_name'].apply(cls._clean_lga_name)
        age_dis.columns = [cls._clean_age_column(column) for column in age_dis.columns]

        age_dis = age_dis.melt(id_vars=["lga_name",'lga_code','tot_proj_pop_denom','ste_name'],
                var_name="age_group",
                value_name="population")

        # Keep only the head-count columns; copy so the assignments below do
        # not write into a view of the melted frame.
        age_dis = age_dis[age_dis['age_group'].str.contains('count')].copy()
        age_dis['age_group'] = (
            age_dis['age_group']
            .str.replace("_count", "", regex=False)
            .str.replace("_", "-", regex=False)
            .str.replace("85-yrs-over-proj", "over 85", regex=False)
        )
        return age_dis.drop(['ste_name'], axis=1)

    def payroll(self, city):
        """Write a line chart of weekly payroll for ``city`` against the Victorian average.

        Args:
            city: value of the ``name`` column of ``self.VIC`` to plot.

        Output is written to ``graph_json/payroll_line_<city>.json``.
        """
        payroll = pd.read_json("weekly_payroll2020.json")
        payroll = payroll.drop(['sa3_code16','sa4_code16','ogc_fid','ste','sa3_name16'],axis=1)
        payroll.columns = [self._clean_week_column(column) for column in payroll.columns]

        payroll = payroll.melt(id_vars=["sa4_name16"],
                var_name="week",
                value_name="payroll")
        payroll['week'] = pd.to_datetime(payroll['week']).dt.date

        # The geometry column is picked up from the GeoJSON itself.
        payroll_gpd = gpd.read_file('payroll_geo.geojson')

        payroll_VIC = sjoin(self.VIC,payroll_gpd,how='inner')[['name','sa4_name16']]
        payroll_by_city = payroll.merge(payroll_VIC,left_on="sa4_name16",right_on="sa4_name16",how="inner")
        payroll_by_city = self._group_mean(payroll_by_city, ['name','week'])
        payroll_mean = self._group_mean(payroll_by_city, ['week']).assign(name='Average in VIC')
        payroll_city = pd.concat([payroll_by_city[payroll_by_city['name']==city], payroll_mean])
        # Rename by name rather than by position so a change in column order
        # cannot silently mislabel the axes.
        payroll_city = payroll_city[['name','week','payroll']].rename(
            columns={'name':'Region','week':'Date','payroll':'Average payroll'})

        fig = px.line( payroll_city, x="Date", y="Average payroll", color="Region",
              line_group="Region", hover_name="Region")
        fig.update_layout(
            title={
                    'text':"Average payroll in {} during Covid-19 pandemic".format(city),
                    'y':0.98,
                    'x':0.5,
                    'xanchor': 'center',
                    'yanchor': 'top'},
            autosize=False,
            width=650,
            template="plotly_white",
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            )
        )

        fig.write_json("graph_json/payroll_line_{}.json".format(city))

    def age_distrubion(self, city,color):
        """Write a donut chart of the 2020 population by age class for ``city``.

        Args:
            city: value of the ``name`` column of ``self.VIC`` to plot.
            color: sequence of colours passed to the pie marker.

        Output is written to ``graph_json/age_pie_<city>.json``.
        """
        age_VIC = sjoin(self.VIC,self.age_regional)
        age_geo = self.age_dis.merge(age_VIC,left_on="lga_name",right_on="lga_name",how="inner")

        age_by_city = age_geo[['name','age_group','population']].groupby(by=['name','age_group']).mean().reset_index().sort_values(by=['name','age_group'])
        age_by_city['population'] = age_by_city['population'].astype(int)
        # Copy the filtered rows so adding a column does not touch a view.
        age_city = age_by_city[age_by_city['name']==city].copy()
        age_city['age_class'] = age_city['age_group'].apply(self._age_class)

        age_city = age_city[['age_class','population']].groupby(by='age_class').sum()
        # reindex fixes the slice order and keeps classes with no rows visible as NaN.
        age_city = age_city.reindex(self._AGE_CLASSES).reset_index()

        # City name rendered in the hole of the donut.
        annotations = [dict(xref='paper', yref='paper',
          x=0.5, y=0.5,
          xanchor= 'center',
          yanchor='middle',
          text= '<b>{}</b>'.format(city),
          font=dict(family="Arial", size=12),
          showarrow=False,
          )]

        fig = go.Figure(data=[go.Pie(labels=age_city['age_class'], values=age_city['population'], hole=.4,
                                     direction ='clockwise', sort=False,
                                     marker=dict(colors=color,
                                                line=dict(color='white',width=2)),
                                    )])

        fig.update_traces(
          textposition='inside',
          textinfo='value+percent',
          showlegend=True,
        )

        fig.update_layout(
            title={
                'text':"Distribution of population age groups in 2020",
                'y':0.95,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},
            annotations=annotations,
            autosize=False,
            width=650,
            height=650,
        )

        fig.write_json("graph_json/age_pie_{}.json".format(city))

    # Correctly spelled alias; the original (misspelled) name stays for existing callers.
    age_distribution = age_distrubion

    def map_with_suburb(self):
        """Show a choropleth of mean sentiment per level-6 region.

        Unlike the other chart methods this one displays the figure inline
        (``fig.show()``) and does not write it to disk.
        """
        city_join = sjoin(self.townGeo,self.VIC_regional,how="left")[['city','name']].sort_values(by=['name'])
        city_mean = self.city_mean_score.merge(city_join,left_on='city',right_on='city')
        # The merged frame still holds the string column 'city'; the helper
        # averages numeric columns only and returns rows sorted by name, month.
        city_mean = self._group_mean(city_mean, ['name','month'])
        overall_mean = city_mean.rename(columns={"avg(sentiment_score)":"score"})
        overall_mean  = overall_mean.merge(self.region_join,left_on="name",right_on="city",how="inner")

        fig = px.choropleth_mapbox(overall_mean, geojson=self.region_id, locations='name', color='score',
                                   color_continuous_scale="balance",
                                   range_color=(-0.5,0.5),
                                   mapbox_style="carto-positron",
                                   zoom=5.5, center = {"lat": -36, "lon": 144.9631},
                                   opacity=0.5,
                                   labels={'score':'Scaled sentiment score'})

        fig.show()

    def map_with_city(self):
        """Write a choropleth of mean sentiment per ``self.VIC`` polygon.

        Output is written to ``graph_json/Map_byCity.json``.
        """
        city_join = sjoin(self.townGeo,self.VIC,how="right")[['city','name']]
        city_mean = self.city_mean_score.merge(city_join,left_on='city',right_on='city')
        city_mean = self._group_mean(city_mean, ['name','month'])
        overall_mean = city_mean.rename(columns={"avg(sentiment_score)":"score"})
        overall_mean  = overall_mean.merge(self.VIC,left_on="name",right_on="name",how="left")
        overall_mean_geo = gpd.GeoDataFrame(overall_mean,geometry=overall_mean.geometry)
        fig = px.choropleth_mapbox(overall_mean_geo, geojson=self.cities_id, locations='name', color='score',
                                   color_continuous_scale="balance",
                                   range_color=(-0.2,0.2),
                                   mapbox_style="carto-positron",
                                   zoom=5.5, center = {"lat": -36, "lon": 144.9631},
                                   opacity=0.5,
                                   labels={'score':'Scaled sentiment score'}
                                  )
        fig.write_json("graph_json/Map_byCity.json")

# Data preparation

In [None]:
# Read every JSON file matching the glob into the dict built by read_json.
data = read_json("2020historical/*.json")
# spark_df is defined elsewhere in the notebook; presumably it turns the dict
# into a Spark DataFrame (df.show() below is called on its result) -- TODO confirm.
df = spark_df(data)
# Sentiment helper reused by the later sections (SA.by_city, SA.sentiment_results).
SA = Sentiment(df)
df.show()

# Covid-19 in Australia at a glance

## Line graph of Covid cases

### Word cloud

In [None]:
# Only the March word cloud is rendered; uncomment one of the calls below to
# render another month. NOTE(review): the word_cloud_<season> methods live on
# Topic_analysis, which is defined outside this part of the notebook.
topic_analysis = Topic_analysis(df)
topic_analysis.word_cloud_fall("March")
# topic_analysis.word_cloud_fall("April")
# topic_analysis.word_cloud_fall("May")
# topic_analysis.word_cloud_winter("June")
# topic_analysis.word_cloud_winter("July")
# topic_analysis.word_cloud_winter("August")
# topic_analysis.word_cloud_spring("September")
# topic_analysis.word_cloud_spring("October")
# topic_analysis.word_cloud_summer("November")
# topic_analysis.word_cloud_summer("December")

### Bar chart of Covid-related tweets

In [None]:
# Rebuilds the Sentiment helper from df (it was already created in the data
# preparation cell) and draws the overall Covid chart.
SA = Sentiment(df)
SA.covid_overall_chart()

## Topic analysis

In [None]:
# Fit the topic model and export the interactive pyLDAvis page.
topic_analysis = Topic_analysis(df)
formatted = topic_analysis.lda()

# pyLDAvis.prepare receives these five inputs positionally, in this order.
lda_inputs = [
    formatted[key]
    for key in ('topic_term_dists', 'doc_topic_dists', 'doc_lengths', 'vocab', 'term_frequency')
]
py_lda_prepared_data = pyLDAvis.prepare(*lda_inputs)
# pyLDAvis.display(py_lda_prepared_data)
pyLDAvis.save_html(py_lda_prepared_data, 'Chart_HTML/Covid-related_glance/lda.html')

### Top reactive tweets

In [None]:
# Uses the topic_analysis object created in the topic-analysis cell above,
# so that cell has to run first.
topic_analysis.top_react_tweet()

# Regional sentiment

In [None]:
# city_mean_score is reused by the payroll and age-group cells further down.
city_mean_score = SA.by_city()
sentiment_map = map(city_mean_score)
sentiment_map.map_with_suburb()
# NOTE(review): the calls below pass a month, but map_with_suburb as defined in
# this notebook takes no argument; they would raise TypeError if uncommented.
# sentiment_map.map_with_suburb('April')
# sentiment_map.map_with_suburb('May')
# sentiment_map.map_with_suburb('June')
# sentiment_map.map_with_suburb('July')
# sentiment_map.map_with_suburb('August')
# sentiment_map.map_with_suburb('September')
# sentiment_map.map_with_suburb('October')
# sentiment_map.map_with_suburb('November')
# sentiment_map.map_with_suburb('December')

### Line graph of payroll

In [None]:
# city_mean_score = SA.by_city()
sentiment_map = map(city_mean_score)

# One payroll line chart per region; each call writes
# graph_json/payroll_line_<region>.json.
for region_name in (
    "Hume",
    "Loddon Mallee",
    "Grampians",
    "Barwon South West",
    "Gippsland",
    "Greater Melbourne",
):
    sentiment_map.payroll(region_name)

In [None]:
# payroll = pd.read_json("weekly_payroll2020.json")
# payroll = payroll.drop(['sa3_code16','sa4_code16','ogc_fid','sa3_name16'],axis=1)
# new_names = []
# for column in payroll.columns:
#     if "wk_end_" in column:
#         name = column
#         name = name.replace("wk_end_","")
#         name = name.replace("_","-")
#         new_names.append(name)

#     else:
#         new_names.append(column)

# payroll.columns = new_names

# payroll = payroll.melt(id_vars=["sa4_name16"], 
#         var_name="week", 
#         value_name="payroll")

### Pie chart of age group

In [None]:
# Raw age projection loaded for ad-hoc inspection. The map class reads the same
# file itself in its constructor, so the charts below do not use this variable.
age_dis = pd.read_json("peopleAge_distribution2020.json")

In [None]:
sentiment_map = map(city_mean_score)

# One age-class donut chart per region, all with the same palette; each call
# writes graph_json/age_pie_<region>.json.
age_palette = px.colors.sequential.Burgyl
for region_name in (
    "Hume",
    "Loddon Mallee",
    "Grampians",
    "Barwon South West",
    "Gippsland",
    "Greater Melbourne",
):
    sentiment_map.age_distrubion(region_name, age_palette)

### Google Maps API

In [None]:
import os

import googlemaps
from datetime import datetime

# SECURITY: the API key used to be hard-coded in this cell. It is now read from
# the GOOGLE_MAPS_API_KEY environment variable (KeyError if it is not set).
# The key that was committed should be revoked and replaced.
gmaps = googlemaps.Client(key=os.environ["GOOGLE_MAPS_API_KEY"])


def age_geo(city):
    """Geocode ``"<city>, AU"`` with the Google Maps client.

    Args:
        city: place name to look up.

    Returns:
        The list returned by ``gmaps.geocode``, or ``None`` when the lookup
        fails (API/transport error, timeout) or ``city`` is not a string.
    """
    try:
        return gmaps.geocode(city + ", AU")
    except (
        TypeError,  # city is not a string (e.g. a missing value)
        googlemaps.exceptions.ApiError,
        googlemaps.exceptions.TransportError,  # also covers HTTPError
        googlemaps.exceptions.Timeout,
    ):
        # Best effort: one failed lookup must not abort the whole column.
        return None


# NOTE(review): `results` is not created by any cell above this one; it has to
# exist in the kernel (with a 'sa4_name16' column) before this line runs.
results['geo'] = results['sa4_name16'].apply(age_geo)

In [None]:
# test = age_region.iloc[0]['geo']
def get_Point(row):
    """Convert one geocoding response into a shapely point.

    Args:
        row: list of geocoding matches, each holding
            ``['geometry']['location']`` with ``'lat'`` and ``'lng'``; may be
            ``None`` or empty when the lookup found nothing or failed.

    Returns:
        ``Point(lng, lat)`` of the last match, or ``Point(0, 0)`` when there is
        no match (the fallback used for an empty list before this change).
    """
    if not row:
        # None (failed lookup) used to raise TypeError here; treat it like an
        # empty response instead.
        return geometry.Point(0, 0)
    # NOTE(review): the last match is used, as before; confirm that this is
    # intended rather than the first match.
    location = row[-1]['geometry']['location']
    return geometry.Point(location['lng'], location['lat'])
results['geoPoint'] = results['geo'].apply(get_Point)

In [None]:
# Export the region name and geometry as payroll_geo.geojson (read back by
# map.payroll).
# NOTE(review): this selects a 'geometry' column, but the preceding cell stores
# the points in 'geoPoint'; confirm where 'geometry' is meant to come from.
# results = payroll[['sa4_name16','week','payroll','geoPoint']]
results = results[['sa4_name16','geometry']]
results_gpd = gpd.GeoDataFrame(results,geometry=results.geometry)
results_gpd.to_file('payroll_geo.geojson', driver="GeoJSON") 
# results.to_file('regional_geo.geojson', driver="GeoJSON") 

In [None]:
# NOTE(review): no visible cell assigns a module-level `payroll` (the only
# assignments are inside map.payroll and in a commented-out cell), so this
# raises NameError on a fresh kernel.
payroll

In [None]:
# age_regional_gpd = gpd.GeoDataFrame(age_region[['lga_name','geometry']],geometry=age_region.geometry)
# age_regional_gpd.to_file('age_regional.geojson', driver="GeoJSON") 

# Relationships between public sentiment & stock market

In [None]:
# Build the sentiment results once and hand them to sentiment_impact, whose
# overall() method draws the sentiment vs. stock-index figure.
SA_results = SA.sentiment_results()
sentiment_impact(SA_results).overall()
# chart_dic = {'stock_line':sentiment_impact(SA_results).overall()}
# with open('person.json', 'w') as json_file:
#     json.dump(chart_dic, json_file)

# Others

In [None]:
# One-off preparation of the regional GeoJSON.
# NOTE(review): cities_id is loaded but not used in this cell, and `region` is
# not defined in any visible cell; it must already exist in the kernel.
with open('regional_geo.geojson') as response:
    cities_id = json.load(response)
results = region[['name','geometry']]
results = gpd.GeoDataFrame(results ,geometry=results.geometry)
# Keep a single row per distinct (name, geometry) pair.
results=results.drop_duplicates()

In [None]:
# Overwrites regional_geo.geojson with the de-duplicated regions built above.
results.to_file('regional_geo.geojson', driver="GeoJSON") 

In [None]:
# Copy each feature's "name" property to a top-level "id" so the choropleth can
# match the frame's 'name' values against the GeoJSON features.
with open('regional_geo.geojson') as geojson_file:
    geo = json.load(geojson_file)

for feature in geo['features']:
    feature['id'] = feature['properties']['name']
# geo

In [None]:
# Persist the GeoJSON with the feature ids added in the previous cell
# (overwrites the file in place).
with open('regional_geo.geojson','w') as file:
    json.dump(geo, file)

In [None]:
import os

# A notebook kernel does not define __file__, so resolve the path from the
# kernel's working directory instead. os.pardir / os.path.join replace the
# Windows-only '..\\' separator.
cur_path = os.getcwd()
new_path = os.path.join(cur_path, os.pardir, 'stock_line.json')
with open(new_path, 'r') as json_file:
    plots = json.load(json_file)

In [None]:
# Scratch cell for checking that an exported chart file can be opened.
# NOTE(review): map.payroll writes to graph_json/payroll_line_<city>.json, while
# this opens the file from the working directory -- confirm the intended path.
import plotly.io as pio
import json
import plotly
with open('payroll_line_Barwon South West.json', 'r') as json_file:
#     plots = json.load(json_file)
    # Prints the file object's repr, not the chart contents.
    print(json_file)
# pio.show(plots)
# plt_div = plotly.offline.plot(plots,output_type='div')
# pio.show(plt_div)