In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Read the raw COVID dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='owid-covid-data.csv')

In [142]:
# Keep only the rows of the four countries under study, and only the columns
# that the later cells read.
data = data.loc[
    data['location'].isin(['United States', 'United Kingdom', 'Canada', 'Australia']),
    ['location', 'date', 'total_cases', 'total_deaths', 'population'],
]

In [143]:
# Cumulative cases and deaths as a fraction of each country's population.
# Plain column arithmetic is element-wise already, so no row-by-row apply is
# needed; missing values propagate as NaN.
data['cases_by_pop'] = data['total_cases'] / data['population']
data['deaths_by_pop'] = data['total_deaths'] / data['population']

In [144]:
# Organizing the inputs by country: the COVID rows for that location plus the
# two de-duplicated tweet samples ('pop' and 'inf') read from CSV.
usa_covid = data.loc[data['location'] == 'United States']
usa_pop_tweets = pd.read_csv('usa_pop_tweets.csv').drop_duplicates()
usa_inf_tweets = pd.read_csv('usa_inf_tweets.csv').drop_duplicates()

uk_covid = data.loc[data['location'] == 'United Kingdom']
uk_pop_tweets = pd.read_csv('uk_pop_tweets.csv').drop_duplicates()
uk_inf_tweets = pd.read_csv('uk_inf_tweets.csv').drop_duplicates()

canada_covid = data.loc[data['location'] == 'Canada']
canada_pop_tweets = pd.read_csv('canada_pop_tweets.csv').drop_duplicates()
canada_inf_tweets = pd.read_csv('canada_inf_tweets.csv').drop_duplicates()

australia_covid = data.loc[data['location'] == 'Australia']
australia_pop_tweets = pd.read_csv('australia_pop_tweets.csv').drop_duplicates()
australia_inf_tweets = pd.read_csv('australia_inf_tweets.csv').drop_duplicates()

# Lookup table used by the processing loop: country name -> its three frames.
countries = {
    'United States': dict(covid=usa_covid, inf=usa_inf_tweets, pop=usa_pop_tweets),
    'United Kingdom': dict(covid=uk_covid, inf=uk_inf_tweets, pop=uk_pop_tweets),
    'Canada': dict(covid=canada_covid, inf=canada_inf_tweets, pop=canada_pop_tweets),
    'Australia': dict(covid=australia_covid, inf=australia_inf_tweets, pop=australia_pop_tweets),
}

In [145]:
def analyze_tweets(tweets, source, analyzer=None):
    """Average the VADER sentiment scores of a set of tweets for each day.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must contain a 'tweet' column (the text to score) and a 'date' column
        (the grouping key). The frame is left unmodified.
    source : str
        Prefix for the output column names (the notebook uses 'inf' and 'pop').
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with 'neg', 'neu' and 'pos' entries. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created; pass one in to reuse a
        single instance across several calls.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date', with float columns ``<source>_neg_avg``,
        ``<source>_neu_avg``, ``<source>_pos_avg`` (daily means) and
        ``<source>_amount`` (number of tweets on that day).
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Score every tweet once and keep only the three entries that are
    # averaged. Building a separate frame keeps the caller's `tweets`
    # untouched instead of adding 'neg'/'neu'/'pos' columns to it.
    scores = [analyzer.polarity_scores(text) for text in tweets['tweet']]
    scored = pd.DataFrame(scores, columns=['neg', 'neu', 'pos']).astype('float64')
    # Positional assignment: `scored` has its own fresh index, so aligning on
    # the caller's index labels would mismatch rows.
    scored['date'] = tweets['date'].to_numpy()

    daily = scored.groupby('date').agg(**{
        source + "_neg_avg": ('neg', 'mean'),
        source + "_neu_avg": ('neu', 'mean'),
        source + "_pos_avg": ('pos', 'mean'),
        source + "_amount": ('neg', 'size'),
    })
    # The tweet count is cast to float so every column keeps the float dtype
    # this function has always returned.
    return daily.astype('float64')

In [146]:
def join_data(covid_data, overall_sentiment, inf_sentiment):
    """Combine a country's COVID rows with its two daily sentiment tables.

    The two sentiment frames are inner-joined on their indexes, and the result
    is then inner-joined onto ``covid_data`` by matching its 'date' column
    against that index. Only dates present in all three inputs are kept.
    """
    sentiment_by_day = pd.merge(
        left=overall_sentiment,
        right=inf_sentiment,
        how='inner',
        left_index=True,
        right_index=True,
    )
    joined = pd.merge(
        left=covid_data,
        right=sentiment_by_day,
        how='inner',
        left_on='date',
        right_index=True,
    )
    return joined

In [147]:
# Build one joined frame per country and stack them into a single table.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop re-copies the
# accumulated rows on every iteration.
country_frames = []
for country in countries.keys():
    country_frames.append(
        join_data(
            countries[country]['covid'],
            analyze_tweets(countries[country]['inf'], 'inf'),
            analyze_tweets(countries[country]['pop'], 'pop'),
        )
    )

# With no countries configured the result stays None, as before.
ready_data = pd.concat(country_frames) if country_frames else None

In [151]:
# Persist the combined table (index included) for later use.
ready_data.to_csv(path_or_buf='ready_data.csv', index=True)