# Covid-19 EDA and Visualization

In [None]:
# imports 
import math
import numpy as np 
import pandas as pd 
import plotly.express as ex
import plotly.graph_objects as go
import plotly.offline as pyo
from datetime import datetime
pyo.init_notebook_mode()

In [None]:
# Read the three Kaggle CSV inputs used throughout the notebook: the daily and
# summary coronavirus tables and the per-country vaccination progress table.
daily_df = pd.read_csv(
    "/kaggle/input/covid19-global-dataset/worldometer_coronavirus_daily_data.csv"
)
summary_df = pd.read_csv(
    "/kaggle/input/covid19-global-dataset/worldometer_coronavirus_summary_data.csv"
)
vacc_df = pd.read_csv(
    "/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv"
)

# Data Cleaning

In [None]:

# List the vaccination-data countries whose names have no match in the summary data.
# The summary names are collected once into a set, instead of being recomputed
# (and scanned linearly) for every vaccination country.
summary_countries = set(summary_df.country.unique())
print("Countries in the Vaccination Data not in Summary Data")
print([x for x in vacc_df.country.unique() if x not in summary_countries])

In [None]:
# Print every country name used by the summary dataset, so equivalents for the
# unmatched vaccination-data names can be found and standardised.
summary_country_names = summary_df["country"].unique()
print(summary_country_names.tolist())

In [None]:
# Standardise vaccination-data country names to the spellings used by the summary data.
# A single dict-based replace is all that is needed; calling replace() with no
# arguments does no useful work and is deprecated in recent pandas.
vacc_df["country"] = vacc_df["country"].replace({
    "Czechia": "Czech Republic",
    "United States": "USA",
    "United Kingdom": "UK",
    "Isle of Man": "Isle Of Man",
    "Republic of Ireland": "Ireland",
    "Northern Cyprus": "Cyprus",
})

# Drop the four UK constituent nations, since they are already included in "UK".
uk_nations = ['England', 'Scotland', 'Wales', 'Northern Ireland']
vacc_df = vacc_df[~vacc_df["country"].isin(uk_nations)]

In [None]:
# function to easily aggregate columns
def aggregate(df: pd.DataFrame, agg_col: str) -> pd.DataFrame:
    """Return the per-country maximum of ``agg_col`` as a one-column DataFrame.

    The numeric columns in the vaccination dataset, other than the ones marked
    "daily", are cumulative, so the latest value is the maximum of the column.
    ``groupby(...).max()`` is used because it skips NaN values; a country whose
    values are all NaN yields NaN.

    Args:
        df: Frame with a ``country`` column and the column named by ``agg_col``.
        agg_col: Name of the column to aggregate.

    Returns:
        A DataFrame indexed by ``country`` with the single column ``agg_col``.
    """
    return df.groupby("country")[agg_col].max().to_frame()

In [None]:
# Columns of the vaccination data to roll up to one value per country.
# aggregate() takes the per-country maximum of each of them.
cols_to_summarize = ['people_vaccinated',
                     'people_vaccinated_per_hundred',
                     'people_fully_vaccinated',
                     'people_fully_vaccinated_per_hundred',
                     'total_vaccinations_per_hundred',
                     'total_vaccinations']

# Start from the per-country summary table, indexed by country for the joins below.
summary = summary_df.set_index("country")

# One vaccine-combination label per country. Deduplicating on the country alone
# (not on the country/vaccines pair) guarantees a unique join key, so a country
# that appears with more than one label -- e.g. after name standardisation merged
# two regions -- cannot duplicate its row in `summary`. The first label is kept.
vaccines = (
    vacc_df[['country', 'vaccines']]
    .drop_duplicates(subset='country')
    .set_index('country')
)
summary = summary.join(vaccines)

# Add one aggregated column per vaccination metric.
for col in cols_to_summarize:
    summary = summary.join(aggregate(vacc_df, col))

# NOTE(review): this uses total_vaccinations rather than people_vaccinated, so it
# presumably measures vaccinations per 100 inhabitants, not the share of people
# vaccinated -- confirm the intended metric.
summary['percentage_vaccinated'] = summary.total_vaccinations / summary.population * 100
# Confirmed cases as a percentage of the tests carried out.
summary['tested_positive'] = summary.total_confirmed / summary.total_tests * 100

In [None]:
# Preview the first five rows of the combined per-country table used in this section.
summary.head()

**Note**: Not all countries have all summary information available. So we need to be careful about how we handle `NaN` values.

In [None]:
# helper functions 
def get_multi_line_title(title:str, subtitle:str):
    """Build a two-line figure title.

    The subtitle is placed after an HTML line break and wrapped in ``<sub>``
    tags, so it appears beneath the main title.

    Args:
        title: Main heading text.
        subtitle: Secondary text shown under the heading.

    Returns:
        The combined title string.
    """
    return "{}<br><sub>{}</sub>".format(title, subtitle)

def visualize_column(data: pd.DataFrame, xcolumn: str, ycolumn:str, title:str, colors:str, ylabel="Count", n=None, ascending=False):
    """Show a bar chart of ``ycolumn`` per ``xcolumn``, ranked by ``ycolumn``.

    Rows are sorted by ``ycolumn``, rows where it is NaN are dropped, and the
    first ``n`` rows (if ``n`` is given) are plotted. Bars are coloured by their
    own value using the ``colors`` colorscale.

    Args:
        data: Frame containing ``xcolumn`` and ``ycolumn``.
        xcolumn: Column providing the bar labels on the x axis.
        ycolumn: Column providing the bar heights and colours.
        title: Figure title.
        colors: Name of the Plotly colorscale applied to the bars.
        ylabel: Y-axis title, also used as the value label in the hover text.
        n: Number of ranked rows to keep; ``None`` keeps every row.
        ascending: ``False`` (default) ranks the largest values first ("Top");
            ``True`` ranks the smallest values first ("Bottom").

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    hovertemplate = '<br><b>%{x}</b>' + f'<br><b>{ylabel}: </b>' + '%{y}<br><extra></extra>'
    ranked = data.sort_values(ycolumn, ascending=ascending).dropna(subset=[ycolumn])

    if n is not None:
        ranked = ranked.iloc[:n]

    # The axis caption names the ranking direction and the row count; without a
    # limit it is just the column name (avoids captions such as "Top  Country").
    rank_word = "Bottom" if ascending else "Top"
    if n is None:
        xaxis_title = xcolumn.title()
    else:
        xaxis_title = f"{rank_word} {n} {xcolumn.title()}"

    fig = go.Figure(go.Bar(
                    # NOTE(review): hoverinfo='skip' is set together with a
                    # hovertemplate -- confirm which of the two is intended.
                    hoverinfo='skip',
                    x=ranked[xcolumn],
                    y=ranked[ycolumn],
                    hovertemplate=hovertemplate,
                    marker=dict(
                        color=ranked[ycolumn],
                        colorscale=colors,
                        ),
                    ),
                )

    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title=ylabel,
        plot_bgcolor='rgba(0,0,0,0)',
        hovermode="x"
    )

    fig.show()

def visualize_column2(data: pd.DataFrame, xcolumn: str, ycolumn:str, title:str, colors:str, ylabel="Count", n=None):
    """Show the same bar chart as ``visualize_column`` ranked smallest-first.

    Takes the same arguments as ``visualize_column`` and delegates to it with
    ``ascending=True``, so the first ``n`` rows are the lowest ``ycolumn`` values.
    """
    visualize_column(data, xcolumn, ycolumn, title, colors, ylabel=ylabel, n=n, ascending=True)

In [None]:
# Bar chart of the 20 countries with the highest total_vaccinations.
# The title describes the column actually plotted (total_vaccinations), rather
# than the separate people_vaccinated column.
title = get_multi_line_title("Total Vaccinations", "Total number of vaccinations administered in each country")
visualize_column(summary.reset_index(), 'country', "total_vaccinations", title, "Blugrn", n=20)

We see that the USA and China are leading in terms of the total number of vaccinations administered. This is an indicator of people who have received at least the first dose since the vaccination process generally consists of multiple doses administered over time.

In [None]:
# Bar chart of percentage_vaccinated for the 100 highest-ranked countries.
title = get_multi_line_title(
    "Percentage Vaccinated",
    "Percentage of the total population that have received the first dose",
)
visualize_column(
    data=summary.reset_index(),
    xcolumn='country',
    ycolumn="percentage_vaccinated",
    title=title,
    colors="Purp",
    ylabel="Percentage(%)",
    n=100,
)

We can see that Israel and a few smaller countries have already vaccinated a majority of their citizens. Other countries are vaccinating, but the percentage vaccinated falls off quickly, and many countries have not started vaccinating at all. Interestingly, Australia and New Zealand, two more well-off countries, have both vaccinated far less than 1%. Perhaps they are slower to vaccinate because these countries were not as hard-hit by Covid-19.

In [None]:
# Bar chart of the 25 countries with the highest tested_positive percentage.
title = get_multi_line_title(
    "Tested Positive ",
    "Percentage tested positive among those that were tested",
)
visualize_column(
    data=summary.reset_index(),
    xcolumn='country',
    ycolumn="tested_positive",
    title=title,
    colors="Reds",
    ylabel='Percentage',
    n=25,
)

In [None]:
# Bar chart of the 25 countries with the lowest tested_positive percentage
# (visualize_column2 sorts ascending).
title = get_multi_line_title(
    "Tested Positive ",
    "Countries with the lowest percentage of people who tested positive among those that were tested",
)
visualize_column2(
    data=summary.reset_index(),
    xcolumn='country',
    ycolumn="tested_positive",
    title=title,
    colors="Purples",
    ylabel='Positivity Rate',
    n=25,
)

At first glance, it might seem that French Polynesia is suffering the worst since almost 70% of those tested for Covid-19 came back positive. However, we must consider the possibility that only those exhibiting severe symptoms were tested, in the first place. I will leave this investigation to the readers.

In [None]:
# Sum total_vaccinations per vaccine combination. The rows without a vaccine label
# are filtered out first, and the filtered frame is the one that gets grouped
# (previously the filter result was discarded and `summary` was grouped directly).
data = (
    summary.dropna(subset=['vaccines'])
    .groupby('vaccines')['total_vaccinations']
    .sum()
    .reset_index()
)

title = get_multi_line_title("Vaccines In Use", "Popular Vaccine Combinations that are used around the globe")
visualize_column(data, 'vaccines', "total_vaccinations", title, "GnBu")

In [None]:
# Keep only countries that report serious_or_critical, then plot the 20 largest.
data = summary.dropna(subset=['serious_or_critical']).reset_index()

title = get_multi_line_title(
    "Serious or Critical Cases",
    "Number of people who are currently critically ill due to Covid-19",
)
visualize_column(
    data=data,
    xcolumn='country',
    ycolumn="serious_or_critical",
    title=title,
    colors="turbid",
    n=20,
)


The above chart shows the United States with the most critical cases, almost double the number in India.  To get a better
idea of how any individual country is doing, we would want to consider the population size of the country as well.
For instance, the population of Argentina is 44.94 million while the population of the USA is 328.2 million, so
Argentina has a higher rate of severe cases.

In [None]:
title = get_multi_line_title("Death Rates", "Percentage of the confirmed cases who died from Covid-19")

# death_rate = deaths as a percentage of confirmed cases; countries where it
# cannot be computed (NaN) are left off the map.
data = summary_df.assign(
    death_rate=summary_df['total_deaths'] * 100 / summary_df['total_confirmed']
).dropna(subset=['death_rate'])

# One marker per country, sized by death rate and coloured by continent.
fig = ex.scatter_geo(
    data,
    locations="country",
    locationmode='country names',
    color="continent",
    size="death_rate",
    hover_name="country",
)
fig.update_layout(title=title, title_x=0.45)
fig.show()

The death rates may vary for a few reasons. First, from disparities in treatment availability, either from the normal healthcare of a country or because of particular strain during the pandemic. Second, from disparities in testing rates: if only the more severe cases are being tested, the death rate will appear higher. Notice Yemen, with an apparent death rate of over 27.7%.

In [None]:
title = get_multi_line_title("Percentage Statistics", "Active, Recovered and Deaths in terms of percentage of population")

# Keep only countries for which every plotted count and the population are known.
data = summary.reset_index().dropna(subset=['active_cases', 'total_recovered', 'total_deaths','population'])

# Express each count as a percentage of the country's population.
percent_columns = [
    ('active_cases', 'active_percent'),
    ('total_recovered', 'recovered_percent'),
    ('total_deaths', 'deaths_percent'),
    ('total_confirmed', 'confirmed_percent'),
    ('serious_or_critical', 'serious_or_critical_percent'),
]
for count_col, percent_col in percent_columns:
    data[percent_col] = data[count_col] / data['population'] * 100

# Order countries by their confirmed percentage and keep one row per country.
data = data.sort_values('confirmed_percent', ascending=True).drop_duplicates(subset=['country'])

# (legend name, column, fill colour, outline colour) for each stacked series.
bar_specs = [
    ("Deaths", 'deaths_percent', 'crimson', 'red'),
    ("Active", 'active_percent', 'royalblue', 'blue'),
    ("Recovered", 'recovered_percent', 'lightseagreen', 'green'),
]

fig = go.Figure(data=[
    go.Bar(
        name=name,
        x=data['country'],
        y=data[column],
        marker=dict(
            color=fill,
            line=dict(width=0.5, color=outline),
        ),
    )
    for name, column, fill, outline in bar_specs
])

fig.update_layout(
        title=title,
        xaxis_title="Country",
        yaxis_title="Percentages(%)",
        plot_bgcolor='rgba(0,0,0,0)',
        hovermode="x",
        barmode='stack'
    )

# Show the figure explicitly, like the other plotting cells, instead of relying
# on update_layout()'s return value being the cell's last expression.
fig.show()


In [None]:
# Choropleth map of total confirmed cases per country.
# `data` is the per-country frame left behind by the "Percentage Statistics" cell
# (rows with known active/recovered/death counts and population).
title = get_multi_line_title("Total Confirmed Cases", "Active Cases, Recovered Cases, and Deaths in terms of total counts")

# No title is passed here: the figure title is set once, in update_layout below.
fig = ex.choropleth(data, locations="country",
                    locationmode='country names',
                    color="total_confirmed",
                    hover_name="country",
                    hover_data=['total_deaths', 'active_cases', 'total_recovered'],
                    color_continuous_scale="Sunset"
                   )

fig.update_layout(title=title, title_x=0.5)
fig.show()

## Vaccine Type Used by Country

Different countries are using different vaccines. Let us try to visualize this using the data we prepared earlier.

In [None]:
# Plot the vaccine combination used by each country as a categorical choropleth.
title = get_multi_line_title("Popular Vaccines", "Vaccines being administered around the world")

# Countries without a vaccine label are left off the map.
data = summary.reset_index().dropna(subset=['vaccines'])
fig = ex.choropleth(data, locations="country",
                    locationmode='country names',
                    color="vaccines",
                    hover_name="country",
                   )

fig.update_layout(
    title=title,
    title_x=0.5,
    legend_orientation='h',
)
fig.show()

## Confirmed Cases by Continent 


In [None]:
title = get_multi_line_title("Covid-19 Continents", "Visualizing the total number of confirmed cases by Continent")

# Display label -> source column for the continent-level totals shown on the map.
continent_totals = {
    'Total Confirmed': 'total_confirmed',
    'Active Cases': 'active_cases',
    'Total Deaths': 'total_deaths',
    'Total Recovered': 'total_recovered',
}

# Give every country row the total of its continent. transform('sum') returns
# the group total aligned to the original rows, so one groupby replaces the four
# separate groupbys plus per-row lookups (which raise KeyError for a row whose
# continent is missing).
data = summary_df.copy()
by_continent = summary_df.groupby('continent')
for label, source_col in continent_totals.items():
    data[label] = by_continent[source_col].transform('sum')

# Colour each country by its continent's total confirmed cases.
fig = ex.choropleth(data, locations="country",
                    locationmode='country names',
                    color="Total Confirmed",
                    hover_name="continent",
                    hover_data=['Active Cases', 'Total Confirmed', 'Total Deaths', 'Total Recovered'],
                    title=title,
                    color_continuous_scale="reds"
                   )

fig.update_layout(title_x=0.5)

fig.show()

In [None]:
# Two extra dates added to the date axis to improve the animation.
padding_dates = ['2020-12-12', '2020-12-13']

# Reported cumulative total for every (country, date) pair present in the data
# (the maximum, should a pair occur more than once).
reported = vacc_df.groupby(['country', 'date'])['total_vaccinations'].max()

# Unique countries and dates. Dates are "YYYY-MM-DD" strings, so sorting them as
# text puts them in chronological order.
countries = reported.index.get_level_values('country').unique().tolist()
dates = sorted(set(reported.index.get_level_values('date')) | set(padding_dates))

# Build the complete country x date grid in one step, instead of appending the
# missing rows one at a time (DataFrame.append was removed in pandas 2.0 and
# row-wise appends are quadratic). Pairs that are not in the data start as NaN.
full_grid = pd.MultiIndex.from_product([countries, dates], names=['country', 'date'])
filled = (
    reported.reindex(full_grid)
    # Fill gaps with the previous day's value (fine, since the column is
    # cumulative). Filling is done per country, so values never carry over
    # from one country to the next.
    .groupby(level='country')
    .ffill()
    # Anything still missing lies before a country's first report: count it as 0.
    .fillna(0)
)
short = filled.reset_index()[['date', 'country', 'total_vaccinations']]
short = short.sort_values(['country', 'date'])

# Scale the number by log2 to make the colour transitions smoother.
vaccines = short.sort_values('date')
vaccines['log_scale'] = np.log2(vaccines['total_vaccinations'] + 1)

fig = ex.choropleth(vaccines, locations="country",
                    locationmode='country names',
                    color="log_scale",
                    hover_name="country",
                    hover_data=['log_scale', "total_vaccinations"],
                    animation_frame="date",
                    color_continuous_scale="BuGn",
                   )

title = get_multi_line_title("Vaccination Progress", "Number of Vaccines Administered Around the World")
# Fix the colour range across all animation frames and hide the colour bar.
fig.update_layout(
    title=title,
    title_x=0.5,
    coloraxis_cmin=0,
    coloraxis_cmax=25,
    coloraxis_showscale=False,
)

fig.show()

## Rate of Vaccination



In [None]:
# The ten countries with the largest cumulative total_vaccinations.
countries = (
    short.groupby('country')['total_vaccinations']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)

title = get_multi_line_title("Vaccination Progress", "Rate of vaccinations for the top-10 vaccinated countries")

# One line per country: cumulative vaccinations against date.
line_plots = [
    go.Scatter(
        name=name,
        x=short.loc[short.country == name, 'date'],
        y=short.loc[short.country == name, 'total_vaccinations'],
        mode='lines+markers',
    )
    for name in countries
]

fig = go.Figure(line_plots)
fig.update_layout(
    title=title,
    yaxis_title="Count",
    hovermode='x',
    legend_orientation='h',
)
fig.show()

We can consider the rate of vaccination and the rate of new cases.


In [None]:
# Work on copies so the loaded frames keep their original string dates.
vaccs = vacc_df.copy()
daily = daily_df.copy()

# Standardise the dates: parse the "%Y-%m-%d" strings into datetimes.
vaccs.date = vaccs.date.apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
daily.date = daily.date.apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))

# Use only the countries and dates that have daily vaccination figures.
with_daily_vaccs = vaccs.dropna(subset=['daily_vaccinations'])
countries = with_daily_vaccs['country'].unique()
dates = with_daily_vaccs['date'].unique()
# isin() does the membership test in one vectorised pass, instead of scanning
# the whole array once per row.
country_mask = daily.country.isin(countries)
date_mask = daily.date.isin(dates)

# Generate the visualization data: per-date totals across the selected countries.
columns_to_sum = ['daily_new_cases', 'cumulative_total_cases', 'cumulative_total_deaths', 'active_cases']
daily_cases = daily[country_mask & date_mask].groupby('date')[columns_to_sum].sum()
daily_vaccs = vaccs.groupby('date')[['daily_vaccinations']].sum()

# Bring back the gap-filled vaccine data prepared in the previous section.
# Its dates are still strings, so its index is converted to datetimes to match
# the datetime index of the case table before the two are joined.
cumulative_vaccines = vaccines.groupby('date')[['total_vaccinations']].sum()
cumulative_vaccines.index = pd.to_datetime(cumulative_vaccines.index)

data = daily_cases.join(daily_vaccs).join(cumulative_vaccines).reset_index()

In [None]:
# Preview the first five rows of the per-date table used in this section.
data.head(5)

In [None]:
title = get_multi_line_title("Vaccine vs Virus", "Comparing the total number of daily new cases and daily vaccinations globally")

# (legend name, column in `data`, bar colour) for each stacked series.
daily_series = [
    ("New Cases", 'daily_new_cases', "crimson"),
    ("Vaccinations", 'daily_vaccinations', "lightseagreen"),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=data['date'], y=data[column], marker_color=colour)
    for label, column, colour in daily_series
])

fig.update_layout(
    title=title,
    xaxis_title="Date",
    yaxis_title="Count",
    plot_bgcolor='rgba(0,0,0,0)',
    barmode='stack',
    hovermode="x",
)

fig.show()

## New Cases and New Vaccines Administered

The vaccinations and the new cases seem to follow a common "wave-like" trend. If we look carefully, peaks usually fall on Thursdays. 



In [None]:
title = get_multi_line_title("Cumulative Statistics","Visualizing Cumulative Statistics of Disease vs Vaccine")

# (legend name, column in `data`, marker colour) for each cumulative line.
cumulative_series = [
    ("Total Deaths", 'cumulative_total_deaths', "crimson"),
    ("Total Cases", 'cumulative_total_cases', "royalblue"),
    ("Total Vaccinated", 'total_vaccinations', "lightseagreen"),
]

fig = go.Figure(data=[
    go.Scatter(
        mode="lines+markers",
        name=label,
        x=data['date'],
        y=data[column],
        marker_color=colour,
    )
    for label, column, colour in cumulative_series
])

fig.update_layout(
    title=title,
    xaxis_title="",
    yaxis_title="Count",
    hovermode="x",
    legend_orientation='h',
    width=800,
    height=800,
)
fig.show()



In [None]:
# NOTE(review): the subtitle says "cumulative totals", but the values plotted are
# the summed daily_vaccinations per date -- confirm which is intended.
title = get_multi_line_title("Vaccine Breakdown", "Cumulative totals of each vaccine administered over time")

# Summed daily vaccinations as a date x vaccine-combination table. A combination
# with no rows on a given date is NaN at this point.
vacc_plot = (
    vacc_df.groupby(['vaccines', 'date'])['daily_vaccinations']
    .sum()
    .unstack('vaccines')
    .sort_index()
)
dates = vacc_plot.index.tolist()
vaccines = vacc_plot.columns.tolist()

# Fill each gap with the previous date's value for the same combination, and use
# 0 before a combination's first entry. The last date is left out of the plot.
filled = vacc_plot.ffill().fillna(0).iloc[:-1]

# Long form: one row per (date, vaccine) with its count.
data = (
    filled.reset_index()
    .melt(id_vars='date', var_name='vaccine', value_name='count')
    .sort_values(['vaccine', 'date'])
)

# One line per vaccine combination.
line_plots = []
for v in vaccines:
    vacc_data = data[data.vaccine == v]
    line_plots.append(
        go.Scatter(
            name=v,
            x=vacc_data.date,
            mode='lines+markers',
            y=vacc_data['count'],
        )
    )

fig = go.Figure(line_plots)
fig.update_layout(
    title=title,
    xaxis_title="Date",
    yaxis_title="Count",
    hovermode='x',
    legend_orientation='h',
    width=800,
    height=1600,
)

fig.show()