# Predicting Disease Outbreaks Milestone 2
### This module loads the Johns Hopkins COVID-19 data. The loaded data is used to determine/develop the following:
- Aggregate numbers are determined for each category of cases (confirmed/death/recovered/active) in the world.
- Countries are displayed sorted by the total number of confirmed cases with the ability to choose the number of countries included.
- A bubble chart plots the worst-affected countries with an option to choose the number of countries to be plotted using Plotly.
- The daily confirmed and death cases are plotted with an option to choose the country (or world) you want to see the plot for.
- The top 10 worst-hit countries in terms of the total number of confirmed cases, death, active cases, recovered cases, and mortality rate are plotted.
- The COVID-19 spread is displayed on a global map using Folium. 

In [94]:
# import needed modules
import utils
import ipywidgets as widgets
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import folium

# instantiate the project's covid data service (defined in the local utils
# module); the get_global_* loaders in the next cell are called on this object
covid_data = utils.CovidDataService()

In [95]:
# load data: one aggregates table plus one table each for confirmed, death
# and recovered counts
# NOTE(review): the three global_* tables are presumably laid out as four
# leading metadata columns followed by one column per date -- the later
# iloc[:, 4:] slicing relies on that; confirm against utils.CovidDataService
global_aggregates = covid_data.get_global_aggregates()
global_confirmed = covid_data.get_global_confirmed()
global_death = covid_data.get_global_death()
global_recovered = covid_data.get_global_recovered()

## Data Cleaning
### The following cleans up the column names to make them easier to use in the analysis that follows.

In [96]:
# convert column headers to lower case and strip white space
for frame in (global_aggregates, global_confirmed, global_death, global_recovered):
    frame.columns = frame.columns.str.lower().str.strip()

# simplify some column names; the aggregates' mortality_rate column is renamed
# to case_fatality_ratio, which is the name the plots below refer to
global_aggregates.rename(
    columns={'country_region': 'country', 'long_': 'long', 'mortality_rate': 'case_fatality_ratio'},
    inplace=True,
)

# the confirmed / death / recovered tables share the same header names, so
# they get the same renames (in place, as before, so the objects are mutated)
for frame in (global_confirmed, global_death, global_recovered):
    frame.rename(columns={'province/state': 'state', 'country/region': 'country'}, inplace=True)

# delete some unused columns; errors='ignore' keeps this cell re-runnable,
# since a second run would otherwise raise KeyError on the already-dropped
# columns
global_aggregates.drop(['uid', 'iso3'], axis=1, inplace=True, errors='ignore')

## World Totals
### The world totals for confirmed, deaths, recovered, and active. The totals for recovered and active are inaccurate because the global aggregates data does not include US recovered and active cases. The discrepancy can be noted because the sum of deaths, recovered, and active does not equal confirmed.

In [97]:
# world totals: sum each case-count column over every row of the aggregates
total_confirmed = global_aggregates["confirmed"].sum()
total_deaths = global_aggregates["deaths"].sum()
total_recovered = global_aggregates["recovered"].sum()
total_active = global_aggregates["active"].sum()

# the total was originally stored under the misspelled name below; keep it as
# an alias so any cell that still refers to it continues to work
total_cofirmed = total_confirmed

# int() drops any float representation before the thousands-separated format
print(f"Total confirmed: {int(total_confirmed):,}")
print(f"Total deaths: {int(total_deaths):,}")
print(f"Total recovered: {int(total_recovered):,}")
print(f"Total active: {int(total_active):,}")

Total confirmed: 138,851,279
Total deaths: 2,984,413
Total recovered: 79,056,259
Total active: 25,880,726


## Confirmed Cases
### The following is an interactive table that shows the top n countries according to confirmed cases

In [98]:
def column_style(column):
    """Return the CSS text colour to apply to every cell of one column.

    Intended as the callback for ``Styler.apply(..., axis=0)``: it receives a
    column as a Series and returns a Series with the same index whose values
    are CSS declarations.

    The 'recovered', 'deaths' and 'confirmed' columns are coloured green, red
    and purple respectively; every other column gets an empty style string.
    """
    colour_by_column = {
        'recovered': 'color : green',
        'deaths': 'color : red',
        'confirmed': 'color : purple',
    }
    css = colour_by_column.get(column.name, '')
    return pd.Series(css, column.index)

def show_n_countries(n=10):
    """Return a styled table of the n countries with the most confirmed cases.

    Args:
        n: number of rows to keep after sorting global_aggregates by the
            'confirmed' column in descending order.

    Returns:
        A pandas Styler with a white background, per-column text colours from
        column_style, the index hidden, and the four case-count columns
        formatted with thousands separators and no decimals.
    """
    # get n countries sorted by confirmed
    top_countries = global_aggregates.sort_values(
        by='confirmed', ascending=False, ignore_index=True
    ).head(n)

    # style the dataframe with colors for certain columns
    styler = top_countries.style.set_properties(**{'background-color': 'white'})
    styler = styler.apply(column_style, axis=0)

    # Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
    # favour of Styler.hide(axis="index"); use whichever this pandas provides
    if hasattr(styler, "hide"):
        styler = styler.hide(axis="index")
    else:
        styler = styler.hide_index()

    # format the output for certain columns
    count_format = "{:,.0f}"
    return styler.format({
        "confirmed": count_format,
        "deaths": count_format,
        "recovered": count_format,
        "active": count_format,
    })

# slider-driven table: continuous_update=False is set on the slider so the
# table is not rebuilt for every intermediate slider position
widgets.interact(
    show_n_countries,
    n=widgets.IntSlider(
        value=10,
        min=1,
        max=20,
        step=1,
        description="Number of Countries",
        continuous_update=False,
    ),
)

interactive(children=(IntSlider(value=10, continuous_update=False, description='Number of Countries', max=20, …

<function __main__.show_n_countries(n=10)>

## Top n Countries By Confirmed Cases
### The following is an interactive bubble chart showing the top n countries by confirmed cases. The x axis shows the number of confirmed cases and the y axis shows the number of deaths (both on a log scale), and the size of each bubble indicates the case fatality ratio.

In [99]:
def plot_n_countries_by_confirmed(n):
    """Build a bubble chart of the n countries with the most confirmed cases.

    Args:
        n: number of countries to keep after sorting global_aggregates by
            'confirmed' in descending order.

    Returns:
        A plotly FigureWidget: confirmed cases on x and deaths on y (both log
        scaled), one colour per country, bubble size taken from the
        'case_fatality_ratio' column.
    """
    ranked = global_aggregates.sort_values(
        by='confirmed',
        ascending=False,
        ignore_index=True,
    )
    top_countries = ranked.head(n)

    bubble_chart = px.scatter(
        top_countries,
        x="confirmed",
        y="deaths",
        size="case_fatality_ratio",
        color="country",
        hover_name="country",
        log_x=True,
        log_y=True,
        size_max=60,
    )

    # wrap in a FigureWidget so the chart can be returned to widgets.interact
    fig = go.FigureWidget(bubble_chart)

    # centred title and readable axis labels in place of the raw column names
    fig.layout.title = f"{n} Worst Affected Countries By Confirmed Cases"
    fig.layout.title.x = 0.5
    fig.layout.xaxis.title = "Confirmed Cases"
    fig.layout.yaxis.title = "Deaths"

    return fig

# slider-driven chart: continuous_update=False is set on the slider so the
# figure is not rebuilt for every intermediate slider position
widgets.interact(
    plot_n_countries_by_confirmed,
    n=widgets.IntSlider(
        value=10,
        min=1,
        max=20,
        step=1,
        description="Number of Countries",
        continuous_update=False,
    ),
)

interactive(children=(IntSlider(value=10, continuous_update=False, description='Number of Countries', max=20, …

<function __main__.plot_n_countries_by_confirmed(n)>

## Top n Countries By Case Fatality Rate
### The following is an interactive bubble chart showing the top n countries by case fatality rate, with the y axis showing the log number of deaths, the x axis showing the log number of confirmed cases, and the size of the bubbles indicating the case fatality rate

In [100]:
def plot_n_countries_by_fatality(n):
    """Build a bubble chart of the n countries with the highest case fatality ratio.

    Args:
        n: number of countries to keep after sorting global_aggregates by
            'case_fatality_ratio' in descending order.

    Returns:
        A plotly FigureWidget: confirmed cases on x and deaths on y (both log
        scaled), one colour per country, bubble size taken from the
        'case_fatality_ratio' column.
    """
    ranked = global_aggregates.sort_values(
        by='case_fatality_ratio',
        ascending=False,
        ignore_index=True,
    )
    top_countries = ranked.head(n)

    bubble_chart = px.scatter(
        top_countries,
        x="confirmed",
        y="deaths",
        size="case_fatality_ratio",
        color="country",
        hover_name="country",
        log_x=True,
        log_y=True,
        size_max=60,
    )

    # wrap in a FigureWidget so the chart can be returned to widgets.interact
    fig = go.FigureWidget(bubble_chart)

    # centred title and readable axis labels in place of the raw column names
    fig.layout.title = f"{n} Worst Affected Countries By Case Fatality Rate"
    fig.layout.title.x = 0.5
    fig.layout.xaxis.title = "Confirmed Cases"
    fig.layout.yaxis.title = "Deaths"

    return fig

# slider-driven chart: continuous_update=False is set on the slider so the
# figure is not rebuilt for every intermediate slider position
widgets.interact(
    plot_n_countries_by_fatality,
    n=widgets.IntSlider(
        value=10,
        min=1,
        max=20,
        step=1,
        description="Number of Countries",
        continuous_update=False,
    ),
)

interactive(children=(IntSlider(value=10, continuous_update=False, description='Number of Countries', max=20, …

<function __main__.plot_n_countries_by_fatality(n)>

## Confirmed Cases Over Time
### The following shows the confirmed cases over time for the selected country (or the world)

In [101]:
# every country present in the confirmed table, plus a "World" entry that
# get_confirmed_data_by_country treats as the sum over all rows
countries = global_confirmed['country'].unique().tolist() + ["World"]

# free-text selector restricted to the options above; starts on the US
country_combobox = widgets.Combobox(
    description='Country:',
    value='US',
    options=countries,
    ensure_option=True,
)

def get_confirmed_data_by_country(country: str, confirmed_df=None) -> pd.DataFrame:
    """Return the confirmed-case time series for one country or the world.

    Countries that are split into several province/state rows are summed
    across all of their rows, so the series covers the whole country rather
    than only its first listed province.

    Args:
        country: a value of the 'country' column, or "World" to total every
            row of the table.
        confirmed_df: table to read from; defaults to the module-level
            global_confirmed. Its first four columns are skipped and every
            remaining column is treated as one date.

    Returns:
        A DataFrame with a 'date' column (the original column labels) and a
        'count' column (the summed values).

    Raises:
        IndexError: if no row matches ``country``.
    """
    if confirmed_df is None:
        confirmed_df = global_confirmed

    if country == "World":
        rows = confirmed_df
    else:
        rows = confirmed_df.loc[confirmed_df['country'] == country]
        if rows.empty:
            # same exception type an unknown country produced before, but
            # with a message that names the offending value
            raise IndexError(f"no confirmed-case rows for country {country!r}")

    # skip the four leading non-date columns, then total the remaining ones
    totals_by_date = rows.iloc[:, 4:].sum()
    return totals_by_date.rename_axis("date").reset_index(name="count")

# seed the figure with the US series, matching the combobox's initial value
initial_data = get_confirmed_data_by_country("US")
initial_trace = px.line(
    initial_data,
    x="date",
    y="count",
    title='Confirmed Cases',
)

# a FigureWidget can be updated in place later by update_figure
confirmed_fig = go.FigureWidget(initial_trace)

# centred title and readable axis labels in place of the plotly defaults
confirmed_fig.layout.title = "COVID 19 Confirmed Cases - US"
confirmed_fig.layout.title.x = 0.5
confirmed_fig.layout.xaxis.title = "Date"
confirmed_fig.layout.yaxis.title = "Confirmed Cases"

def validate():
    """Return True when the combobox's current value is one of the known countries."""
    return country_combobox.value in countries

def update_figure(change):
    """Redraw confirmed_fig for the country selected in the combobox.

    Registered as an observer of the combobox's value; ``change.new`` is the
    newly selected value. Nothing happens when validate() rejects the
    combobox's current value.
    """
    if not validate():
        return

    selected = change.new
    series = get_confirmed_data_by_country(selected)

    # apply the trace and title changes together as one batched update
    with confirmed_fig.batch_update():
        confirmed_fig.data[0].x = series['date']
        confirmed_fig.data[0].y = series['count']
        confirmed_fig.layout.title.text = f"COVID 19 Confirmed Cases - {selected}"

# call update_figure whenever the combobox's 'value' changes
country_combobox.observe(update_figure, names='value')

# stack the selector above the figure; as the last expression of the cell
# this container is what gets displayed
widgets.VBox([country_combobox, confirmed_fig])


VBox(children=(Combobox(value='US', description='Country:', ensure_option=True, options=('Afghanistan', 'Alban…

## Worst Affected Countries
### The following are plots of the top 10 worst affected countries in terms of confirmed, deaths, recovered, active, and case fatality ratio. Note that the US is not listed in recovered and active, despite being at the top of confirmed and deaths, because recovered and active US cases are not included in the data.

In [102]:
def sort_aggregate_cases(sort_label, number_of_countries = 10):
    """Return the top rows of global_aggregates ranked by one column.

    Args:
        sort_label: name of the column to sort by, in descending order.
        number_of_countries: how many leading rows to keep.

    Returns:
        The first ``number_of_countries`` rows of the sorted table, with a
        fresh 0-based index. When ``sort_label`` is not a column of
        global_aggregates, the whole table is returned unsorted and
        untruncated.
    """
    if sort_label not in global_aggregates.columns:
        return global_aggregates

    ranked = global_aggregates.sort_values(
        by=sort_label,
        ascending=False,
        ignore_index=True,
    )
    return ranked.head(number_of_countries)
    
def create_top_figures():
    """Build one bar chart per metric showing the worst affected countries.

    Returns:
        A list of plotly FigureWidgets, in the order confirmed, deaths,
        recovered, active, case fatality ratio. Each plots the rows returned
        by sort_aggregate_cases for that metric, with 'country' on the x axis.
    """
    figures = []

    for label in ('Confirmed', 'Deaths', 'Recovered', 'Active', 'Case_Fatality_Ratio'):
        column = label.lower()                  # column name in global_aggregates
        display_name = label.replace('_', ' ')  # human-readable form for titles

        bar_chart = px.bar(sort_aggregate_cases(column), x='country', y=column)
        top_fig = go.FigureWidget(bar_chart)

        top_fig.layout.title.text = f"Top 10 Worst Affected Countries By {display_name}"
        top_fig.layout.title.x = 0.5
        top_fig.layout.xaxis.title = "Country"
        top_fig.layout.yaxis.title = display_name

        figures.append(top_fig)

    return figures

# stack the five bar charts vertically; as the last expression of the cell
# this container is what gets displayed
widgets.VBox(create_top_figures())

VBox(children=(FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'country=…

## World Map
### The following is a world map with a dot on each country with confirmed cases, where the size of the dot indicates the relative number of confirmed cases.

In [163]:
# base map that add_circles draws onto; centred at lat 11, long 0 with an
# initial zoom level of 2
covid_map = folium.Map(location=[11, 0], zoom_start=2)

def scale_confirmed(confirmed_value, old_min=None, old_max=None, new_min=10000, new_max=500000):
    """Linearly rescale a confirmed-case count into the range [new_min, new_max].

    The result is used by add_circles as the radius of the circle drawn for
    a country.

    Args:
        confirmed_value: the count to rescale.
        old_min: lower bound of the source range; defaults to the minimum of
            global_aggregates['confirmed'].
        old_max: upper bound of the source range; defaults to the maximum of
            global_aggregates['confirmed'].
        new_min: value that old_min maps to.
        new_max: value that old_max maps to.

    Returns:
        The rescaled value. When the source range is empty
        (old_min == old_max) there is nothing to interpolate, so new_min is
        returned instead of dividing by zero.
    """
    # Callers that scale many values can pass the bounds in once instead of
    # having the column rescanned on every call.
    if old_min is None:
        old_min = global_aggregates['confirmed'].min()
    if old_max is None:
        old_max = global_aggregates['confirmed'].max()

    old_range = old_max - old_min
    if old_range == 0:
        return new_min

    new_range = new_max - new_min
    return (((confirmed_value - old_min) * new_range) / old_range) + new_min

def add_circles(row_data):
    """Draw one filled red circle on covid_map for a row of global_aggregates.

    Rows with a missing 'lat', 'long' or 'confirmed' value are skipped. The
    circle's radius comes from scale_confirmed, and its tooltip lists the
    row's country, confirmed cases and deaths.

    Args:
        row_data: one row of the aggregates table, indexable by column name.
    """
    required = ('lat', 'long', 'confirmed')
    if any(pd.isna(row_data[key]) for key in required):
        return

    centre = [row_data['lat'], row_data['long']]
    radius = scale_confirmed(row_data['confirmed'])
    tooltip_html = f"Country: {row_data['country']}<br>Confirmed Cases: {row_data['confirmed']:,.0f}<br>Deaths: {row_data['deaths']:,.0f}"

    # opacity=0.0 hides the outline, so only the solid fill is visible
    marker = folium.Circle(
        location=centre,
        radius=radius,
        fill=True,
        fill_color='red',
        fill_opacity=1,
        color='red',
        opacity=0.0,
        tooltip=tooltip_html,
    )
    marker.add_to(covid_map)

# call add_circles once per row (axis=1); the result is bound to _ because
# add_circles is run only for its side effect of drawing on covid_map
_ = global_aggregates.apply(add_circles, axis=1)

# last expression of the cell, so the map itself is what gets displayed
covid_map