# Install Essential Package

In [None]:
# Installation of packages
!pip install wget

# Implementation

In [None]:
# Imports of libraries
from plotly.subplots import make_subplots
from urllib.request import urlopen
from datetime import date, timedelta
import plotly.express as px
import pandas as pd 
import numpy as np
import warnings
import datetime
import folium
import json
import wget
import os
    
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Download the latest Johns Hopkins CSSE time-series files (global and US).
urls = ["https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv"]

for url in urls:
    # Remove only the stale copy of the file about to be fetched, instead of
    # wiping every *.csv in the working directory with a shell command.
    # NOTE(review): the stale copy is removed because wget.download is expected
    # to save under a new name rather than overwrite an existing file - confirm.
    stale_copy = url.rsplit("/", 1)[-1]
    if os.path.exists(stale_copy):
        os.remove(stale_copy)
    wget.download(url)

## Global Stats

In [None]:
# Read the three downloaded global time-series files into dataframes.
confirmed_wide, deceased_wide, recovered_wide = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./time_series_covid19_confirmed_global.csv",
        "./time_series_covid19_deaths_global.csv",
        "time_series_covid19_recovered_global.csv",
    )
)

In [None]:
# Display the confirmed-cases table exactly as it was read from the CSV.
confirmed_wide

# Data Cleaning

In [None]:
# Reshape each table from wide to long: the location columns stay as identifiers,
# every other column header becomes a "Date" value and its cell the case count.
location_cols = ["Province/State", "Country/Region", "Lat", "Long"]

confirmedDF = confirmed_wide.melt(id_vars=location_cols, var_name="Date", value_name="Confirmed")
deceasedDF = deceased_wide.melt(id_vars=location_cols, var_name="Date", value_name="Deceased")
recoveredDF = recovered_wide.melt(id_vars=location_cols, var_name="Date", value_name="Recovered")

# Report the size of each long-format frame, then preview the first rows.
for label, frame in (("confirmedDF", confirmedDF),
                     ("deceasedDF", deceasedDF),
                     ("recoveredDF", recoveredDF)):
    print(label + " Shape: ", frame.shape)
confirmedDF.head()

In [None]:
# Combine confirmed, deceased and recovered counts into one frame.
# Outer joins keep location/date rows that are missing from any of the tables.
join_keys = ["Province/State", "Country/Region", "Date", "Lat", "Long"]
totalDF = (
    confirmedDF
    .merge(deceasedDF, how='outer', on=join_keys)
    .merge(recoveredDF, how='outer', on=join_keys)
)
totalDF

In [None]:
# Report the number of missing values per column before any filling.
print("Before NaN removal:")
print(totalDF.isna().sum())
print()

# Replace missing case counts with 0 (the rows are kept, not removed).
# Only the three count columns are filled; other columns are left untouched.
count_cols = ["Confirmed", "Deceased", "Recovered"]
totalDF[count_cols] = totalDF[count_cols].fillna(0)

# Report again so the effect of the fill is visible.
print("After NaN removal:")
print(totalDF.isna().sum())

In [None]:
# Show the distinct values of the Country/Region column.
pd.unique(totalDF["Country/Region"])

In [None]:
# Parse the Date column into datetimes and order the rows chronologically.
totalDF = (
    totalDF
    .assign(Date=pd.to_datetime(totalDF["Date"]))
    .sort_values(by=["Date"])
)
totalDF

In [None]:
# Sum Recovered, Confirmed and Deceased over all locations for each date.
# The columns are selected with a list (double brackets): selecting several
# columns from a groupby with a bare tuple is deprecated in pandas 1.x and
# raises an error in pandas 2.x.
date_groupedDF = (
    totalDF
    .groupby('Date')[['Recovered', 'Confirmed', 'Deceased']]
    .sum()
    .reset_index()
)
date_groupedDF

# Visualization

In [None]:
# Colour palette: one hex colour per case type, reused by the charts and maps below.
Recovered = '#28a745'  # green
Confirmed = '#007bff'  # blue
Deceased = '#ff073a'   # red

## Line Tree Map

In [None]:
# Keep only the totals of the most recent date in the grouped frame.
latest_day = date_groupedDF['Date'].max()
temp = date_groupedDF.loc[date_groupedDF['Date'] == latest_day].reset_index(drop=True)

# One row per case type ("variable") with its total ("value"); reused by the pie chart cell.
melted_temp = pd.melt(temp, id_vars="Date", value_vars=['Recovered', 'Confirmed', 'Deceased'])

palette = [Recovered, Confirmed, Deceased]
fig = px.treemap(
    melted_temp,
    path=["variable"],
    values="value",
    height=250,
    width=1200,
    color_discrete_sequence=palette,
)
# Show the label and the numeric value inside each tile.
fig.data[0].textinfo = 'label+text+value'
fig.show()

## Pie Chart

In [None]:
# Pie chart of the latest totals per case type, built from melted_temp.
pie_options = dict(
    values="value",
    names='variable',
    title='Covid 19',
    height=750,
    color_discrete_sequence=[Recovered, Confirmed, Deceased],
)
fig = px.pie(melted_temp, **pie_options)
# Show the label and the numeric value on each slice.
fig.data[0].textinfo = 'label+text+value'
fig.show()

## Area Chart

In [None]:
# Long format for plotting: one row per (Date, Case Type) with its Frequency.
date_countDF = pd.melt(date_groupedDF, id_vars=['Date'],
                       var_name='Case Type', value_name='Frequency')

fig = px.area(
    date_countDF,
    x='Date',
    y='Frequency',
    color='Case Type',
    title='Cases Over Time Slider',
    color_discrete_sequence=[Recovered, Confirmed, Deceased],
)
# Add a range slider under the x axis.
fig.update_layout(xaxis={'rangeslider': {'visible': True}})
fig.show()

## Bar Chart

In [None]:
# Build one bar chart per case type, each in its palette colour.
fig1, fig2, fig3 = (
    px.bar(date_groupedDF, x="Date", y=column, color_discrete_sequence=[colour])
    for column, colour in (("Confirmed", Confirmed),
                           ("Deceased", Deceased),
                           ("Recovered", Recovered))
)

# 2x2 grid; only three of the four cells receive a trace.
fig = make_subplots(
    rows=2, cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.1,
    vertical_spacing=0.1,
    subplot_titles=("Confirmed Cases", "Deceased Cases", "Recovered Cases"),
)

# Copy the single trace of each chart into its grid position.
for chart, (grid_row, grid_col) in zip((fig1, fig2, fig3), ((1, 1), (1, 2), (2, 1))):
    fig.add_trace(chart.data[0], row=grid_row, col=grid_col)

fig.update_layout(height=700, title='Day Wise Cases')
fig.show()

## Bar Chart Logarithm

In [None]:
# Same three bar charts as the linear version, drawn on logarithmic y axes.
fig1, fig2, fig3 = (
    px.bar(date_groupedDF, x='Date', y=column, color_discrete_sequence=[colour])
    for column, colour in (('Confirmed', Confirmed),
                           ('Deceased', Deceased),
                           ('Recovered', Recovered))
)

fig = make_subplots(
    rows=2, cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.1,
    subplot_titles=("Confirmed Cases", "Deceased Cases", "Recovered Cases"),
)

for chart, (grid_row, grid_col) in zip((fig1, fig2, fig3), ((1, 1), (1, 2), (2, 1))):
    fig.add_trace(chart.data[0], row=grid_row, col=grid_col)

# Switch only the three populated subplots to a log scale.
log_axes = {axis_name: {'type': 'log'} for axis_name in ('yaxis', 'yaxis2', 'yaxis3')}
fig.update_layout(height=800, title='Day Wise Cases(Log Scale)', **log_axes)
fig.show()

## Maps

## Confirmed Cases Map

In [None]:
# Keep only the rows of the most recent date in the dataframe.
temp = totalDF[totalDF['Date'] == totalDF['Date'].max()]

# World map; `max_zoom` (previously misspelled `max_zoon`, so the limit was
# never applied) caps how far the user can zoom in.
_map = folium.Map(location=[0, 0], tiles='cartodbpositron',
                  min_zoom=1, max_zoom=4, zoom_start=1.5)

# One circle per location, sized by its confirmed count.
# Rows without coordinates are skipped because a circle needs a numeric
# location (folium is expected to reject NaN coordinates - TODO confirm).
for _, row in temp.dropna(subset=['Lat', 'Long']).iterrows():
    tooltip_html = ('<li> Country: ' + str(row['Country/Region']) +
                    '<li> Confirmed: ' + str(row['Confirmed']) +
                    '<li> Deceased: ' + str(row['Deceased']) +
                    '<li> Recovered: ' + str(row['Recovered']))
    folium.Circle(
        location=[row['Lat'], row['Long']],
        # `fill` is a boolean flag; with no fill_color given the fill is
        # expected to reuse `color`, matching what the original rendered.
        color=Confirmed, fill=True,
        tooltip=tooltip_html,
        radius=int(row['Confirmed']) // 2,
    ).add_to(_map)
_map

## Deceased Cases Map

In [None]:
# Keep only the rows of the most recent date in the dataframe.
temp = totalDF[totalDF['Date'] == totalDF['Date'].max()]

# World map; `max_zoom` (previously misspelled `max_zoon`, so the limit was
# never applied) caps how far the user can zoom in.
_map = folium.Map(location=[0, 0], tiles='cartodbpositron',
                  min_zoom=1, max_zoom=4, zoom_start=1.5)

# One circle per location, sized by its deceased count raised to the power 1.2.
# Rows without coordinates are skipped because a circle needs a numeric
# location (folium is expected to reject NaN coordinates - TODO confirm).
for _, row in temp.dropna(subset=['Lat', 'Long']).iterrows():
    tooltip_html = ('<li> Country: ' + str(row['Country/Region']) +
                    '<li> Confirmed: ' + str(row['Confirmed']) +
                    '<li> Deceased: ' + str(row['Deceased']) +
                    '<li> Recovered: ' + str(row['Recovered']))
    folium.Circle(
        location=[row['Lat'], row['Long']],
        # `fill` is a boolean flag; with no fill_color given the fill is
        # expected to reuse `color`, matching what the original rendered.
        color=Deceased, fill=True,
        tooltip=tooltip_html,
        radius=int(row['Deceased']) ** 1.2,
    ).add_to(_map)
_map

# US Based Stats

In [None]:
# Load the US county-level confirmed and deceased time series.
us_confirmed = pd.read_csv("./time_series_covid19_confirmed_US.csv")
us_deceased = pd.read_csv("./time_series_covid19_deaths_US.csv")

# Two county datasets are loaded on purpose:
#   * us_population supplies the population figures, but its FIPS codes are
#     not reliably five digits, which the visualizations need;
#   * us_county carries consistent five-digit FIPS codes and is used later to
#     correct the FIPS values that came from us_population.
us_population = pd.read_csv("../input/covid19county/covid_county_population_usafacts.csv")

# Every us_county column is read as a string; the file's own header row is
# replaced by `headers`, and the 'idx' column is discarded right away.
headers = ['idx', 'FIPS', 'County Name', 'State']
dtypes = dict.fromkeys(headers, 'str')
us_county = pd.read_csv(
    "../input/usa-county-info/usa_county_info",
    header=0,
    names=headers,
    dtype=dtypes,
).drop(columns=['idx'])

In [None]:
# Display the US confirmed-cases table as loaded.
us_confirmed

In [None]:
# List the column labels of the US deceased table.
us_deceased.columns

In [None]:
# Display the county population table as loaded.
us_population

In [None]:
# Display the county reference table (FIPS, County Name, State).
us_county

## Printing some useful stats

In [None]:
# Print the (rows, columns) shape of each US frame; the last line carries an
# extra newline argument to separate the two groups of output.
shape_report = (
    ("us_confirmed shape", us_confirmed, ()),
    ("us_deceased shape", us_deceased, ()),
    ("us_population shape", us_population, ()),
    ("us_county shape", us_county, ('\n',)),
)
for label, frame, trailer in shape_report:
    print(label, frame.shape, *trailer)

# Count distinct FIPS codes per frame; a missing value counts as one distinct
# entry, as it does in the array returned by `.unique()`.
fips_report = (
    ("Fips in confirmed: ", us_confirmed["FIPS"]),
    ("Fips in deceased: ", us_deceased["FIPS"]),
    ("Fips in population data: ", us_population["countyFIPS"]),
    ("Fips in us_county data: ", us_county["FIPS"]),
)
for label, codes in fips_report:
    print(label, codes.nunique(dropna=False))

# Data Cleaning

In [None]:
# Columns that are dropped from both US time-series frames.
non_usable_columns = ["UID", "iso2", "iso3", "code3", "Admin2", "Province_State", "Country_Region", "Lat", "Long_", "Combined_Key"]
us_confirmed = us_confirmed.drop(non_usable_columns, axis=1)

# The deceased frame has an additional "Population" column. It is dropped too,
# because population figures are taken from a separate dataframe.
deceased_only_columns = ["Population"]
us_deceased = us_deceased.drop(non_usable_columns + deceased_only_columns, axis=1)

In [None]:
# Report how many rows have no FIPS code in each frame.
print("Initially Total NaN count:")
for label, frame in (("us_confirmed = ", us_confirmed),
                     ("us_deceased = ", us_deceased)):
    print(label, frame["FIPS"].isna().sum())

# Drop every row that contains a NaN in any column (not only in FIPS).
us_confirmed = us_confirmed.dropna()
us_deceased = us_deceased.dropna()

In [None]:
# Wide to long: FIPS stays as the identifier, every remaining column header
# becomes a "Date" value and its cell the corresponding count.
us_confirmedDF = us_confirmed.melt(id_vars=["FIPS"], var_name="Date", value_name="Confirmed")
us_deceasedDF = us_deceased.melt(id_vars=["FIPS"], var_name="Date", value_name="Deceased")

In [None]:
# Display the long-format US confirmed frame (FIPS, Date, Confirmed).
us_confirmedDF

In [None]:
# Display the long-format US deceased frame (FIPS, Date, Deceased).
us_deceasedDF

In [None]:
# Join confirmed and deceased counts on county and date (inner join: only
# pairs present in both frames survive), cast the numeric columns to integers,
# parse the dates, and order the rows by date and then county.
completeDF = (
    us_confirmedDF
    .merge(us_deceasedDF, how='inner', on=["FIPS", "Date"])
    .astype({'Confirmed': 'int', 'Deceased': 'int', 'FIPS': 'int'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by=["Date", "FIPS"])
    .reset_index(drop=True)
)

completeDF

In [None]:
# Show the most recent date present in the US data.
# This reads it straight from completeDF: the `latestDate` variable is only
# assigned in a later cell, so referencing it here fails on a fresh kernel.
completeDF['Date'].max()

In [None]:
# Build three snapshots of the US data: the latest date, two weeks before it
# and four weeks before it.
# The reference date is the newest date actually present in completeDF rather
# than "yesterday": if the downloaded data lags behind the calendar, filtering
# on a calendar-derived date selects no rows and every later step ends up empty.
latestDate = completeDF['Date'].max().date()
twoWeekDate = latestDate - timedelta(weeks=2)
monthOldDate = latestDate - timedelta(weeks=4)

latestDateDF = completeDF[completeDF['Date'] == np.datetime64(latestDate)].reset_index(drop=True)
twoWeekDateDF = completeDF[completeDF['Date'] == np.datetime64(twoWeekDate)].reset_index(drop=True)
monthOldDateDF = completeDF[completeDF['Date'] == np.datetime64(monthOldDate)].reset_index(drop=True)

latestDateDF

In [None]:
# Print the three reference dates used for the snapshots.
for label, day in (("latestDate: ", latestDate),
                   ("twoWeekDate: ", twoWeekDate),
                   ("monthOldDate: ", monthOldDate)):
    print(label, day)

In [None]:
# Use "FIPS" as the county key name in us_population too, so it matches the
# key column of the other dataframes.
us_population = us_population.rename(columns={'countyFIPS': 'FIPS'})
us_population

## Creating Final DataFrame

In [None]:
# Attach the Confirmed/Deceased counts of each snapshot to the population
# table, one left join per snapshot on FIPS. The Date column is not carried
# over; each snapshot's counts get their own column names.
snapshots = (
    (latestDateDF, 'T_Confirmed', 'T_Deceased'),
    (twoWeekDateDF, 'Two_Week_Old_Confirmed', 'Two_Week_Old_Deceased'),
    (monthOldDateDF, 'Month_Old_Confirmed', 'Month_Old_Deceased'),
)

finalDF = us_population
for snapshot, confirmed_name, deceased_name in snapshots:
    counts = snapshot[["FIPS", "Confirmed", "Deceased"]].rename(
        columns={'Confirmed': confirmed_name, 'Deceased': deceased_name})
    finalDF = finalDF.merge(counts, how='left', on=["FIPS"])

# Drop rows with any missing value (for example counties absent from a
# snapshot) and renumber the index.
finalDF = finalDF.dropna().reset_index(drop=True)
finalDF

In [None]:
# Cases and deaths per 100,000 inhabitants.
finalDF['Conf_100k'] = (finalDF['T_Confirmed'] * 100000 / finalDF['population']).round(2)
finalDF['Dec_100k'] = (finalDF['T_Deceased'] * 100000 / finalDF['population']).round(2)

# Percentage change of the latest totals relative to the two-week-old (BW)
# and month-old (M) snapshots.
finalDF['%_Conf_Inc_BW'] = ((finalDF['T_Confirmed'] - finalDF['Two_Week_Old_Confirmed']) * 100 / finalDF['Two_Week_Old_Confirmed']).round(2)
finalDF['%_Dec_Inc_BW'] = ((finalDF['T_Deceased'] - finalDF['Two_Week_Old_Deceased']) * 100 / finalDF['Two_Week_Old_Deceased']).round(2)

finalDF['%_Conf_Inc_M'] = ((finalDF['T_Confirmed'] - finalDF['Month_Old_Confirmed']) * 100 / finalDF['Month_Old_Confirmed']).round(2)
finalDF['%_Dec_Inc_M'] = ((finalDF['T_Deceased'] - finalDF['Month_Old_Deceased']) * 100 / finalDF['Month_Old_Deceased']).round(2)

# The old snapshot columns were only needed to compute the percentages.
finalDF = finalDF.drop(columns=["Two_Week_Old_Confirmed", "Two_Week_Old_Deceased",
                                "Month_Old_Confirmed", "Month_Old_Deceased"])

# Division by zero produces NaN for 0/0 but +/-inf for a non-zero numerator
# (e.g. a county whose earlier count or population is 0). fillna alone leaves
# the infinities in place, so they are converted to NaN first and then every
# undefined ratio is reported as 0.
finalDF = finalDF.replace([np.inf, -np.inf], np.nan).fillna(0)
finalDF

In [None]:
# Replace the FIPS codes that came from the population data: discard that
# column and left-join the statistics onto us_county by County Name and State,
# so the FIPS column of the result is the one from us_county. Counties with no
# matching statistics end up with NaN and are removed by the final dropna.
# NOTE(review): no visible cell creates a 'Total_Deceased' column, so that
# rename entry looks like a no-op - confirm.
finalDF = (
    us_county
    .merge(finalDF.drop(columns=['FIPS']), how='left', on=["County Name", "State"])
    .rename(columns={'County Name': 'County_Name', 'Total_Deceased': "T_Deceased"})
    .dropna()
)
finalDF

# Visualization

## Choropleths

## Confirmed Cases by County

In [None]:
# Fetch the county boundary GeoJSON from the plotly datasets repository.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.loads(response.read())

# Display names for the hover fields; the keys double as the list of columns
# shown on hover, in this order.
hover_labels = {
    'T_Confirmed': 'Total Confirmed Cases',
    'T_Deceased': 'Total Deceased',
    'Conf_100k': 'Confirmed Cases per 100,000',
    '%_Conf_Inc_BW': '% Increase in Cases Bi-Weekly',
    '%_Conf_Inc_M': '% Increase in Cases Monthly',
}

# County map coloured by total confirmed cases; the colour scale spans 0-1000.
fig = px.choropleth(
    finalDF,
    geojson=counties,
    locations='FIPS',
    color='T_Confirmed',
    color_continuous_scale="Plasma_r",
    range_color=(0, 1000),
    scope="usa",
    hover_name="County_Name",
    hover_data=list(hover_labels),
    labels=hover_labels,
)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

## Deceased Cases by County

In [None]:
# Fetch the county boundary GeoJSON from the plotly datasets repository.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.loads(response.read())

# Display names for the hover fields; the keys double as the list of columns
# shown on hover, in this order.
hover_labels = {
    'T_Confirmed': 'Total Confirmed Cases',
    'T_Deceased': 'Total Deceased',
    'Dec_100k': 'Deceased Cases per 100,000',
    '%_Dec_Inc_BW': '% Increase in Deaths Bi-Weekly',
    '%_Dec_Inc_M': '% Increase in Deaths Monthly',
}

# County map coloured by total deceased; the colour scale spans 0-50.
fig = px.choropleth(
    finalDF,
    geojson=counties,
    locations='FIPS',
    color='T_Deceased',
    color_continuous_scale="Inferno_r",
    range_color=(0, 50),
    scope="usa",
    hover_name="County_Name",
    hover_data=list(hover_labels),
    labels=hover_labels,
)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

# Tree Maps

## Confirmed Cases by US County

In [None]:
# Counties ordered from most to fewest confirmed cases.
confirmed_ranked = (
    finalDF
    .sort_values(by='T_Confirmed', ascending=False)
    .reset_index(drop=True)
)

# One tile per county name, sized by its total confirmed cases.
fig = px.treemap(
    confirmed_ranked,
    path=["County_Name"],
    values="T_Confirmed",
    height=700,
    title='Number of Confirmed Cases',
    color_discrete_sequence=px.colors.qualitative.Dark2,
)
# Show the label and the numeric value inside each tile.
fig.data[0].textinfo = 'label+text+value'
fig.show()

## Deceased Cases by US State and County

In [None]:
# Counties ordered from most to fewest deceased.
deceased_ranked = (
    finalDF
    .sort_values(by='T_Deceased', ascending=False)
    .reset_index(drop=True)
)

# Two-level treemap: states at the top level, their counties nested inside,
# sized by total deceased.
fig = px.treemap(
    deceased_ranked,
    path=["State", "County_Name"],
    values="T_Deceased",
    height=700,
    title='Total Deceased',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
# Show the label and the numeric value inside each tile.
fig.data[0].textinfo = 'label+text+value'
fig.show()