## Implementation

In [1]:
# Installation of packages
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9681 sha256=a799f713c8675d7e6e9f7def0d20bf3166c6b3ab5a21dab768959f096d293a6e
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
# Imports of libraries
from plotly.subplots import make_subplots
import pandas as pd 
import plotly.express as px
import numpy as np
import warnings
import folium
import wget
import os

warnings.filterwarnings('ignore')

In [3]:
! rmdir Images
# We will store all our generated graphs in this folder
if 'images' not in os.listdir():
    os.mkdir('images')

rmdir: failed to remove 'Images': No such file or directory


In [4]:
# Remove old csv data files
! rm *.csv

# Download latest data files from John Hopkins datasets
urls = ["https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"]
for url in urls:
    wget.download(url)

rm: cannot remove '*.csv': No such file or directory


In [5]:
# Create dataframes from the files
confirmed_wide = pd.read_csv("./time_series_covid19_confirmed_global.csv")
deceased_wide = pd.read_csv("./time_series_covid19_deaths_global.csv")
recovered_wide = pd.read_csv("time_series_covid19_recovered_global.csv")

In [6]:
confirmed_wide

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/7/20,6/8/20,6/9/20,6/10/20,6/11/20,6/12/20,6/13/20,6/14/20,6/15/20,6/16/20
0,,Afghanistan,33.000000,65.000000,0,0,0,0,0,0,...,20342,20917,21459,22142,22890,23546,24102,24766,25527,26310
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,1246,1263,1299,1341,1385,1416,1464,1521,1590,1672
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,10154,10265,10382,10484,10589,10698,10810,10919,11031,11147
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,852,852,852,852,852,853,853,853,853,854
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,91,92,96,113,118,130,138,140,142,148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,,Sao Tome and Principe,0.186360,6.613081,0,0,0,0,0,0,...,513,513,514,611,632,639,659,661,662,671
262,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,484,496,524,560,591,632,705,728,844,885
263,,Comoros,-11.645500,43.333300,0,0,0,0,0,0,...,141,141,141,162,162,163,176,176,176,197
264,,Tajikistan,38.861034,71.276093,0,0,0,0,0,0,...,4529,4609,4690,4763,4834,4902,4971,5035,5097,5160


## Data Cleaning

In [7]:
# Reshaping dataframe from wide to high in dimensions
confirmedDF = pd.melt(confirmed_wide, id_vars=["Province/State", "Country/Region", "Lat", "Long"],
                           var_name="Date", value_name="Confirmed")
deceasedDF = pd.melt(deceased_wide, id_vars=["Province/State", "Country/Region", "Lat", "Long"],
                           var_name="Date", value_name="Deceased")
recoveredDF = pd.melt(recovered_wide, id_vars=["Province/State", "Country/Region", "Lat", "Long"],
                           var_name="Date", value_name="Recovered")

print("confirmedDF Shape: ", confirmedDF.shape)
print("deceasedDF Shape: ", deceasedDF.shape)
print("recoveredDF Shape: ", recoveredDF.shape)
confirmedDF.head()

confirmedDF Shape:  (39102, 6)
deceasedDF Shape:  (39102, 6)
recoveredDF Shape:  (37191, 6)


Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed
0,,Afghanistan,33.0,65.0,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [8]:
# Merging all the dataframes into one
totalDF = pd.merge(left=confirmedDF, right=deceasedDF, how='outer', 
                   on=["Province/State", "Country/Region", "Date", "Lat", "Long"])
totalDF = pd.merge(left=totalDF, right=recoveredDF, on=["Province/State", "Country/Region", "Date", "Lat", "Long"],
                  how='outer')
totalDF

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deceased,Recovered
0,,Afghanistan,33.0000,65.0000,1/22/20,0.0,0.0,0.0
1,,Albania,41.1533,20.1683,1/22/20,0.0,0.0,0.0
2,,Algeria,28.0339,1.6596,1/22/20,0.0,0.0,0.0
3,,Andorra,42.5063,1.5218,1/22/20,0.0,0.0,0.0
4,,Angola,-11.2027,17.8739,1/22/20,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
39685,,Timor-Leste,-8.8742,125.7275,6/15/20,,,24.0
39686,,Canada,56.1304,-106.3468,6/16/20,,,61899.0
39687,,Mozambique,-18.6657,35.5296,6/16/20,,,160.0
39688,,Syria,34.8021,38.9968,6/16/20,,,78.0


In [9]:
# Count total NaN values
print("Before NaN removal:")
print(totalDF.isna().sum())
print()

# Removing all NaN values
totalDF["Confirmed"] = totalDF["Confirmed"].fillna(0)
totalDF["Deceased"] = totalDF["Deceased"].fillna(0)
totalDF["Recovered"] = totalDF["Recovered"].fillna(0)
totalDF.isna().sum()

print("After NaN removal:")
print(totalDF.isna().sum())

Before NaN removal:
Province/State    27783
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed           588
Deceased            588
Recovered          2499
dtype: int64

After NaN removal:
Province/State    27783
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deceased              0
Recovered             0
dtype: int64


In [10]:
# Print all Countries 
totalDF["Country/Region"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Diamond Princess', 'Cuba', 'Cyprus',
       'Czechia', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador',
       'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala',
       'Guinea', 'Guyana', 'Haiti', 'Holy See', 'Honduras', 'Hungary',
       'Iceland', 'India

In [11]:
# Converting the Date column into proper datetime formate and sort
totalDF.Date = pd.to_datetime(totalDF.Date)
totalDF.sort_values(by=["Date"], inplace=True)
totalDF

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deceased,Recovered
0,,Afghanistan,33.0000,65.0000,2020-01-22,0.0,0.0,0.0
171,,Nicaragua,12.8654,-85.2072,2020-01-22,0.0,0.0,0.0
172,,Niger,17.6078,8.0817,2020-01-22,0.0,0.0,0.0
173,,Nigeria,9.0820,8.6753,2020-01-22,0.0,0.0,0.0
174,,North Macedonia,41.6086,21.7453,2020-01-22,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
39006,,New Zealand,-40.9006,174.8860,2020-06-16,1506.0,22.0,1482.0
39005,,Netherlands,52.1326,5.2913,2020-06-16,49087.0,6070.0,0.0
39004,Sint Maarten,Netherlands,18.0425,-63.0548,2020-06-16,77.0,15.0,62.0
39018,,Philippines,13.0000,122.0000,2020-06-16,26781.0,1103.0,6552.0


In [12]:
# Create a dataframe for frequency of cases based on Recorvered, Confirmed and Deceased
date_groupedDF = totalDF.groupby('Date')['Recovered', 'Confirmed', 'Deceased'].sum().reset_index()
date_groupedDF

Unnamed: 0,Date,Recovered,Confirmed,Deceased
0,2020-01-22,28.0,555.0,17.0
1,2020-01-23,30.0,654.0,18.0
2,2020-01-24,36.0,941.0,26.0
3,2020-01-25,39.0,1434.0,42.0
4,2020-01-26,52.0,2118.0,56.0
...,...,...,...,...
142,2020-06-12,3620412.0,7644260.0,425780.0
143,2020-06-13,3706353.0,7778881.0,430047.0
144,2020-06-14,3777131.0,7912426.0,433391.0
145,2020-06-15,3857338.0,8034461.0,436899.0


# Visualization

In [13]:
# Color pallete
Recovered, Confirmed, Deceased= '#28a745', '#007bff', '#ff073a'

### Bar Charts

In [14]:
# Creating columns to indicate case type and frequency based on the Date
date_countDF = date_groupedDF.melt(id_vars=['Date'], var_name='Case Type', value_name='Frequency')

fig = px.area(date_countDF, x='Date', y='Frequency', title='Cases Over Time Slider', 
              color='Case Type', color_discrete_sequence=[Recovered, Confirmed, Deceased])
fig.update_layout(xaxis_rangeslider_visible=True)

fig.show()


In [15]:
# Generating bar graphs
fig1 = px.bar(date_groupedDF, x="Date", y="Confirmed", color_discrete_sequence=[Confirmed])
fig2 = px.bar(date_groupedDF, x="Date", y="Deceased", color_discrete_sequence=[Deceased])
fig3 = px.bar(date_groupedDF, x="Date", y="Recovered", color_discrete_sequence=[Recovered])

fig = make_subplots(rows=2, cols=2,shared_xaxes=False, horizontal_spacing=0.1, vertical_spacing=0.1,
                   subplot_titles=("Confirmed Cases", "Deceased Cases", "Recovered Cases"))

fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig3['data'][0], row=2, col=1)

fig.update_layout(height=700, title='Day Wise Cases')
fig.show()


In [16]:
# Logarithm graphs
fig1 = px.bar(date_groupedDF, x='Date', y='Confirmed', color_discrete_sequence=[Confirmed])
fig2 = px.bar(date_groupedDF, x='Date', y='Deceased', color_discrete_sequence=[Deceased])
fig3 = px.bar(date_groupedDF, x='Date', y='Recovered', color_discrete_sequence=[Recovered])

fig = make_subplots(rows=2, cols=2, shared_xaxes=False, horizontal_spacing=0.1, 
                    subplot_titles=("Confirmed Cases", "Deceased Cases", "Recovered Cases"))

fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig3['data'][0], row=2, col=1)

fig.update_layout(height=800, yaxis_type='log', yaxis2_type='log', yaxis3_type='log', title='Day Wise Cases(Log Scale)')
fig.show()


## Maps

In [17]:
# retrieving only latest date data from the dataframe
temp = totalDF[totalDF['Date'] == max(totalDF['Date'])]
_map = folium.Map(location=[0,0], tiles='cartodbpositron',
                 min_zoom=1, max_zoon=4, zoom_start=1.5)

for i in range(len(temp)):
    folium.Circle(
            location=[temp.iloc[i]['Lat'], temp.iloc[i]['Long']],
            color=Confirmed, fill='crimson',
            tooltip =   '<li> Country: ' + str(temp.iloc[i]['Country/Region']) +
                        '<li> Confirmed: ' + str(temp.iloc[i]['Confirmed']) +
                        '<li> Deceased: ' + str(temp.iloc[i]['Deceased']) +
                        '<li> Recovered: ' + str(temp.iloc[i]['Recovered']),
            radius=int(temp.iloc[i]['Confirmed'])).add_to(_map)
_map

## Choropleths

In [18]:
fig = px.choropleth(totalDF, locations="Country/Region", locationmode='country names', color=np.log(totalDF["Confirmed"]), 
                    hover_name="Country/Region",
                    title='Confirmed Cases', color_continuous_scale=px.colors.sequential.Blues)
fig.update(layout_coloraxis_showscale=False)
fig.show()