# Yet Another COVID-19 Dashbord (data wrangling)

Yet another COVID-19 dashboard built by yet another data science enthousiast.   
I made this so I can look at the data the way I like.  
Please feel free to fork, modify and distribute.  

### Imports

In [74]:
import pandas as pd
import numpy as np

### Get the data

In [75]:
confirmed_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
dead_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
recovered_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

In [76]:
def get_and_set_data(url, val_name):
    return pd.read_csv(
        url, 
        error_bad_lines=False
    ).rename(
        columns={
            "Province/State": "province_state",
            "Country/Region": "country_region",
            "Lat": "lat",
            "Long": "long",
        }
    ).melt(
        id_vars=["province_state", "country_region", "lat", "long"],
        var_name='date', 
        value_name=val_name,
    )
    

In [77]:
df_confirmed = get_and_set_data(url=confirmed_url,val_name="confirmed")
df_confirmed.head()

Unnamed: 0,province_state,country_region,lat,long,date,confirmed
0,,Afghanistan,33.0,65.0,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [78]:
df_confirmed.describe()

Unnamed: 0,lat,long,confirmed
count,18176.0,18176.0,18176.0
mean,21.823943,23.158832,634.737016
std,24.437152,70.983731,5792.747349
min,-41.4545,-135.0,0.0
25%,7.844875,-16.237775,0.0
50%,23.6925,20.921188,0.0
75%,41.2272,84.497525,38.0
max,71.7069,178.065,213372.0


In [79]:
df_confirmed.confirmed.isna().sum()

0

In [80]:
df_dead = get_and_set_data(url=dead_url,val_name="dead")
df_dead.head()

Unnamed: 0,province_state,country_region,lat,long,date,dead
0,,Afghanistan,33.0,65.0,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [81]:
df_dead.describe()

Unnamed: 0,lat,long,dead
count,18176.0,18176.0,18176.0
mean,21.823943,23.158832,26.452795
std,24.437152,70.983731,336.836453
min,-41.4545,-135.0,0.0
25%,7.844875,-16.237775,0.0
50%,23.6925,20.921188,0.0
75%,41.2272,84.497525,0.0
max,71.7069,178.065,13155.0


In [82]:
df_dead.dead.isna().sum()

0

In [83]:
df_recovered = get_and_set_data(url=recovered_url,val_name="recovered")
df_recovered.head()

Unnamed: 0,province_state,country_region,lat,long,date,recovered
0,,Afghanistan,33.0,65.0,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [84]:
df_recovered.describe()

Unnamed: 0,lat,long,recovered
count,17182.0,17182.0,17182.0
mean,20.542559,29.173319,190.950064
std,24.092554,67.583039,2444.025765
min,-41.4545,-106.3468,0.0
25%,7.0,-7.0926,0.0
50%,22.08335,23.4094,0.0
75%,39.549,90.3563,2.0
max,71.7069,178.065,63326.0


In [85]:
df_recovered.recovered.isna().sum()

0

In [86]:
df_recovered.tail()

Unnamed: 0,province_state,country_region,lat,long,date,recovered
17177,Turks and Caicos Islands,United Kingdom,21.694,-71.7979,4/1/20,0
17178,,MS Zaandam,0.0,0.0,4/1/20,0
17179,,Botswana,-22.3285,24.6849,4/1/20,0
17180,,Burundi,-3.3731,29.9189,4/1/20,0
17181,,Sierra Leone,8.460555,-11.779889,4/1/20,0


In [107]:
df_recovered[df_recovered.date == '2020-03-20']

Unnamed: 0,province_state,country_region,lat,long,date,recovered


In [88]:
df_merged = df_confirmed.join(
    df_dead["dead"]
).join(
    df_recovered["recovered"]
).sort_values(
    ['country_region', 'date']
).reset_index(

).drop(
    columns=['index']
)
df_merged.head()

Unnamed: 0,province_state,country_region,lat,long,date,confirmed,dead,recovered
0,,Afghanistan,33.0,65.0,1/22/20,0,0,0.0
1,,Afghanistan,33.0,65.0,1/23/20,0,0,0.0
2,,Afghanistan,33.0,65.0,1/24/20,0,0,0.0
3,,Afghanistan,33.0,65.0,1/25/20,0,0,0.0
4,,Afghanistan,33.0,65.0,1/26/20,0,0,1.0


In [89]:
df_merged.describe()

Unnamed: 0,lat,long,confirmed,dead,recovered
count,18176.0,18176.0,18176.0,18176.0,17182.0
mean,21.823943,23.158832,634.737016,26.452795,190.950064
std,24.437152,70.983731,5792.747349,336.836453,2444.025765
min,-41.4545,-135.0,0.0,0.0,0.0
25%,7.844875,-16.237775,0.0,0.0,0.0
50%,23.6925,20.921188,0.0,0.0,0.0
75%,41.2272,84.497525,38.0,0.0,2.0
max,71.7069,178.065,213372.0,13155.0,63326.0


In [90]:
df_merged.isna().sum()

province_state    12567
country_region        0
lat                   0
long                  0
date                  0
confirmed             0
dead                  0
recovered           994
dtype: int64

In [91]:
df_merged.dtypes

province_state     object
country_region     object
lat               float64
long              float64
date               object
confirmed           int64
dead                int64
recovered         float64
dtype: object

In [92]:
df_merged.date = pd.to_datetime(df_merged.date)

In [93]:
df_merged.dtypes

province_state            object
country_region            object
lat                      float64
long                     float64
date              datetime64[ns]
confirmed                  int64
dead                       int64
recovered                float64
dtype: object

In [94]:
df_merged.head(10)

Unnamed: 0,province_state,country_region,lat,long,date,confirmed,dead,recovered
0,,Afghanistan,33.0,65.0,2020-01-22,0,0,0.0
1,,Afghanistan,33.0,65.0,2020-01-23,0,0,0.0
2,,Afghanistan,33.0,65.0,2020-01-24,0,0,0.0
3,,Afghanistan,33.0,65.0,2020-01-25,0,0,0.0
4,,Afghanistan,33.0,65.0,2020-01-26,0,0,1.0
5,,Afghanistan,33.0,65.0,2020-01-27,0,0,0.0
6,,Afghanistan,33.0,65.0,2020-01-28,0,0,0.0
7,,Afghanistan,33.0,65.0,2020-01-29,0,0,0.0
8,,Afghanistan,33.0,65.0,2020-01-30,0,0,0.0
9,,Afghanistan,33.0,65.0,2020-01-31,0,0,0.0


Since there is no world entries we create them

In [95]:
len(df_merged.country_region.unique())

180

In [96]:
df_world = df_merged.drop(
    columns=['province_state', 'country_region', 'lat', 'long']
).set_index(
    'date'
).resample(
    'd'
).sum(
    min_count=180
).reset_index(

).assign(
    country_region='World',
    lat=np.nan,
    long=np.nan    
)
df_world

Unnamed: 0,date,confirmed,dead,recovered,country_region,lat,long
0,2020-01-22,555,17,28.0,World,,
1,2020-01-23,654,18,30.0,World,,
2,2020-01-24,941,26,38.0,World,,
3,2020-01-25,1434,42,83.0,World,,
4,2020-01-26,2118,56,63.0,World,,
...,...,...,...,...,...,...,...
66,2020-03-28,660706,30652,213156.0,World,,
67,2020-03-29,720117,33925,,World,,
68,2020-03-30,782365,37582,,World,,
69,2020-03-31,857487,42107,,World,,


In [97]:
df_world.dtypes

date              datetime64[ns]
confirmed                  int64
dead                       int64
recovered                float64
country_region            object
lat                      float64
long                     float64
dtype: object

In [98]:
df = pd.concat((df_merged, df_world)).sort_values(['country_region', 'date'])

df.recovered = df.recovered.fillna(-1)
df.recovered = pd.to_numeric(df.recovered, downcast="integer")
df.recovered = df.recovered.replace("-1", np.nan)

df.head()

Unnamed: 0,province_state,country_region,lat,long,date,confirmed,dead,recovered
0,,Afghanistan,33.0,65.0,2020-01-22,0,0,0
1,,Afghanistan,33.0,65.0,2020-01-23,0,0,0
2,,Afghanistan,33.0,65.0,2020-01-24,0,0,0
3,,Afghanistan,33.0,65.0,2020-01-25,0,0,0
4,,Afghanistan,33.0,65.0,2020-01-26,0,0,1


In [99]:
df.tail()

Unnamed: 0,province_state,country_region,lat,long,date,confirmed,dead,recovered
18164,,Zimbabwe,-20.0,30.0,2020-03-28,7,1,0
18165,,Zimbabwe,-20.0,30.0,2020-03-29,7,1,-1
18167,,Zimbabwe,-20.0,30.0,2020-03-30,7,1,-1
18168,,Zimbabwe,-20.0,30.0,2020-03-31,8,1,-1
18175,,Zimbabwe,-20.0,30.0,2020-04-01,8,1,-1


In [100]:
df.describe()

Unnamed: 0,lat,long,confirmed,dead,recovered
count,18176.0,18176.0,18247.0,18247.0,18247.0
mean,21.823943,23.158832,1264.534444,52.699731,359.048611
std,24.437152,70.983731,17688.383367,845.400905,4909.787328
min,-41.4545,-135.0,0.0,0.0,-1.0
25%,7.844875,-16.237775,0.0,0.0,0.0
50%,23.6925,20.921188,0.0,0.0,0.0
75%,41.2272,84.497525,40.0,0.0,1.0
max,71.7069,178.065,932605.0,46809.0,213156.0


In [101]:
import plotly.express as px

In [102]:
fig = px.scatter(
    data_frame=df[df.country_region.isin(["World", "Portugal", "Italy", "Spain", "France"])],
    x="date",
    y="confirmed",
    facet_row="country_region",
)
fig.layout.yaxis2.update(matches=None)
fig

In [103]:
df.date.dt.date.min()

datetime.date(2020, 1, 22)

In [104]:
from datetime import datetime as dt
from datetime import timedelta

dt.utcfromtimestamp(0).date() + timedelta(days=18283)

datetime.date(2020, 1, 22)

In [109]:
df[df.date == dt.strftime(
    dt.utcfromtimestamp(0).date() + timedelta(days=18283),
    "%Y-%m-%d"
    )
]

Unnamed: 0,province_state,country_region,lat,long,date,confirmed,dead,recovered
0,,Afghanistan,33.0000,65.0000,2020-01-22,0,0,0
71,,Albania,41.1533,20.1683,2020-01-22,0,0,0
142,,Algeria,28.0339,1.6596,2020-01-22,0,0,0
213,,Andorra,42.5063,1.5218,2020-01-22,0,0,0
284,,Angola,-11.2027,17.8739,2020-01-22,0,0,0
...,...,...,...,...,...,...,...,...
17892,,Vietnam,16.0000,108.0000,2020-01-22,0,0,0
17963,,West Bank and Gaza,31.9522,35.2332,2020-01-22,0,0,0
0,,World,,,2020-01-22,555,17,28
18034,,Zambia,-15.4167,28.2833,2020-01-22,0,0,0


In [106]:
df.date.unique()

array(['2020-01-22T00:00:00.000000000', '2020-01-23T00:00:00.000000000',
       '2020-01-24T00:00:00.000000000', '2020-01-25T00:00:00.000000000',
       '2020-01-26T00:00:00.000000000', '2020-01-27T00:00:00.000000000',
       '2020-01-28T00:00:00.000000000', '2020-01-29T00:00:00.000000000',
       '2020-01-30T00:00:00.000000000', '2020-01-31T00:00:00.000000000',
       '2020-02-01T00:00:00.000000000', '2020-02-02T00:00:00.000000000',
       '2020-02-03T00:00:00.000000000', '2020-02-04T00:00:00.000000000',
       '2020-02-05T00:00:00.000000000', '2020-02-06T00:00:00.000000000',
       '2020-02-07T00:00:00.000000000', '2020-02-08T00:00:00.000000000',
       '2020-02-09T00:00:00.000000000', '2020-02-10T00:00:00.000000000',
       '2020-02-11T00:00:00.000000000', '2020-02-12T00:00:00.000000000',
       '2020-02-13T00:00:00.000000000', '2020-02-14T00:00:00.000000000',
       '2020-02-15T00:00:00.000000000', '2020-02-16T00:00:00.000000000',
       '2020-02-17T00:00:00.000000000', '2020-02-18

In [108]:
df[df.date == "2020-01-22"]

Unnamed: 0,province_state,country_region,lat,long,date,confirmed,dead,recovered
0,,Afghanistan,33.0000,65.0000,2020-01-22,0,0,0
71,,Albania,41.1533,20.1683,2020-01-22,0,0,0
142,,Algeria,28.0339,1.6596,2020-01-22,0,0,0
213,,Andorra,42.5063,1.5218,2020-01-22,0,0,0
284,,Angola,-11.2027,17.8739,2020-01-22,0,0,0
...,...,...,...,...,...,...,...,...
17892,,Vietnam,16.0000,108.0000,2020-01-22,0,0,0
17963,,West Bank and Gaza,31.9522,35.2332,2020-01-22,0,0,0
0,,World,,,2020-01-22,555,17,28
18034,,Zambia,-15.4167,28.2833,2020-01-22,0,0,0
