In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import math

In [55]:
# Notebook display settings: let pandas render up to 500 rows before truncating
pd.options.display.max_rows = 500

In [56]:
# Load the CSV file into a DataFrame. The path is relative, so the file is
# looked up in the notebook's current working directory.
dataset = pd.read_csv('owid-covid-data.csv')

In [57]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
count,120774.0,120770.0,119752.0,109778.0,109931.0,119752.0,120140.0,120136.0,119123.0,109157.0,...,87877.0,86605.0,56763.0,102516.0,121209.0,112889.0,4580.0,4580.0,4580.0,4580.0
mean,1544711.0,6460.637079,6482.477304,38524.23,143.42483,131.157306,19239.874988,85.694527,85.756789,396.336812,...,10.591196,32.746339,50.875129,3.030193,73.258386,0.726279,30724.468472,8.455262,15.6589,755.471243
std,10282660.0,39146.952783,38748.402434,229170.2,782.39257,733.45403,32106.971979,197.875181,167.394861,657.2187,...,10.503276,13.510481,31.823036,2.456243,7.533127,0.150053,86337.136808,15.822375,31.093173,1139.369805
min,1.0,-74347.0,-6223.0,1.0,-1918.0,-232.143,0.001,-3125.829,-272.971,0.001,...,0.1,7.7,1.188,0.1,53.28,0.394,-31959.4,-27.35,-95.92,-1749.128494
25%,2196.0,3.0,9.714,74.0,0.0,0.0,392.156,0.292,1.6025,11.422,...,1.9,21.6,20.859,1.3,67.92,0.602,-186.075,-1.09,-0.99,-46.8954
50%,23754.0,94.0,118.5,653.0,2.0,1.714,3034.0445,10.9725,15.401,76.08,...,6.3,31.4,49.839,2.4,74.62,0.744,1992.15,5.03,6.275,345.233709
75%,233536.0,947.0,997.4645,5698.5,19.0,16.0,24206.24725,83.50775,93.3505,504.273,...,19.3,41.3,83.241,4.0,78.74,0.845,19256.7,13.44,21.7425,1304.780877
max,247140200.0,905975.0,826467.571,5006081.0,18007.0,14703.286,230382.564,8620.69,3385.473,6003.582,...,44.0,78.1,100.0,13.8,86.75,0.957,754457.3,106.83,373.48,6142.92247


In [58]:
# Names of all available variables (columns), as a plain Python list
dataset.columns.tolist()

['iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'new_tests',
 'total_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'total_boosters',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'peo

In [59]:
# Total number of cells (rows x columns): `.size` counts elements, not rows.
# Use `dataset.shape` when the separate row and column counts are needed.
dataset.size

8308105

In [60]:
# Column dtypes as loaded from the CSV: `date` is still `object`, not a datetime
dataset.dtypes


iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
new_cases                                  float64
new_cases_smoothed                         float64
total_deaths                               float64
new_deaths                                 float64
new_deaths_smoothed                        float64
total_cases_per_million                    float64
new_cases_per_million                      float64
new_cases_smoothed_per_million             float64
total_deaths_per_million                   float64
new_deaths_per_million                     float64
new_deaths_smoothed_per_million            float64
reproduction_rate                          float64
icu_patients                               float64
icu_patients_per_million                   float64
hosp_patients                  

In [65]:
# Fix data types (WIP): parse the date column, impute missing numeric values
# with the column mean, and store whole-number float columns as int64.
dataset['date'] = pd.to_datetime(dataset['date'])

# NOTE(review): the mean is taken over the whole column, i.e. across every
# location and date in the file -- confirm this imputation suits the analysis.
for column in dataset.select_dtypes(include='float64').columns:
    # One vectorised fill per column. The previous per-value loop re-ran
    # fillna over the full column for every missing entry it encountered.
    filled = dataset[column].fillna(dataset[column].mean())

    # Decide on the *filled* values so a fractional mean is never silently
    # truncated by the int64 cast. `% 1 == 0` is also False for negative
    # fractions, for NaN (a column with no data has a NaN mean and stays NaN)
    # and for +/-inf, so those columns safely remain float64.
    only_whole_numbers = (filled % 1 == 0).all()

    dataset[column] = filled.astype('int64') if only_whole_numbers else filled



In [66]:
# Column dtypes after the type-fixing cell: `date` is now datetime64[ns]
dataset.dtypes

iso_code                                           object
continent                                          object
location                                           object
date                                       datetime64[ns]
total_cases                                       float64
new_cases                                         float64
new_cases_smoothed                                float64
total_deaths                                      float64
new_deaths                                        float64
new_deaths_smoothed                               float64
total_cases_per_million                           float64
new_cases_per_million                             float64
new_cases_smoothed_per_million                    float64
total_deaths_per_million                          float64
new_deaths_per_million                            float64
new_deaths_smoothed_per_million                   float64
reproduction_rate                                 float64
icu_patients  

In [64]:
# Preview the first 20 rows.
# NOTE(review): this cell's execution count (64) precedes the type-fixing
# cell (65), so the saved output appears to predate that cell and still shows
# NaN values -- re-run the notebook top to bottom to refresh it.
dataset.head(20)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
5,AFG,Asia,Afghanistan,2020-02-29,5.0,0.0,0.714,,,0.0,...,,,37.746,0.5,64.83,0.511,,,,
6,AFG,Asia,Afghanistan,2020-03-01,5.0,0.0,0.714,,,0.0,...,,,37.746,0.5,64.83,0.511,,,,
7,AFG,Asia,Afghanistan,2020-03-02,5.0,0.0,0.0,,,0.0,...,,,37.746,0.5,64.83,0.511,,,,
8,AFG,Asia,Afghanistan,2020-03-03,5.0,0.0,0.0,,,0.0,...,,,37.746,0.5,64.83,0.511,,,,
9,AFG,Asia,Afghanistan,2020-03-04,5.0,0.0,0.0,,,0.0,...,,,37.746,0.5,64.83,0.511,,,,
