In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

## Investigate data

In [2]:
# Load the raw observations from the CSV file in the working directory.
df = pd.read_csv("covid_19_data.csv")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

## Data cleaning

In [3]:
# Standardise the raw frame: short snake_case column names, serial number as
# the index, integer counters, trimmed and normalised labels, no duplicate
# rows, and datetime values in the two date columns.

# rename columns for convenience
# (the assignment requires the CSV to have exactly eight columns; their order
# is assumed to match this list -- TODO confirm against df.head())
df.columns = ['serial', 'obsv_date', 'province_state', 'country_region', 'last_update', 'confirmed', 'deaths', 'recovered']

# set serial number as index
df = df.set_index('serial')

# convert floats to integers
# Cast by column name with astype(). Assigning the converted block back through
# df.iloc[:, -3:] writes into the existing float columns on pandas >= 2.0 and
# leaves their dtype unchanged, so the counters would silently stay floats.
df = df.astype({'confirmed': int, 'deaths': int, 'recovered': int})

# strip whitespace from every text (object-dtype) column
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# rename values so that spelling variants of one place share a single label
# (" Azerbaijan" is normally already trimmed by the strip step above; the key
# is kept as a harmless fallback)
df = df.replace({'country_region': {"Bahamas, The": "Bahamas", 
                                    "The Bahamas": "Bahamas",
                                    "(\'St. Martin\',)": "St. Martin",
                                    "UK": "United Kingdom",
                                    "US": "United States", 
                                    " Azerbaijan": "Azerbaijan",
                                    "Gambia, The": "Gambia",
                                    "occupied Palestinian territory": "Occupied Palestinian Territory"},
                 'province_state': {"Grand Princess Cruise Ship": "Grand Princess"}})

# remove duplicates (rows identical in every column)
df = df.drop_duplicates()

# remove non country/regions
df = df[df['country_region'] != 'MS Zaandam']

# convert observation_date and last_update to datetime format
df['obsv_date'] = pd.to_datetime(df['obsv_date'])
df['last_update'] = pd.to_datetime(df['last_update'])

In [None]:
# keep a row only if at least one of its three counters is non-zero
df = df[(df['confirmed'] != 0) | (df['deaths'] != 0) | (df['recovered'] != 0)]

In [None]:
# sanity check: list every row in which any counter is negative
df[(df[['confirmed', 'deaths', 'recovered']] < 0).any(axis=1)]

In [4]:
# Show every Colombia row whose province is recorded as "Unknown";
# to_string() renders the whole selection as plain text.
print(
    df.loc[
        (df['country_region'] == 'Colombia') & (df['province_state'] == "Unknown")
    ].to_string()
)

        obsv_date province_state country_region         last_update  confirmed  deaths  recovered
serial                                                                                           
33583  2020-06-01        Unknown       Colombia 2021-04-02 15:13:53       1087       0          0
34250  2020-06-02        Unknown       Colombia 2021-04-02 15:13:53         50       0          0
34918  2020-06-03        Unknown       Colombia 2021-04-02 15:13:53         51       0          0
35586  2020-06-04        Unknown       Colombia 2021-04-02 15:13:53         56       0          0
36268  2020-06-05        Unknown       Colombia 2021-04-02 15:13:53         62       0          0
36956  2020-06-06        Unknown       Colombia 2021-04-02 15:13:53         62       0          0
37644  2020-06-07        Unknown       Colombia 2021-04-02 15:13:53         61       0          0
38332  2020-06-08        Unknown       Colombia 2021-04-02 15:13:53         64       0          0
39020  2020-06-09   

In [None]:
# discard the Colombia rows whose province is "Unknown": a row survives when
# it is not Colombia, or when its province is anything other than "Unknown"
df = df[(df['country_region'] != 'Colombia') | (df['province_state'] != "Unknown")]

In [None]:
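# ISSUES: scratch attempt at a zero-value filter. `~` binds tighter than `&`, so
# the expression below negates only the `confirmed` test, not the whole condition.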
# df_new = df[~(df['confirmed'] != 0) & (df['deaths'] != 0) & (df['recovered'] != 0)]
# df_new

In [5]:
# Total the three counters per country and observation date (collapsing the
# province/state rows).
# The columns are selected explicitly before .sum(): the first positional
# argument of GroupBy.sum is the `numeric_only` flag, so passing the column
# list there only worked because a non-empty list is truthy.
temp = df.groupby(['country_region', 'obsv_date'])[['confirmed', 'deaths', 'recovered']].sum()
temp = temp.sort_index(level = ['country_region', 'obsv_date'])

In [6]:
# Load the second dataset (presumably exported from a JSON source, going by
# the file name) and parse its Date column into datetimes.
# NOTE: the variable name `json` is kept because the aggregation cell reads it;
# it would shadow the standard-library module if that were ever imported.
json = pd.read_csv("covid_data_from_json.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

In [7]:
# Total the three counters per country and date.
# The columns are selected explicitly before .sum(): the first positional
# argument of GroupBy.sum is the `numeric_only` flag, so passing the column
# list there only worked because a non-empty list is truthy. The saved output
# of this aggregate shows exactly these three columns.
json_temp = json.groupby(['Country', 'Date'])[['Confirmed', 'Deaths', 'Recovered']].sum()
json_temp = json_temp.sort_index(level = ['Country', 'Date'])

In [17]:
# Per-date totals for India from json_temp, 2021-05-19 through 2021-05-29.
json_temp.loc[('India', '2021-05-19'):('India','2021-05-29')]

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered
Country,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
India,2021-05-19,25772440,287122,22355440
India,2021-05-20,26031991,291331,22712735
India,2021-05-21,26289290,295525,23070365
India,2021-05-22,26530132,299266,23425467
India,2021-05-23,26752447,303720,23728011
India,2021-05-24,26948874,307231,24054861
India,2021-05-25,27157795,311388,24350816
India,2021-05-26,27369093,315235,24633951
India,2021-05-27,27555457,318895,24893410
India,2021-05-28,27729247,322512,25178011


In [16]:
# Same India window taken from temp (the covid_19_data.csv aggregate), for
# comparison with the json_temp figures.
temp.loc[('India', '2021-05-19'):('India','2021-05-29')]

Unnamed: 0_level_0,Unnamed: 1_level_0,confirmed,deaths,recovered
country_region,obsv_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
India,2021-05-19,25772440,287122,22355440
India,2021-05-20,26031991,291331,22712735
India,2021-05-21,26289290,295525,23070365
India,2021-05-22,26530132,299266,23425467
India,2021-05-23,26752447,303720,23728011
India,2021-05-24,26948874,307231,24054861
India,2021-05-25,27157795,311388,24350816
India,2021-05-26,27369093,315235,24633951
India,2021-05-27,27555457,318895,24893410
India,2021-05-28,27729247,322512,25178011


In [12]:
# Per-date totals for Colombia from json_temp, 2020-10-20 through 2020-11-02.
json_temp.loc[('Colombia', '2020-10-20'):('Colombia','2020-11-02')]

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered
Country,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Colombia,2020-10-20,974139,29272,876731
Colombia,2020-10-21,981700,29464,884895
Colombia,2020-10-22,990373,29637,893810
Colombia,2020-10-23,998942,29802,901652
Colombia,2020-10-24,1007711,30000,907379
Colombia,2020-10-25,1015885,30154,907379
Colombia,2020-10-26,1025052,30348,924044
Colombia,2020-10-27,1033218,30565,932882
Colombia,2020-10-28,1041935,30753,941874
Colombia,2020-10-29,1053122,30926,950348


In [13]:
# Same Colombia window taken from temp, for comparison with json_temp.
# NOTE(review): in the saved outputs the two sources disagree on 2020-10-25
# (deaths) and on 2020-10-28 and 2020-10-29 (confirmed) -- worth investigating.
temp.loc[('Colombia', '2020-10-20'):('Colombia','2020-11-02')]

Unnamed: 0_level_0,Unnamed: 1_level_0,confirmed,deaths,recovered
country_region,obsv_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Colombia,2020-10-20,974139,29272,876731
Colombia,2020-10-21,981700,29464,884895
Colombia,2020-10-22,990373,29637,893810
Colombia,2020-10-23,998942,29802,901652
Colombia,2020-10-24,1007711,30000,907379
Colombia,2020-10-25,1015885,30000,907379
Colombia,2020-10-26,1025052,30348,924044
Colombia,2020-10-27,1033218,30565,932882
Colombia,2020-10-28,1041936,30753,941874
Colombia,2020-10-29,1048055,30926,950348
