In [2]:
# Import modules
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the raw COVID CSV into a DataFrame
file = 'data/owid-covid-data.csv'
df = pd.read_csv(filepath_or_buffer=file)

In [4]:
# Peek at the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


In [5]:
# List columns: show every column label of the loaded frame (a pandas Index)
df.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [6]:
# Rename 'location' to 'country'.
# The result is rebound to df instead of using inplace=True, so this cell
# produces a new frame rather than mutating the object an earlier cell displayed.
df = df.rename(columns={'location': 'country'})

In [7]:
# Specify columns of interest
columns_of_interest = [
    'country', 'date',
    'new_cases', 'total_cases',
    'new_vaccinations', 'total_vaccinations',
    'new_deaths', 'total_deaths',
]
df = df[columns_of_interest]

# Get rows with United States only.
# .copy() makes US an independent frame: later cells modify US, and modifying
# a bare boolean-mask selection of df can raise pandas' SettingWithCopyWarning.
US = df.loc[df['country'] == 'United States'].copy()


In [8]:
# Sanity check: the filtered frame should hold exactly one distinct country value
US['country'].unique()

array(['United States'], dtype=object)

In [9]:
# Drop country column (US only holds 'United States' rows, so it carries no information).
# Rebind the result instead of inplace=True: US was selected out of df, and an
# in-place drop on such a selection can raise SettingWithCopyWarning.
US = US.drop(columns='country')

In [10]:
# Check data types ('date' is still an object-dtype column at this point)
US.dtypes

date                   object
new_cases             float64
total_cases           float64
new_vaccinations      float64
total_vaccinations    float64
new_deaths            float64
total_deaths          float64
dtype: object

In [11]:
# Convert 'date' to datetime.
# assign() returns a new frame with the column replaced in its original
# position, which avoids assigning a column on a frame that was selected out
# of df (that pattern can raise SettingWithCopyWarning).
US = US.assign(date=pd.to_datetime(US['date']))

In [12]:
# Confirm 'date' is now datetime64[ns] after the conversion
US.dtypes

date                  datetime64[ns]
new_cases                    float64
total_cases                  float64
new_vaccinations             float64
total_vaccinations           float64
new_deaths                   float64
total_deaths                 float64
dtype: object

In [13]:
# Set index to 'date' column, then sort the index (dates) chronologically.
# Written as one chained, rebound expression instead of two inplace=True
# calls, so the cell yields a new frame rather than mutating US step by step.
US = US.set_index('date').sort_index()

In [14]:
# Verify dates are in order.
# The rendered frame is truncated in the middle, so eyeballing it cannot prove
# that the whole index is sorted; assert it, then display the frame.
assert US.index.is_monotonic_increasing, "US date index is not sorted"
US

Unnamed: 0_level_0,new_cases,total_cases,new_vaccinations,total_vaccinations,new_deaths,total_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-22,,1.0,,,,
2020-01-23,0.0,1.0,,,,
2020-01-24,1.0,2.0,,,,
2020-01-25,0.0,2.0,,,,
2020-01-26,3.0,5.0,,,,
...,...,...,...,...,...,...
2022-10-23,8100.0,97203611.0,118883.0,636589699.0,1.0,1067810.0
2022-10-24,37641.0,97241252.0,241983.0,636831682.0,196.0,1068006.0
2022-10-25,41954.0,97283206.0,39875.0,636871557.0,415.0,1068421.0
2022-10-26,69897.0,97353103.0,,,1095.0,1069516.0


In [15]:
# Collect every row that has at least one missing value
null_data = US.loc[US.isna().any(axis='columns')]
null_data

Unnamed: 0_level_0,new_cases,total_cases,new_vaccinations,total_vaccinations,new_deaths,total_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-22,,1.0,,,,
2020-01-23,0.0,1.0,,,,
2020-01-24,1.0,2.0,,,,
2020-01-25,0.0,2.0,,,,
2020-01-26,3.0,5.0,,,,
...,...,...,...,...,...,...
2020-12-13,190625.0,16439296.0,,39445.0,1596.0,302746.0
2022-06-30,167429.0,87692440.0,279928.0,598670692.0,,1017651.0
2022-08-22,129194.0,93677758.0,163417.0,610373762.0,,1041182.0
2022-10-26,69897.0,97353103.0,,,1095.0,1069516.0


## Note
<ul> 
    <li> COVID vaccines did not exist before December 2020. Therefore, the 'new_vaccinations' and 'total_vaccinations' columns show <b> missing values before December 2020 </b> </li>
    <li> The United States <b> might not have kept track </b> of the 'new_cases' or 'new_deaths' when the coronavirus was first discovered; hence, the missing values </li>
    <li> <b> Recent data </b> may not have been entered into the dataset yet </li>
</ul>

In [16]:
# Write the US frame to CSV; the frame's index is written out as the first column
US.to_csv('data/US-covid.csv', index=True)