## World Covid data set from January 2020 to August 2022

The dataset was downloaded from [Our World in Data](https://ourworldindata.org/covid-deaths).

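This notebook splits the dataset into two sections and saves each one as its own CSV file:
* death info (cases, deaths, reproduction rate, ICU and hospital figures)
* vaccinations info (testing, vaccinations, demographics and excess mortality)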

In [1]:
# import packages 

import pandas as pd
import numpy as np

# load the full CSV export into a single dataframe
df = pd.read_csv('covid-data.csv')

# report the (rows, columns) shape first, then the dtype inferred for each column
print(df.shape, df.dtypes, sep='\n')

(208798, 67)
iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
human_development_index                    float64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object


In [2]:
# keep only the columns whose dtype is not numeric ...
df_non_numeric = df.select_dtypes(exclude=np.number)

# ... and list their names
non_numeric_cols = df_non_numeric.columns.values
print(non_numeric_cols)

['iso_code' 'continent' 'location' 'date' 'tests_units']


In [3]:
# keep only the columns whose dtype is numeric ...
df_numeric = df.select_dtypes(include=np.number)

# ... and list their names
numeric_cols = df_numeric.columns.values
print(numeric_cols)


['total_cases' 'new_cases' 'new_cases_smoothed' 'total_deaths'
 'new_deaths' 'new_deaths_smoothed' 'total_cases_per_million'
 'new_cases_per_million' 'new_cases_smoothed_per_million'
 'total_deaths_per_million' 'new_deaths_per_million'
 'new_deaths_smoothed_per_million' 'reproduction_rate' 'icu_patients'
 'icu_patients_per_million' 'hosp_patients' 'hosp_patients_per_million'
 'weekly_icu_admissions' 'weekly_icu_admissions_per_million'
 'weekly_hosp_admissions' 'weekly_hosp_admissions_per_million'
 'total_tests' 'new_tests' 'total_tests_per_thousand'
 'new_tests_per_thousand' 'new_tests_smoothed'
 'new_tests_smoothed_per_thousand' 'positive_rate' 'tests_per_case'
 'total_vaccinations' 'people_vaccinated' 'people_fully_vaccinated'
 'total_boosters' 'new_vaccinations' 'new_vaccinations_smoothed'
 'total_vaccinations_per_hundred' 'people_vaccinated_per_hundred'
 'people_fully_vaccinated_per_hundred' 'total_boosters_per_hundred'
 'new_vaccinations_smoothed_per_million' 'new_people_vaccinate

In [4]:
# Build the death-focused dataframe: identifier and population columns plus
# the case, death, reproduction-rate, ICU and hospital columns of df.

column_names = [
    # identifiers and population
    'iso_code', 'continent', 'location', 'date',
    'population',
    # absolute case and death counts
    'total_cases', 'new_cases', 'new_cases_smoothed',
    'total_deaths', 'new_deaths', 'new_deaths_smoothed',
    # per-million variants of the case and death counts
    'total_cases_per_million',
    'new_cases_per_million', 'new_cases_smoothed_per_million',
    'total_deaths_per_million',
    'new_deaths_per_million', 'new_deaths_smoothed_per_million',
    # reproduction rate
    'reproduction_rate',
    # ICU and hospital occupancy
    'icu_patients', 'icu_patients_per_million',
    'hosp_patients', 'hosp_patients_per_million',
    # weekly ICU and hospital admissions
    'weekly_icu_admissions', 'weekly_icu_admissions_per_million',
    'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million',
]

# every row, only the columns listed above (in that order)
df_death = df.loc[:, column_names]

In [5]:
# preview the first five rows of the death dataframe
print(df_death.head())

  iso_code continent     location        date  population  total_cases  \
0      AFG      Asia  Afghanistan  2020-02-24  40099462.0          5.0   
1      AFG      Asia  Afghanistan  2020-02-25  40099462.0          5.0   
2      AFG      Asia  Afghanistan  2020-02-26  40099462.0          5.0   
3      AFG      Asia  Afghanistan  2020-02-27  40099462.0          5.0   
4      AFG      Asia  Afghanistan  2020-02-28  40099462.0          5.0   

   new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  \
0        5.0                 NaN           NaN         NaN  ...   
1        0.0                 NaN           NaN         NaN  ...   
2        0.0                 NaN           NaN         NaN  ...   
3        0.0                 NaN           NaN         NaN  ...   
4        0.0                 NaN           NaN         NaN  ...   

   new_deaths_smoothed_per_million  reproduction_rate  icu_patients  \
0                              NaN                NaN           NaN   
1         

In [6]:
# sanity check: confirm the column selection produced a DataFrame
print(type(df_death))

<class 'pandas.core.frame.DataFrame'>


In [7]:
# report the (rows, columns) shape of the death dataframe, then each column's dtype
print(df_death.shape, df_death.dtypes, sep='\n')

(208798, 26)
iso_code                               object
continent                              object
location                               object
date                                   object
population                            float64
total_cases                           float64
new_cases                             float64
new_cases_smoothed                    float64
total_deaths                          float64
new_deaths                            float64
new_deaths_smoothed                   float64
total_cases_per_million               float64
new_cases_per_million                 float64
new_cases_smoothed_per_million        float64
total_deaths_per_million              float64
new_deaths_per_million                float64
new_deaths_smoothed_per_million       float64
reproduction_rate                     float64
icu_patients                          float64
icu_patients_per_million              float64
hosp_patients                         float64
hosp_patients_per_mil

In [8]:
# list each column's non-null count and dtype to see how much data is missing
df_death.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208798 entries, 0 to 208797
Data columns (total 26 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   iso_code                            208798 non-null  object 
 1   continent                           196736 non-null  object 
 2   location                            208798 non-null  object 
 3   date                                208798 non-null  object 
 4   population                          207561 non-null  float64
 5   total_cases                         200303 non-null  float64
 6   new_cases                           199943 non-null  float64
 7   new_cases_smoothed                  198766 non-null  float64
 8   total_deaths                        181503 non-null  float64
 9   new_deaths                          181458 non-null  float64
 10  new_deaths_smoothed                 180286 non-null  float64
 11  total_cases_per_million   

In [9]:
# Replace missing values with 0 in the numeric columns only. A blanket
# fillna(0) would also write the integer 0 into text columns such as
# 'continent' (which has missing entries), mixing numbers with strings.
# NOTE(review): missing text values now stay NaN and are written as empty
# fields by to_csv -- confirm nothing downstream expects a literal 0 there.
df_death = df_death.fillna(
    value=dict.fromkeys(df_death.select_dtypes(include=[np.number]).columns, 0)
)

In [10]:
# Save the death dataframe. index=False keeps the default 0..n-1 row index
# out of the file; without it the index is written as an extra, unnamed
# first column that carries no information.
df_death.to_csv('covidDeaths.csv', index=False)

In [11]:
# Build the vaccination-focused dataframe: identifier and population columns
# plus the testing, vaccination, stringency, demographic/health and
# excess-mortality columns of df.

column_names = [
    # identifiers and population
    'iso_code', 'continent', 'location', 'date',
    'population',
    # testing
    'total_tests', 'new_tests', 'total_tests_per_thousand',
    'new_tests_per_thousand', 'new_tests_smoothed',
    'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case',
    # vaccinations: absolute counts
    'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
    'total_boosters', 'new_vaccinations', 'new_vaccinations_smoothed',
    # vaccinations: per-hundred / per-million variants
    'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
    'new_vaccinations_smoothed_per_million', 'new_people_vaccinated_smoothed',
    'new_people_vaccinated_smoothed_per_hundred',
    # stringency index
    'stringency_index',
    # demographic, economic and health indicators
    'population_density', 'median_age', 'aged_65_older',
    'aged_70_older', 'gdp_per_capita', 'extreme_poverty',
    'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand',
    'life_expectancy', 'human_development_index',
    # excess mortality
    'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative',
    'excess_mortality', 'excess_mortality_cumulative_per_million',
]

# every row, only the columns listed above (in that order)
df_vaccinations = df.loc[:, column_names]

In [12]:
# Replace missing values with 0 in the numeric columns only. A blanket
# fillna(0) would also write the integer 0 into text columns such as
# 'continent' (which has missing entries), mixing numbers with strings.
# NOTE(review): missing text values now stay NaN and are written as empty
# fields by to_csv -- confirm nothing downstream expects a literal 0 there.
df_vaccinations = df_vaccinations.fillna(
    value=dict.fromkeys(
        df_vaccinations.select_dtypes(include=[np.number]).columns, 0
    )
)

In [13]:
# Save the vaccination dataframe. index=False keeps the default row index
# out of the file; without it the index is written as an extra, unnamed
# first column that carries no information.
df_vaccinations.to_csv('covidVaccinations.csv', index=False)