# Enterprise Data Science_COVID-19

### Author: Akanksha Parashar
### Matrikelnummer: 423022

### Notebook Description
* This notebook contains the following three time series graphs for the selected countries (i.e. Germany, India and Italy) :
  - Percentage of COVID infected population 
  - Percentage of partially vaccinated population against COVID
  - Percentage of fully vaccinated population against COVID
* The dataset of the COVID infected population is taken from the Johns Hopkins COVID-19 GitHub page.
* The dataset of the vaccinated population against COVID is taken from the Johns Hopkins COVID-19 vaccination GitHub page.
* The data paths used in this notebook point to folders on the author's computer where the required CSV files are saved; adjust them to the location of your local copy of the data.
* The data source is given in the Readme file.  


In [1]:
# Importing the required libraries
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go

pd.set_option('display.max_rows',11)
warnings.filterwarnings(action = 'ignore')

# Data visualization of percentage of COVID infected population over time

In [2]:
# Location of the confirmed-cases time series CSV, relative to this notebook.
# The literal is split over several lines for readability only; the pieces are
# concatenated into exactly the same path string as before.
data_path = (
    '../data/raw/COVID-19/csse_covid_19_data/'
    'csse_covid_19_time_series/'
    'time_series_covid19_confirmed_global.csv'
)

In [3]:
# Load the confirmed-cases table: one row per country/province and one column
# per calendar day (daily infected counts covering roughly 2.5 years).
data_raw = pd.read_csv(filepath_or_buffer=data_path)
# Preview the first five rows only.
data_raw.head(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/10/22,6/11/22,6/12/22,6/13/22,6/14/22,6/15/22,6/16/22,6/17/22,6/18/22,6/19/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,180864,180864,180864,181120,181178,181236,181465,181534,181574,181666
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,276638,276690,276731,276731,276821,276821,276821,277141,277141,277409
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,265925,265925,265927,265937,265943,265952,265964,265968,265971,265975
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,43224,43224,43224,43224,43224,43449,43449,43449,43449,43449
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,99761,99761,99761,99761,99761,99761,99761,99761,99761,99761


In [4]:
# Columns 0-3 of the raw table are Province/State, Country/Region, Lat and Long;
# every column from position 4 onwards is a date label.
time_idx = data_raw.columns[4:]
# Start the plotting frame with a single 'Date' column built from those labels.
df_plot = pd.DataFrame(data={'Date': time_idx})
df_plot

Unnamed: 0,Date
0,1/22/20
1,1/23/20
2,1/24/20
3,1/25/20
4,1/26/20
...,...
875,6/15/22
876,6/16/22
877,6/17/22
878,6/18/22


In [5]:
# Countries analysed throughout this notebook.
country_list = [
    'Germany',
    'India',
    'Italy',
]
country_list

['Germany', 'India', 'Italy']

In [6]:
# Add one column per selected country to df_plot.
for country in country_list:
    # All rows that belong to this country (a country may span several rows)
    country_rows = data_raw.loc[data_raw['Country/Region'] == country]
    # Sum the date columns (position 4 onwards) over those rows to get one
    # value per day, and store the plain values so they align by position
    df_plot[country] = country_rows.iloc[:, 4:].sum(axis=0).to_numpy()
df_plot


Unnamed: 0,Date,Germany,India,Italy
0,1/22/20,0,0,0
1,1/23/20,0,0,0
2,1/24/20,0,0,0
3,1/25/20,0,0,0
4,1/26/20,0,0,0
...,...,...,...,...
875,6/15/22,27096571,43257730,17736696
876,6/16/22,27124689,43270577,17773764
877,6/17/22,27204953,43283793,17809934
878,6/18/22,27204955,43296692,17844905


In [7]:
# Convert the 'Date' column from 'month/day/2-digit-year' strings to timestamps.
# pd.to_datetime converts the whole column in one vectorised call and, unlike
# datetime.strptime, also accepts values that are already timestamps, so this
# cell can be re-run without raising a TypeError.
time_idx = pd.to_datetime(df_plot['Date'], format="%m/%d/%y")
df_plot['Date'] = time_idx
# Show the element type to confirm the conversion
type(df_plot['Date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [8]:
# Vaccination dataset path (time series of administered vaccine doses).
# The file lives in a machine-specific folder, so the location can be overridden
# without editing the notebook by setting the COVID_VACCINE_DOSES_CSV environment
# variable to the full path of the CSV file. When the variable is not set, the
# original hard-coded location is used unchanged.
data_path1 = os.environ.get(
    'COVID_VACCINE_DOSES_CSV',
    'C:\\Users\\hp\\Documents\\ads_covid_19\\Vaccination_covid_data\\COVID-19\\data_tables\\vaccine_data\\global_data\\time_series_covid19_vaccine_doses_admin_global.csv',
)

In [9]:
# Load the time series of overall vaccine doses administered.
data_raw1 = pd.read_csv(filepath_or_buffer=data_path1)
# Preview the first five rows only.
data_raw1.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\hp\\Documents\\ads_covid_19\\Vaccination_covid_data\\COVID-19\\data_tables\\vaccine_data\\global_data\\time_series_covid19_vaccine_doses_admin_global.csv'

In [81]:
# Extract the total population of the selected countries.
# The countries come from country_list (instead of a second hard-coded list) so
# that this table always matches the columns built for df_plot and the traces
# drawn in the plots.
df1 = (
    data_raw1[['Combined_Key', 'Population']]
    .set_index('Combined_Key')   # look rows up by country name
    .loc[country_list]           # keep only the selected countries, in list order
    .T                           # one 'Population' row, one column per country
)
df1

Combined_Key,Germany,India,Italy
Population,83783945.0,1380004000.0,60461828.0


In [82]:
# Stack the single 'Population' row (df1) underneath the daily case counts
# (df_plot); the country columns line up by name.
final_data1 = pd.concat(objs=[df_plot, df1], axis='index')
final_data1

Unnamed: 0,Date,Germany,India,Italy
0,2020-01-22,0.0,0.000000e+00,0.0
1,2020-01-23,0.0,0.000000e+00,0.0
2,2020-01-24,0.0,0.000000e+00,0.0
3,2020-01-25,0.0,0.000000e+00,0.0
4,2020-01-26,0.0,0.000000e+00,0.0
...,...,...,...,...
873,2022-06-13,26915085.0,4.323670e+07,17664043.0
874,2022-06-14,27007429.0,4.324552e+07,17703887.0
875,2022-06-15,27096571.0,4.325773e+07,17736696.0
876,2022-06-16,27124689.0,4.327058e+07,17773764.0


In [83]:
# Percentage of the population infected on each date.
# Column 0 is 'Date', so only the country columns (position 1 onwards) are used.
# The last row of final_data1 is the appended population row, which serves as
# the divisor for every other row.
final_data2 = (
    final_data1.iloc[:, 1:]
    .div(final_data1.iloc[-1, 1:])
    .mul(100)
    .astype(float)
)
final_data2

Unnamed: 0,Germany,India,Italy
0,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000
...,...,...,...
873,32.124394,3.133084,29.215198
874,32.234611,3.133723,29.281098
875,32.341006,3.134608,29.335362
876,32.374567,3.135539,29.396670


In [84]:
# Final frame for plotting: put the 'Date' column next to the percentage
# columns, then remove the helper 'Population' row, which is not a date.
plotting_data = pd.concat(
    objs=[final_data1['Date'], final_data2],
    axis='columns',
).drop(index='Population')
plotting_data

Unnamed: 0,Date,Germany,India,Italy
0,2020-01-22,0.000000,0.000000,0.000000
1,2020-01-23,0.000000,0.000000,0.000000
2,2020-01-24,0.000000,0.000000,0.000000
3,2020-01-25,0.000000,0.000000,0.000000
4,2020-01-26,0.000000,0.000000,0.000000
...,...,...,...,...
872,2022-06-12,31.998070,3.132606,29.197554
873,2022-06-13,32.124394,3.133084,29.215198
874,2022-06-14,32.234611,3.133723,29.281098
875,2022-06-15,32.341006,3.134608,29.335362


In [85]:
# Sanity check before plotting: display the dtype of every column of the
# 'plotting_data' dataframe (the 'Date' column and one column per country)
plotting_data.dtypes

Date       datetime64[ns]
Germany           float64
India             float64
Italy             float64
dtype: object

# Relative COVID cases over time (percentage of the population infected)

In [86]:
# One trace per selected country, all drawn on the same figure
fig = go.Figure()
for country in country_list:
    fig.add_trace(
        go.Scatter(
            x=plotting_data.Date,
            y=plotting_data[country],
            mode='markers+lines',
            opacity=0.9,
            line_width=2,
            marker_size=4,
            name=country,
        )
    )
# Linear y axis with a fixed range of 0 to 35 percent
fig.update_yaxes(type='linear', range=[0, 35])
# Overall layout: figure size, axis titles and a range slider that makes it
# possible to slide across the x scale
fig.update_layout(
    width=900,
    height=600,
    xaxis_title="Time",
    yaxis_title="Percentage of Confirmed infected people",
    xaxis_rangeslider_visible=True,
)

# Result
- Among the selected countries, the cumulative share of the population with confirmed COVID infections is lowest in India (about 3 %) and highest in Germany (about 32 %).
- The difference between Germany and Italy (about 29 %) is not very large.

# Partially and fully vaccinated population against COVID

In [87]:
# Vaccination dataset path (time series of partially/fully vaccinated people).
# The file lives in a machine-specific folder, so the location can be overridden
# without editing the notebook by setting the COVID_VACCINE_TIMESERIES_CSV
# environment variable to the full path of the CSV file. When the variable is
# not set, the original hard-coded location is used unchanged.
vacc_data_path = os.environ.get(
    'COVID_VACCINE_TIMESERIES_CSV',
    'C:\\Users\\hp\\Documents\\ads_covid_19\\Vaccination_covid_data\\COVID-19\\data_tables\\vaccine_data\\global_data\\time_series_covid19_vaccine_global.csv',
)

In [88]:
# Load the vaccination time series from its CSV file.
vacc_data_raw = pd.read_csv(filepath_or_buffer=vacc_data_path)
# Preview the first five rows only.
vacc_data_raw.head(5)

Unnamed: 0,Country_Region,Date,Doses_admin,People_partially_vaccinated,People_fully_vaccinated,Report_Date_String,UID,Province_State
0,Canada,2020-12-14,5.0,0.0,0.0,2020-12-14,124.0,
1,World,2020-12-14,5.0,0.0,0.0,2020-12-14,,
2,Canada,2020-12-15,723.0,0.0,0.0,2020-12-15,124.0,
3,China,2020-12-15,1500000.0,0.0,0.0,2020-12-15,156.0,
4,Russia,2020-12-15,28500.0,28500.0,0.0,2020-12-15,643.0,


In [89]:
# Convert the 'Date' column from 'YYYY-MM-DD' strings to timestamps and check
# the resulting element type.
# pd.to_datetime converts the whole column in one vectorised call and, unlike
# datetime.strptime, also accepts values that are already timestamps, so this
# cell can be re-run without raising a TypeError.
time_idx = pd.to_datetime(vacc_data_raw['Date'], format="%Y-%m-%d")
vacc_data_raw['Date'] = time_idx
type(vacc_data_raw['Date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [90]:
# Wrap the loaded vaccination table in a dataframe named vacc_df, which the
# pivot tables below are built from.
vacc_df = pd.DataFrame(data=vacc_data_raw)
vacc_df

Unnamed: 0,Country_Region,Date,Doses_admin,People_partially_vaccinated,People_fully_vaccinated,Report_Date_String,UID,Province_State
0,Canada,2020-12-14,5.000000e+00,0.000000e+00,0.000000e+00,2020-12-14,124.0,
1,World,2020-12-14,5.000000e+00,0.000000e+00,0.000000e+00,2020-12-14,,
2,Canada,2020-12-15,7.230000e+02,0.000000e+00,0.000000e+00,2020-12-15,124.0,
3,China,2020-12-15,1.500000e+06,0.000000e+00,0.000000e+00,2020-12-15,156.0,
4,Russia,2020-12-15,2.850000e+04,2.850000e+04,0.000000e+00,2020-12-15,643.0,
...,...,...,...,...,...,...,...,...
255199,West Bank and Gaza,2022-06-17,3.729762e+06,,,2022-06-18,275.0,
255200,World,2022-06-17,1.160320e+10,4.986783e+09,4.556617e+09,2022-06-18,,
255201,Yemen,2022-06-17,8.386210e+05,6.929680e+05,4.414040e+05,2022-06-18,887.0,
255202,Zambia,2022-06-17,6.675386e+06,3.240892e+06,4.165002e+06,2022-06-18,894.0,


In [91]:
# Reshape the long vaccination table into a wide one: one row per date, one
# column per country, cells holding the number of partially vaccinated people.
# dropna=False keeps countries/dates whose entries are all missing.
vacc_df1 = pd.pivot_table(
    vacc_df,
    index='Date',
    columns='Country_Region',
    values='People_partially_vaccinated',
    dropna=False,
)
# Turn the 'Date' index back into an ordinary column
vacc_df11 = vacc_df1.reset_index()
vacc_df11

Country_Region,Date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,World,Yemen,Zambia,Zimbabwe
0,2020-12-14,,,,,,,,,,...,,,,,,,0.000000e+00,,,
1,2020-12-15,,,,,,,,,,...,,,,,,,2.850000e+04,,,
2,2020-12-16,,,,,,,,,,...,,,,,,,2.850000e+04,,,
3,2020-12-17,,,,,,,,,,...,,,,,,,2.850000e+04,,,
4,2020-12-18,,,,,,,,,,...,,,,,,,2.850000e+04,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2022-06-13,5456919.0,1320244.0,8210605.0,57880.0,12847899.0,63995.0,41131841.0,1129669.0,22332746.0,...,2994777.0,19463364.0,172232.0,22157232.0,85085764.0,,4.990275e+09,672794.0,3240892.0,6267930.0
547,2022-06-14,5456919.0,1320244.0,8210605.0,57880.0,12847899.0,63995.0,41131841.0,1129669.0,22332746.0,...,2994952.0,19463364.0,172232.0,22157232.0,85130811.0,,4.991902e+09,672794.0,3240892.0,6267930.0
548,2022-06-15,5456919.0,1320244.0,8210605.0,57880.0,12700926.0,63995.0,41155212.0,1129669.0,22335032.0,...,2995204.0,19463364.0,172232.0,22157232.0,85130811.0,,4.980908e+09,672794.0,3240892.0,6267930.0
549,2022-06-16,5573175.0,1322864.0,8210605.0,57880.0,12700926.0,63995.0,41155212.0,1129669.0,22336215.0,...,2995383.0,19509330.0,172232.0,22157232.0,85130811.0,,4.985876e+09,692968.0,3240892.0,6267930.0


In [92]:
# Keep the 'Date' column plus the partially-vaccinated counts of the selected
# countries. The countries come from country_list (instead of a second
# hard-coded list) so that the columns always match the traces drawn in the
# plots.
vacc_d1 = vacc_df11[['Date', *country_list]]
vacc_d1

Country_Region,Date,Germany,India,Italy
0,2020-12-14,,,
1,2020-12-15,,,
2,2020-12-16,,,
3,2020-12-17,,,
4,2020-12-18,,,
...,...,...,...,...
546,2022-06-13,64557969.0,1.013620e+09,50792865.0
547,2022-06-14,64558995.0,1.013814e+09,50793615.0
548,2022-06-15,64560231.0,1.013988e+09,50794503.0
549,2022-06-16,64561679.0,1.014178e+09,50795192.0


In [93]:
# Reshape the long vaccination table into a wide one: one row per date, one
# column per country, cells holding the number of fully vaccinated people.
# dropna=False keeps countries/dates whose entries are all missing.
vacc_df2 = pd.pivot_table(
    vacc_df,
    index=['Date'],
    columns='Country_Region',
    values='People_fully_vaccinated',
    dropna=False,
)
# Turn the 'Date' index back into an ordinary column
vacc_df22 = vacc_df2.reset_index()
vacc_df22

Country_Region,Date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,World,Yemen,Zambia,Zimbabwe
0,2020-12-14,,,,,,,,,,...,,,,,,,0.000000e+00,,,
1,2020-12-15,,,,,,,,,,...,,,,,,,0.000000e+00,,,
2,2020-12-16,,,,,,,,,,...,,,,,,,0.000000e+00,,,
3,2020-12-17,,,,,,,,,,...,,,,,,,0.000000e+00,,,
4,2020-12-18,,,,,,,,,,...,,,,,,,0.000000e+00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2022-06-13,4807917.0,1241712.0,6851660.0,53450.0,7054640.0,61882.0,37510951.0,985807.0,21637279.0,...,2882314.0,15232341.0,118158.0,14287370.0,79538300.0,,4.558278e+09,429721.0,3878201.0,4559839.0
547,2022-06-14,4807917.0,1241712.0,6851660.0,53450.0,7054640.0,61882.0,37510951.0,985807.0,21637279.0,...,2882657.0,15232341.0,118158.0,14287370.0,79550401.0,,4.560444e+09,429721.0,3878201.0,4559839.0
548,2022-06-15,4807917.0,1241712.0,6851660.0,53450.0,6811268.0,61882.0,37527594.0,985807.0,21641140.0,...,2882979.0,15232341.0,118158.0,14287370.0,79550401.0,,4.550304e+09,429721.0,3878201.0,4559839.0
549,2022-06-16,4923085.0,1244383.0,6851660.0,53450.0,6811268.0,61882.0,37527594.0,985807.0,21643147.0,...,2883973.0,15319040.0,118158.0,14287370.0,79550401.0,,4.554646e+09,441404.0,3878201.0,4559839.0


In [94]:
# Keep the 'Date' column plus the fully-vaccinated counts of the selected
# countries. The countries come from country_list (instead of a second
# hard-coded list) so that the columns always match the traces drawn in the
# plots.
vacc_d2 = vacc_df22[['Date', *country_list]]
vacc_d2

Country_Region,Date,Germany,India,Italy
0,2020-12-14,,,
1,2020-12-15,,,
2,2020-12-16,,,
3,2020-12-17,,,
4,2020-12-18,,,
...,...,...,...,...
546,2022-06-13,64527483.0,899977040.0,47940398.0
547,2022-06-14,64529524.0,901001260.0,47941105.0
548,2022-06-15,64531934.0,901971812.0,47941786.0
549,2022-06-16,64534057.0,902972911.0,47942365.0


In [95]:
# Population information of the selected countries.
# Rows with a missing key or population are dropped before the lookup. The
# countries come from country_list (instead of a second hard-coded list) so that
# this table always matches the vaccination columns it is combined with.
vacc_df1_p = data_raw1[['Combined_Key','Population']]
vacc_df_p = (
    vacc_df1_p.dropna()
    .set_index('Combined_Key')   # look rows up by country name
    .loc[country_list]           # keep only the selected countries, in list order
    .T                           # one 'Population' row, one column per country
)
vacc_df_p

Combined_Key,Germany,India,Italy
Population,83783945.0,1380004000.0,60461828.0


In [96]:
# Stack the 'Population' row underneath the partially-vaccinated counts so that
# the totals become the last row of the frame.
vacc_final_data1 = pd.concat(objs=[vacc_d1, vacc_df_p], axis='index')
vacc_final_data1

Unnamed: 0,Date,Germany,India,Italy
0,2020-12-14,,,
1,2020-12-15,,,
2,2020-12-16,,,
3,2020-12-17,,,
4,2020-12-18,,,
...,...,...,...,...
547,2022-06-14,64558995.0,1.013814e+09,50793615.0
548,2022-06-15,64560231.0,1.013988e+09,50794503.0
549,2022-06-16,64561679.0,1.014178e+09,50795192.0
550,2022-06-17,64562766.0,1.014376e+09,50795903.0


In [97]:
# Stack the 'Population' row underneath the fully-vaccinated counts so that the
# totals become the last row of the frame.
vacc_final_data2 = pd.concat(objs=[vacc_d2, vacc_df_p], axis='index')
vacc_final_data2

Unnamed: 0,Date,Germany,India,Italy
0,2020-12-14,,,
1,2020-12-15,,,
2,2020-12-16,,,
3,2020-12-17,,,
4,2020-12-18,,,
...,...,...,...,...
547,2022-06-14,64529524.0,9.010013e+08,47941105.0
548,2022-06-15,64531934.0,9.019718e+08,47941786.0
549,2022-06-16,64534057.0,9.029729e+08,47942365.0
550,2022-06-17,64535996.0,9.040216e+08,47942921.0


In [98]:
# Percentage of the population that is partially vaccinated on each date.
# Column 0 is 'Date', so only the country columns (position 1 onwards) are used;
# the last row holds the population totals and serves as the divisor.
vacc_final_data1 = (
    vacc_final_data1.iloc[:, 1:]
    .div(vacc_final_data1.iloc[-1, 1:])
    .mul(100)
    .astype(float)
)
vacc_final_data1

Unnamed: 0,Germany,India,Italy
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
547,77.054136,73.464522,84.009393
548,77.055611,73.477184,84.010862
549,77.057340,73.490929,84.012002
550,77.058637,73.505303,84.013178


In [99]:
# Percentage of the population that is fully vaccinated on each date.
# Column 0 is 'Date', so only the country columns (position 1 onwards) are used;
# the last row holds the population totals and serves as the divisor.
vacc_final_data2 = (
    vacc_final_data2.iloc[:, 1:]
    .div(vacc_final_data2.iloc[-1, 1:])
    .mul(100)
    .astype(float)
)
vacc_final_data2

Unnamed: 0,Germany,India,Italy
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
547,77.018961,65.289739,79.291524
548,77.021838,65.360069,79.292651
549,77.024371,65.432612,79.293608
550,77.026686,65.508606,79.294528


In [100]:
# Single-column dataframe holding only the vaccination dates; the percentage
# columns are attached to it in the following cells.
vacc_percentage = vacc_d2['Date'].to_frame()
vacc_percentage

Unnamed: 0,Date
0,2020-12-14
1,2020-12-15
2,2020-12-16
3,2020-12-17
4,2020-12-18
...,...
546,2022-06-13
547,2022-06-14
548,2022-06-15
549,2022-06-16


In [101]:
# Final frame for the partially-vaccinated plot: put the dates next to the
# percentage columns, then remove the helper 'Population' row.
vacc_plotting_data1 = pd.concat(
    objs=[vacc_percentage, vacc_final_data1],
    axis='columns',
).drop(index='Population')
vacc_plotting_data1


Unnamed: 0,Date,Germany,India,Italy
0,2020-12-14,,,
1,2020-12-15,,,
2,2020-12-16,,,
3,2020-12-17,,,
4,2020-12-18,,,
...,...,...,...,...
546,2022-06-13,77.052912,73.450467,84.008153
547,2022-06-14,77.054136,73.464522,84.009393
548,2022-06-15,77.055611,73.477184,84.010862
549,2022-06-16,77.057340,73.490929,84.012002


In [102]:
# Sanity check before plotting: display the dtype of every column of the
# partially vaccinated population percentage dataframe
vacc_plotting_data1.dtypes

Date       datetime64[ns]
Germany           float64
India             float64
Italy             float64
dtype: object

In [103]:
# Final frame for the fully-vaccinated plot: put the dates next to the
# percentage columns, then remove the helper 'Population' row.
vacc_plotting_data2 = pd.concat(
    objs=[vacc_percentage, vacc_final_data2],
    axis='columns',
).drop(index='Population')
vacc_plotting_data2

Unnamed: 0,Date,Germany,India,Italy
0,2020-12-14,,,
1,2020-12-15,,,
2,2020-12-16,,,
3,2020-12-17,,,
4,2020-12-18,,,
...,...,...,...,...
546,2022-06-13,77.016525,65.215520,79.290355
547,2022-06-14,77.018961,65.289739,79.291524
548,2022-06-15,77.021838,65.360069,79.292651
549,2022-06-16,77.024371,65.432612,79.293608


In [104]:
# Sanity check before plotting: display the dtype of every column of the
# fully vaccinated population percentage dataframe
vacc_plotting_data2.dtypes

Date       datetime64[ns]
Germany           float64
India             float64
Italy             float64
dtype: object

# The vaccination rate of partially vaccinated population over time

In [105]:
# One trace per selected country, all drawn on the same figure
fig = go.Figure()
for country in country_list:
    fig.add_trace(
        go.Scatter(
            x=vacc_plotting_data1.Date,
            y=vacc_plotting_data1[country],
            mode='markers+lines',
            opacity=0.9,
            line_width=2,
            marker_size=4,
            name=country,
        )
    )
# Linear y axis with a fixed range of 0 to 90 percent
fig.update_yaxes(type='linear', range=[0, 90])
# Overall layout: figure size, axis titles and a range slider that makes it
# possible to slide across the x scale
fig.update_layout(
    width=900,
    height=600,
    xaxis_title="Time",
    yaxis_title="Percentage_of_People_partially_vaccinated",
    xaxis_rangeslider_visible=True,
)

# The vaccination rate of fully vaccinated population over time 

In [106]:
# One trace per selected country, all drawn on the same figure
fig = go.Figure()
for country in country_list:
    fig.add_trace(
        go.Scatter(
            x=vacc_plotting_data2.Date,
            y=vacc_plotting_data2[country],
            mode='markers+lines',
            opacity=0.9,
            line_width=2,
            marker_size=4,
            name=country,
        )
    )
# Linear y axis with a fixed range of 0 to 90 percent
fig.update_yaxes(type='linear', range=[0, 90])
# Overall layout: figure size, axis titles and a range slider that makes it
# possible to slide across the x scale
fig.update_layout(
    width=900,
    height=600,
    xaxis_title="Time",
    yaxis_title="Percentage_of_People_fully_vaccinated",
    xaxis_rangeslider_visible=True,
)

# Result
- Among the selected countries, the vaccination rate is highest in Italy and lowest in India.
- The difference between Germany and Italy vaccination rate is not very large.