In [79]:
import pandas as pd

In [80]:
# Read the two input tables from the local data directory.
vaccinations_csv = './data/vaccinations.csv'
locations_csv = './data/locations.csv'

vaccinations = pd.read_csv(vaccinations_csv)
locations = pd.read_csv(locations_csv)

In [81]:
# Print the (rows, columns) tuple and the per-column dtypes, one per line,
# then display the first rows as the cell's output.
print(vaccinations.shape, vaccinations.dtypes, sep='\n')
vaccinations.head(5)

(682, 7)
location                           object
iso_code                           object
date                               object
total_vaccinations                float64
daily_vaccinations                float64
total_vaccinations_per_hundred    float64
daily_vaccinations_per_million    float64
dtype: object


Unnamed: 0,location,iso_code,date,total_vaccinations,daily_vaccinations,total_vaccinations_per_hundred,daily_vaccinations_per_million
0,Argentina,ARG,2020-12-29,700.0,,0.0,
1,Argentina,ARG,2020-12-30,,15656.0,,346.0
2,Argentina,ARG,2020-12-31,32013.0,15656.0,0.07,346.0
3,Argentina,ARG,2021-01-01,,11070.0,,245.0
4,Argentina,ARG,2021-01-02,,8776.0,,194.0


In [82]:
# Print the (rows, columns) tuple, then display the first rows of the table.
locations_shape = locations.shape
print(locations_shape)
locations.head(5)

(50, 6)


Unnamed: 0,location,iso_code,vaccines,last_observation_date,source_name,source_website
0,Argentina,ARG,Sputnik V,2021-01-08,Government of Argentina,https://www.argentina.gob.ar/noticias/ya-se-ap...
1,Austria,AUT,Pfizer/BioNTech,2021-01-08,Ministry of Health,https://twitter.com/redouad/status/13482561527...
2,Bahrain,BHR,"Pfizer/BioNTech, Sinopharm",2021-01-10,Ministry of Health,https://twitter.com/MOH_Bahrain/status/1348359...
3,Belgium,BEL,Pfizer/BioNTech,2021-01-07,Government vaccination taskforce,https://www.lalibre.be/belgique/societe/vaccin...
4,Bulgaria,BGR,Pfizer/BioNTech,2021-01-10,Ministry of Health,https://coronavirus.bg/bg/statistika


In [83]:
# Tally the missing entries in every column of both tables.
missing_in_vaccinations = vaccinations.isna().sum()
missing_in_locations = locations.isna().sum()

print(missing_in_vaccinations)

# The leading blank lines visually separate the second report from the first.
print('\n\n', missing_in_locations)

location                            0
iso_code                          113
date                                0
total_vaccinations                273
daily_vaccinations                 79
total_vaccinations_per_hundred    273
daily_vaccinations_per_million     79
dtype: int64


 location                 0
iso_code                 4
vaccines                 0
last_observation_date    0
source_name              0
source_website           0
dtype: int64


In [84]:
# Objective: sum the daily vaccinations per country and compare that sum with the
# country's latest reported total.

# Where a daily figure is absent, fall back to the total reported on that same row.
daily_filled = vaccinations['daily_vaccinations'].fillna(vaccinations['total_vaccinations'])
vaccinations['daily_vaccinations'] = daily_filled

# The per-country maximum of total_vaccinations is taken as the current total.
given_num_vaccinations = vaccinations.groupby('iso_code')[['total_vaccinations']].max()
given_num_vaccinations.head()

Unnamed: 0_level_0,total_vaccinations
iso_code,Unnamed: 1_level_1
ARE,1167251.0
ARG,107542.0
AUT,30150.0
BEL,8000.0
BGR,13473.0


In [85]:
# Per-country sum of the recorded daily vaccinations; rows without an iso_code
# are left out of the grouping (dropna=True).
daily_by_country = vaccinations.groupby('iso_code', dropna=True)['daily_vaccinations']
calc_num_vaccinations = daily_by_country.sum().to_frame()
calc_num_vaccinations.head()

Unnamed: 0_level_0,daily_vaccinations
iso_code,Unnamed: 1_level_1
ARE,1081933.0
ARG,100780.0
AUT,30147.0
BEL,8000.0
BGR,13541.0


In [86]:
# Gap between the reported total and the total obtained by summing the recorded
# daily vaccinations, per country.
reported_totals = given_num_vaccinations['total_vaccinations']
summed_dailies = calc_num_vaccinations['daily_vaccinations']
given_calc_difference = reported_totals.sub(summed_dailies)

# A positive gap means the reported total exceeds the sum of the daily figures,
# i.e. some vaccinations counted in the total are absent from the daily records.
given_calc_difference.head()

iso_code
ARE    85318.0
ARG     6762.0
AUT        3.0
BEL        0.0
BGR      -68.0
dtype: float64

In [87]:
# Distribute each country's gap evenly over its rows, with the aim that the
# adjusted daily figures add up to the reported total.
rows_per_country = vaccinations.groupby('iso_code')['date'].count()

runoff = given_calc_difference.div(rows_per_country)
runoff.head()

iso_code
ARE    12188.285714
ARG      614.727273
AUT        0.300000
BEL        0.000000
BGR       -5.230769
dtype: float64

In [88]:
# Look up the per-row runoff by each row's iso_code and store it as a new column.
runoff_by_row = vaccinations['iso_code'].map(runoff)
vaccinations['runoff'] = runoff_by_row

vaccinations.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,daily_vaccinations,total_vaccinations_per_hundred,daily_vaccinations_per_million,runoff
0,Argentina,ARG,2020-12-29,700.0,700.0,0.0,,614.727273
1,Argentina,ARG,2020-12-30,,15656.0,,346.0,614.727273
2,Argentina,ARG,2020-12-31,32013.0,15656.0,0.07,346.0,614.727273
3,Argentina,ARG,2021-01-01,,11070.0,,245.0,614.727273
4,Argentina,ARG,2021-01-02,,8776.0,,194.0,614.727273


In [89]:
# Finally, add the per-row runoff to daily_vaccinations to get the adjusted daily figures.
#
# Rows whose iso_code is missing (or whose country has no computable runoff) carry a
# NaN in the 'runoff' column. Adding that NaN directly would overwrite their recorded
# daily_vaccinations with NaN, so a missing runoff is treated as "no adjustment" (0).
# Rows whose daily_vaccinations is itself NaN stay NaN.
daily_vaccinations = vaccinations['daily_vaccinations']
runoff_adjustment = vaccinations['runoff'].fillna(0)

vaccinations['daily_vaccinations'] = daily_vaccinations + runoff_adjustment

# Vaccination counts are whole numbers, so round the adjusted values to 0 decimals.
vaccinations = vaccinations.round({'daily_vaccinations' : 0})

# The helper column is no longer needed once it has been applied.
vaccinations = vaccinations.drop(columns=['runoff'])

vaccinations.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,daily_vaccinations,total_vaccinations_per_hundred,daily_vaccinations_per_million
0,Argentina,ARG,2020-12-29,700.0,1315.0,0.0,
1,Argentina,ARG,2020-12-30,,16271.0,,346.0
2,Argentina,ARG,2020-12-31,32013.0,16271.0,0.07,346.0
3,Argentina,ARG,2021-01-01,,11685.0,,245.0
4,Argentina,ARG,2021-01-02,,9391.0,,194.0


In [90]:
# Persist the processed table. With to_csv's defaults the row index is written
# as an unnamed first column.
processed_csv = "./data/vaccinations_processed.csv"
vaccinations.to_csv(processed_csv)