In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
# Ignore every warning raised from here on.
# NOTE(review): a blanket "ignore" also hides pandas deprecation and
# SettingWithCopy warnings — consider narrowing the filter to specific categories.
warnings.simplefilter("ignore")

# Data Preparation

**Inspect data**

In [2]:
# Load the vaccination records from the CSV file (path is relative to the working directory).
data = pd.read_csv('country_vaccinations.csv')
# Preview the first rows to see the available columns and value formats.
data.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,35.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/


In [3]:
# List every column name available in the loaded frame.
data.columns

Index(['country', 'iso_code', 'date', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'daily_vaccinations_per_million',
       'vaccines', 'source_name', 'source_website'],
      dtype='object')

**Keep only useful columns**

In [4]:
# Columns kept for the analysis; the remaining raw / fully-vaccinated / source
# columns are dropped.
relevant_columns = [
    'country',
    'iso_code',
    'date',
    'daily_vaccinations',
    'people_vaccinated',
    'people_vaccinated_per_hundred',
    'vaccines',
]

# Take an explicit copy: later cells sort and assign into `data` in place, and
# doing that on a frame derived from a column selection is the classic
# SettingWithCopy hazard. An independent copy makes those writes unambiguous.
data = data[relevant_columns].copy()
data.columns

Index(['country', 'iso_code', 'date', 'daily_vaccinations',
       'people_vaccinated', 'people_vaccinated_per_hundred', 'vaccines'],
      dtype='object')

**Check datatypes**

In [5]:
# Show the dtype pandas inferred for each remaining column.
data.dtypes

country                           object
iso_code                          object
date                              object
daily_vaccinations               float64
people_vaccinated                float64
people_vaccinated_per_hundred    float64
vaccines                          object
dtype: object

**Inspect missing values**

In [6]:
# Number of missing entries in each column.
data.isna().sum()

country                              0
iso_code                             0
date                                 0
daily_vaccinations                 243
people_vaccinated                14761
people_vaccinated_per_hundred    14761
vaccines                             0
dtype: int64

Check if `people_vaccinated` and `people_vaccinated_per_hundred` line up in terms of missing values

In [7]:
# Rows where at least one of the two people_vaccinated columns is missing.
either_missing = data.people_vaccinated.isna() | data.people_vaccinated_per_hundred.isna()
len(data[either_missing])

14761

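Yes: the number of rows where either column is missing (14761) equals each column's individual missing count, so both columns are missing in exactly the same rows.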

Check which countries and dates have missing data

In [8]:
# Per-country count of rows that have no people_vaccinated value.
data[data.people_vaccinated.isna()]['country'].value_counts()

China                   214
Monaco                  183
Kuwait                  183
United Arab Emirates    178
Saudi Arabia            172
                       ... 
Romania                   3
England                   2
Denmark                   2
Malta                     1
Scotland                  1
Name: country, Length: 195, dtype: int64

In [9]:
# Per-date count of rows that have no people_vaccinated value.
data[data.people_vaccinated.isna()]['date'].value_counts()

2021-06-26    119
2021-06-19    119
2021-06-04    117
2021-05-15    117
2021-05-13    114
             ... 
2020-12-11      1
2020-12-07      1
2020-12-06      1
2020-12-10      1
2020-12-13      1
Name: date, Length: 225, dtype: int64

In [10]:
# Total number of distinct countries, for comparison with how many of them have gaps.
data.country.nunique()

219

Almost every country (195 of 219) has some missing data, as expected, but the categorical features (`country`, `iso_code`, `vaccines`) and `date` have no missing values.

Check what proportion of data is missing.

In [11]:
# Share of rows, in percent, with no people_vaccinated_per_hundred value.
rows_missing = data[data.people_vaccinated_per_hundred.isna()]
pct = len(rows_missing) / len(data) * 100
print(f'{pct:.3f}% data missing')

46.938% data missing


A very big chunk of the data is missing, so we need to fill it in somehow. Since the `people_vaccinated*` columns are cumulative, we will use the last available value. Since `daily_vaccinations` is not cumulative, we will leave it as is for now.

In [12]:
# Order rows chronologically so that "previous value" means "earlier date".
# (Dates are ISO-formatted strings, so lexical order equals date order.)
data = data.sort_values(by='date')
all_countries = list(data.country.unique())

# The people_vaccinated* columns are cumulative, so a gap is best represented
# by the last value reported *before* it. Forward-fill within each country
# first; back-fill afterwards only to cover rows that precede a country's
# first report (there is no earlier value to carry forward for those).
# Filling the other way round would stamp future totals onto earlier dates.
cumulative_cols = ['people_vaccinated', 'people_vaccinated_per_hundred']
data[cumulative_cols] = (
    data.groupby('country')[cumulative_cols]
        .transform(lambda col: col.ffill().bfill())
)

In [13]:
# Re-measure the share of rows, in percent, with no people_vaccinated_per_hundred value.
still_missing = data[data.people_vaccinated_per_hundred.isna()]
pct = len(still_missing) / len(data) * 100
print(f'{pct:.3f}% data missing')

0.000% data missing


Let's goooo

In [14]:
# Preview the first rows of the frame in its current (date-sorted) order.
data.head()

Unnamed: 0,country,iso_code,date,daily_vaccinations,people_vaccinated,people_vaccinated_per_hundred,vaccines
21261,Norway,NOR,2020-12-02,,0.0,0.0,"Moderna, Pfizer/BioNTech"
21262,Norway,NOR,2020-12-03,0.0,0.0,0.0,"Moderna, Pfizer/BioNTech"
21263,Norway,NOR,2020-12-04,0.0,0.0,0.0,"Moderna, Pfizer/BioNTech"
15647,Latvia,LVA,2020-12-04,,1.0,0.0,"Johnson&Johnson, Moderna, Oxford/AstraZeneca, ..."
15648,Latvia,LVA,2020-12-05,0.0,2.0,0.0,"Johnson&Johnson, Moderna, Oxford/AstraZeneca, ..."


# Data Analysis

**What vaccines are used and in which countries?**

In [17]:
# Most frequent values of the raw `vaccines` column (top five).
data['vaccines'].value_counts().head(5)

Oxford/AstraZeneca                                               4654
Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTech    4145
Moderna, Oxford/AstraZeneca, Pfizer/BioNTech                     2600
Oxford/AstraZeneca, Sinopharm/Beijing                            2386
Oxford/AstraZeneca, Pfizer/BioNTech                              2080
Name: vaccines, dtype: int64

Different combinations of vaccines are counted as unique values. We need to separate these into individual vaccines per country.

In [23]:
# One row per distinct (iso_code, vaccines) pair, keeping the first occurrence.
iso_vaccine_cols = ['iso_code', 'vaccines']
vaccines_df = data.drop_duplicates(subset=iso_vaccine_cols)[iso_vaccine_cols]
print(f'Rows: {vaccines_df.shape[0]}, Unique ISOs: {vaccines_df.iso_code.nunique()}')

Rows: 219, Unique ISOs: 219


In [34]:
# Make one row for each iso_code and each individual vaccine
map_plot_df = pd.concat([pd.Series(row['iso_code'], row['vaccines'].split(','))              
                         for _, row in vaccines_df.iterrows()]).reset_index()
# Rename columns
map_plot_df.columns = ['vaccine', 'iso_code']
# Remove whitespace from vaccine names
map_plot_df['vaccine'] = map_plot_df['vaccine'].str.strip(' ')

map_plot_df.head(10)

Unnamed: 0,vaccine,iso_code
0,Moderna,NOR
1,Pfizer/BioNTech,NOR
2,Johnson&Johnson,LVA
3,Moderna,LVA
4,Oxford/AstraZeneca,LVA
5,Pfizer/BioNTech,LVA
6,Moderna,OWID_ENG
7,Oxford/AstraZeneca,OWID_ENG
8,Pfizer/BioNTech,OWID_ENG
9,Moderna,OWID_SCT


In [36]:
# Count how many iso_codes list each individual vaccine, most common first.
vaccine_counts = map_plot_df['vaccine'].value_counts()
print(f"There are {len(vaccine_counts)} unique vaccines used around the world.")
vaccine_counts

There are 16 unique vaccines used around the world.


Oxford/AstraZeneca    180
Pfizer/BioNTech       109
Sinopharm/Beijing      61
Moderna                56
Sputnik V              48
Sinovac                36
Johnson&Johnson        30
Covaxin                 6
CanSino                 4
EpiVacCorona            2
Sinopharm/Wuhan         2
Soberana02              1
Abdala                  1
QazVac                  1
RBD-Dimer               1
Sinopharm/HayatVax      1
Name: vaccine, dtype: int64

There are