In [1]:
import pandas as pd
import numpy as np
import os

# Load the vaccination records from the project-relative CSV and preview the first rows.
df = pd.read_csv("data/country_vaccinations.csv")
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,Oxford/AstraZeneca,Government of Afghanistan,https://reliefweb.int/report/afghanistan/afgha...
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,https://reliefweb.int/report/afghanistan/afgha...
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,https://reliefweb.int/report/afghanistan/afgha...
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,https://reliefweb.int/report/afghanistan/afgha...
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,https://reliefweb.int/report/afghanistan/afgha...


In [None]:
# Forward-fill gaps within each country, so a missing value inherits that
# country's last valid observation. GroupBy.ffill() replaces the deprecated
# groupby(...).fillna(method='ffill') spelling.
df_filled = df.groupby('country').ffill()

# Whatever is still missing has no earlier observation in its country: set it to 0.
# Note that this also writes 0 into any non-numeric column that still has gaps.
df_filled = df_filled.fillna(0)

# The grouping column is not part of the ffill() result; restore it (aligned on index).
df_filled["country"] = df["country"]

# Reorder columns for ease of use: identifying columns first, the rest in original order.
cols_to_order = ['country', 'iso_code', 'date', 'vaccines']
new_columns = cols_to_order + (df_filled.columns.drop(cols_to_order).tolist())
df_filled = df_filled[new_columns]
df_filled.head()

In [None]:
from collections import Counter

In [None]:
# For every distinct vaccine combination, print the sorted, de-duplicated
# list of countries that have at least one row using it.
vaccines = df_filled["vaccines"].unique()
for combo in vaccines:
    uses_combo = df_filled["vaccines"] == combo
    users = np.unique(df_filled.loc[uses_combo, "country"].values)
    print(f"Vaccines: {combo}: \nCountries: {users}\n")

In [None]:
# Some Funtions For Plotting Several Data 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from scipy import stats
from collections import Counter
from plotly.offline import iplot


# Accumulators filled by bar_col(): one entry is appended per call.
cols_data = []
total_data = []

# One sub-frame per country, in the sorted order returned by np.unique.
df_by_country = [
    df[df['country'] == country_name]
    for country_name in np.unique(df['country'])
]

def bars(data, x, y, title, figsize=(10, 12), rotation=75, size=8, width=None, height=None, colour=None):
    """Show a Plotly bar chart of ``y`` against ``x``, sorted by ``y`` descending.

    Parameters
    ----------
    data : DataFrame or mapping
        Must be indexable by ``x`` and ``y``.
    x, y : str
        Names of the columns used for the x axis and the bar heights.
    title : str
        Chart title.
    figsize, rotation, size
        Accepted for backward compatibility; not used by this implementation.
    width, height : int or None
        Forwarded to ``px.bar``.
    colour : str or None
        Name of the column whose values colour the bars. Only ``x`` and ``y``
        exist in the plotted frame, so it must be one of those.
    """
    count = pd.DataFrame({x: data[x], y: data[y]}).sort_values(by=y, ascending=False)
    # Identity test: `== None` is unidiomatic and misbehaves for array-like arguments.
    c = count[colour] if colour is not None else None
    fig = px.bar(count, x=x, y=y, title=title, width=width, height=height, color=c)
    fig.show()
    
def scatter(countries, col, threshold):
    """Plot ``col`` over ``date`` as one markers+lines trace per country.

    Reads the module-level ``df`` at call time.

    To thin out the chart, a country whose most recent ``col`` value is below
    ``threshold`` is skipped when it sits at an even position in ``countries``;
    all other countries are drawn.

    Parameters
    ----------
    countries : iterable of str
        Country names to consider, matched against ``df['country']``.
    col : str
        Column plotted on the y axis (also used as the y-axis title).
    threshold : number
        Cut-off applied to each country's last value of ``col``.
    """
    fig = go.Figure()
    for position, country in enumerate(countries):
        # Filter once per country; the same slice feeds the check and the trace.
        df_temp = df[df['country'] == country]
        if df_temp.empty:
            # Nothing to plot, and there is no "last value" to compare.
            continue
        if df_temp[col].iloc[-1] < threshold and position % 2 == 0:
            continue
        trace = go.Scatter(x=df_temp['date'], y=df_temp[col],
                           name=country,
                           mode='markers+lines')
        fig.add_trace(trace)
    fig.update_layout(legend_title=dict(text='Countries', font=dict(family="sans-serif",
                                         size=18)))
    fig.update_xaxes(title='date')
    fig.update_yaxes(title=col)
    fig.show()
    
def pie(data, x, y, title):
    """Show a Plotly pie chart built from a ``{label: value}`` mapping.

    ``x`` names the label column and ``y`` the value column of the intermediate
    frame; slices are ordered by value, descending. ``title`` is shown as the
    legend title.
    """
    frame = pd.DataFrame({x: data.keys(), y: data.values()})
    frame = frame.sort_values(by=y, ascending=False)

    chart = px.pie(frame, names=x, values=y)
    legend_font = dict(family="sans-serif", size=18)
    chart.update_layout(legend_title=dict(text=title, font=legend_font))
    chart.show()
    
def nulls(name, threshold, length):
    """Select the countries whose ``name`` column is complete enough to plot.

    Iterates the module-level ``df_by_country`` frames. A country qualifies when
    the percentage of null values in ``name`` (rounded to 2 decimals) is below
    ``threshold`` and the frame has more than ``length`` rows.

    Parameters
    ----------
    name : str
        Column to inspect.
    threshold : number
        Exclusive upper bound on the percentage of nulls.
    length : int
        Exclusive lower bound on the number of rows.

    Returns
    -------
    (numpy.ndarray, str)
        The qualifying country names and ``name`` itself, unchanged.
    """
    selected = []
    for frame in df_by_country:
        column = frame[name]
        n_rows = len(column)
        pct_null = round(column.isnull().sum() / n_rows * 100, 2)
        if pct_null < threshold and n_rows > length:
            # Every row of a per-country frame has the same country; take the first.
            selected.append(frame['country'].iloc[0])
    # Build the array once instead of re-allocating with np.append per country.
    return np.array(selected), name

def last_item(col):
    """Return ``{country: last value of col}``.

    Values come from the module-level ``df_by_country`` frames; keys come from
    ``np.unique(df['country'])``. The two are paired positionally.
    """
    names = np.unique(df['country'])
    finals = [np.array(frame[col])[-1] for frame in df_by_country]
    return dict(zip(names, finals))

def preprocess(col, x, y):
    """Count the values of ``df[col]``, returned as a dict ordered by count descending.

    ``x`` and ``y`` are only the column names of the intermediate frame used
    for sorting (labels and counts respectively).
    """
    tally = Counter(df[col])
    ranked = pd.DataFrame({x: tally.keys(), y: tally.values()})
    ranked = ranked.sort_values(by=y, ascending=False)
    return dict(zip(ranked[x], ranked[y]))

def bar_pre(col, values):
    """Append each row's country to the lists of the items found in ``df[col]``.

    Each entry of ``df[col]`` is split on ``', '``; for every resulting item
    that is a key of ``values``, the row's country is appended to
    ``values[item]``.

    Parameters
    ----------
    col : str
        Column of ``df`` holding ``', '``-separated strings.
    values : dict[str, list]
        Mapping from item name to a list. It is mutated in place.

    Returns
    -------
    dict
        The same ``values`` object that was passed in.
    """
    # Walk both columns in lockstep. The previous manual counter was used as an
    # index *label* (df['country'][k]), which only works on a 0..n-1 index.
    for entry, country in zip(df[col], df['country']):
        for item in entry.split(', '):
            if item in values:
                values[item].append(country)
    return values

def bar_col(name, values, colour=False):
    """Draw one bar chart per key of ``values``, counting rows per country.

    ``bar_pre`` fills ``values`` from column ``name``. For each key, the number
    of collected rows per country is charted with ``bars``. When ``colour`` is
    truthy the bars are coloured by that count.

    Side effects: appends the list of keys to ``cols_data`` and the list of
    per-key frames to ``total_data``.
    """
    col_data = bar_pre(name, values)
    cols_data.append(list(col_data.keys()))

    frames = []
    for point, users in col_data.items():
        column = 'days using ' + point
        tally = Counter(users)
        frame = pd.DataFrame({'countries': tally.keys(), column: tally.values()})
        frames.append(frame)
        bars(frame, 'countries', column, 'Countries who use ' + point,
             colour=column if colour else None)
    total_data.append(frames)

In [None]:
# Rebind the module-level `df` to the filled frame. The helpers defined earlier
# (scatter, last_item, preprocess, bar_pre) look `df` up when they are called,
# so from this cell on they read the filled data.
# NOTE(review): `df_by_country` was built from `df` in an earlier cell and is
# not rebuilt here, so with top-to-bottom execution nulls() still inspects the
# frame that `df` referred to before this rebinding.
df = df_filled

In [None]:
# Each spec: (column, null-percentage limit, row-count limit, scatter threshold).
plot_specs = (
    ('total_vaccinations', 10, 15, 1000000),
    ('total_vaccinations_per_hundred', 10, 15, 11),
    ('daily_vaccinations', 2.5, 20, 200000),
    ('daily_vaccinations_per_million', 2.05, 20, 0),
    ('people_fully_vaccinated', 60, 20, 500000),
    ('people_fully_vaccinated_per_hundred', 60, 20, 4),
)

# Pick the countries with usable data for each column, then plot them.
for column, null_limit, row_limit, cutoff in plot_specs:
    countries, title = nulls(column, null_limit, row_limit)
    scatter(countries, title, cutoff)

In [None]:
# Count rows per vaccine combination and keep the ten most frequent.
count = (
    pd.Series(Counter(df['vaccines']))
    .sort_values(ascending=False)
    .iloc[:10]
    .to_dict()
)
pie(count, 'vaccine', 'days using vaccine', '10 most used vaccine combinations')

In [None]:
# Correlation heatmap of the numeric columns. The frame also holds text columns
# (country, iso_code, date, vaccines, ...); DataFrame.corr() raises on those in
# pandas >= 2.0, so restrict to numeric dtypes explicitly.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# Draw on the axes created above rather than on whatever axes is current.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, ax=ax)
plt.show()

In [7]:
# Load the combined dataset, discarding the "Unnamed: 0" column from the CSV.
THE_DF = pd.read_csv("outputs/complete_df.csv").drop(columns=["Unnamed: 0"])

THE_DF.head()

Unnamed: 0,country,iso_code,SE.XPD.TOTL.GB.ZS,NY.GDP.MKTP.CD,NY.GDP.MKTP.PP.CD,NY.GDP.PCAP.CD,NY.GDP.PCAP.PP.CD,IT.NET.USER.P2,SP.POP.GROW,SP.POP.TOTL,...,Health_exp_public_pct_2016,Health_exp_out_of_pocket_pct_2016,Health_exp_per_capita_USD_2016,per_capita_exp_PPP_2016,External_health_exp_pct_2016,Physicians_per_1000_2009-18,Nurse_midwife_per_1000_2009-18,Specialist_surgical_per_1000_2008-18,Completeness_of_birth_reg_2009-18,Completeness_of_death_reg_2008-16
0,Afghanistan,AFG,14.091753,12218640000.0,41653930000.0,414.75561,1422.970941,3.564658,3.406642,27818790.0,...,5.1,77.4,57.2,162.8,17.5,0.3,0.3,0.0,42.3,0.0
1,Albania,ALB,10.917204,10042290000.0,24293050000.0,3420.031627,8274.102258,30.964406,-0.446379,2955390.0,...,41.4,58.0,271.5,759.7,0.7,1.2,3.6,11.6,98.4,53.0
2,Algeria,DZA,11.42941,140430100000.0,432667900000.0,3871.85815,12018.055902,14.49339,1.650143,35559200.0,...,67.7,30.9,260.4,998.2,0.0,1.8,2.2,12.1,100.0,0.0
3,Angola,AGO,8.04397,68586080000.0,116463000000.0,2850.269551,4964.238398,4.167828,3.506682,22438880.0,...,44.1,35.2,95.2,185.8,3.6,0.2,1.3,0.0,25.0,0.0
4,Argentina,ARG,15.794843,361270700000.0,675257900000.0,8731.126258,16461.751899,36.869636,1.051542,40626220.0,...,74.4,15.8,955.2,1531.0,0.6,4.0,2.6,50.1,100.0,100.0


In [None]:
# Pairwise correlations of the numeric columns only: THE_DF also contains text
# columns (country, iso_code), which make DataFrame.corr() raise in pandas >= 2.0.
THE_DF.select_dtypes(include="number").corr()


In [None]:
import matplotlib.pyplot as plt

# Quick matrix view of the correlations between numeric columns. Text columns
# (country, iso_code) are excluded because DataFrame.corr() raises on them in
# pandas >= 2.0.
plt.matshow(THE_DF.select_dtypes(include="number").corr())
plt.show()

In [None]:
import seaborn as sns


# Correlation matrix over the numeric columns only. THE_DF also holds text
# columns (country, iso_code), which make DataFrame.corr() raise in pandas >= 2.0.
corr = THE_DF.select_dtypes(include="number").corr()

# Plot the heatmap, labelling both axes with the column names.
plt.figure(figsize=(20,20))

plt.title('THE_DF - Correlation Matrix')

sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [None]:
# List the column names available in THE_DF.
THE_DF.columns

TODO: add a correlation analysis, then run K-Means on the most strongly correlated variables.

NY.GDP.PCAP.CD - GDP per Capita 

SP.POP.GROW - Population Growth 

SP.POP.TOTL - Total Population 

IT.NET.USER.P2 - Internet Users 



In [None]:
# Work on an independent copy holding only the variables chosen for the
# focused correlation analysis.
df_cor1 = THE_DF[
    [
        'country',
        'religiousity%',
        'US$',
        'NY.GDP.PCAP.CD',
        'SP.POP.GROW',
        'SP.POP.TOTL',
        'IT.NET.USER.P2',
        'january_total_vaccinations',
        'may_total_vaccinations',
        'Health_exp_public_pct_2016',
        'Health_exp_out_of_pocket_pct_2016',
        'Specialist_surgical_per_1000_2008-18',
    ]
].copy()

In [None]:
# Replace the indicator codes with readable labels in a single rename.
readable_names = {
    "NY.GDP.PCAP.CD": "GDP/Capita",
    "SP.POP.GROW": "PopulationGrowth",
    'SP.POP.TOTL': 'PopulationTotal',
    'IT.NET.USER.P2': 'InternetUsers',
    "Health_exp_public_pct_2016": "PublicExpedituresOnHealth",
    "Health_exp_out_of_pocket_pct_2016": "CitizensExpedituresOnHealth",
    'Specialist_surgical_per_1000_2008-18': 'Surgeons/1000 Citizens',
}
df_cor1 = df_cor1.rename(columns=readable_names)
df_cor1.head(4)

In [None]:
# df_cor1 still carries the text column 'country'; correlate the numeric columns
# only, since DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = df_cor1.select_dtypes(include="number").corr()

# Plot the annotated heatmap.
plt.figure(figsize=(10,10))

plt.title('Correlation Matrix for Selected Variables')

# Light-to-dark sequential palette.
cmap = sns.cubehelix_palette(light=1, as_cmap=True)

sns.heatmap(corr,annot=True,cmap=cmap,  xticklabels=corr.columns,yticklabels=corr.columns)