# <a id='toc1_'></a>[Question 4 and 5](#toc0_)

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

warnings.filterwarnings("ignore")

**Table of contents**<a id='toc0_'></a>    
- [Question 4 and 5](#toc1_)    
  - [Implementation](#toc1_1_)    
  - [Other Methods To Fill the Missing Data](#toc1_2_)    
    - [Preprocessing](#toc1_2_1_)    
    - [Fill the Missing Data](#toc1_2_2_)    
      - [Using Polynomial Regression](#toc1_2_2_1_)    
      - [Using KNN](#toc1_2_2_2_)    
      - [Using Mean](#toc1_2_2_3_)    
    - [Application](#toc1_2_3_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[Implementation](#toc0_)

In [2]:
# Load the raw statistics, parse the US-style date strings and order rows
# chronologically within each country.
df = pd.read_csv("country_vaccination_stats.csv")

df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

df = df.sort_values(by=['country', 'date'])

# Fill each country's gaps with that country's minimum observed value.
# transform('min') broadcasts the per-country minimum back onto every row, so
# no per-group lambda (and no removed-in-NumPy-2.0 `np.NaN` alias) is needed.
country_minimum = df.groupby('country')['daily_vaccinations'].transform('min')
df['daily_vaccinations'] = df['daily_vaccinations'].fillna(country_minimum)

# Countries without a single observation have a NaN minimum; treat them as 0.
# Reassign instead of calling fillna(..., inplace=True) on the column
# selection: the chained in-place form does not update `df` under pandas
# copy-on-write.
df['daily_vaccinations'] = df['daily_vaccinations'].fillna(0).astype(int)
df.to_csv("country_vaccination_stats_filled_minimum.csv")

df

Unnamed: 0,country,date,daily_vaccinations,vaccines
0,Argentina,2020-12-29,6483,Sputnik V
1,Argentina,2020-12-30,15656,Sputnik V
2,Argentina,2020-12-31,15656,Sputnik V
3,Argentina,2021-01-01,11070,Sputnik V
4,Argentina,2021-01-02,8776,Sputnik V
...,...,...,...,...
1497,Wales,2021-01-20,11105,"Oxford/AstraZeneca, Pfizer/BioNTech"
1498,Wales,2021-01-21,12318,"Oxford/AstraZeneca, Pfizer/BioNTech"
1499,Wales,2021-01-22,15148,"Oxford/AstraZeneca, Pfizer/BioNTech"
1500,Wales,2021-01-23,17371,"Oxford/AstraZeneca, Pfizer/BioNTech"


## <a id='toc1_2_'></a>[Other Methods To Fill the Missing Data](#toc0_)

In [3]:
# Re-read the raw CSV so the alternative imputation methods start from the
# original (unfilled) values rather than the minimum-filled frame built above.
df = pd.read_csv("country_vaccination_stats.csv")

### <a id='toc1_2_1_'></a>[Preprocessing](#toc0_)

In [4]:
# Parse the date strings and order rows chronologically within each country.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
df = df.sort_values(by=['country', 'date'])

# Boolean Series indexed by country: True when every value is missing.
countries_with_no_vaccinations = df['daily_vaccinations'].isna().groupby(df['country']).all()

# Those countries offer nothing to learn from, so their rows are set to 0.
never_reported = countries_with_no_vaccinations[countries_with_no_vaccinations].index
df.loc[df['country'].isin(never_reported), 'daily_vaccinations'] = 0

# One hot encoding for vaccines (the cell holds a ', '-separated list of names)
vaccine_list = df['vaccines'].str.get_dummies(sep=', ')

# Append the indicator columns and drop the original text column.
df = pd.concat([df, vaccine_list], axis=1)
df = df.drop(columns='vaccines')

df

Unnamed: 0,country,date,daily_vaccinations,CNBG,Covaxin,Covishield,Moderna,Oxford/AstraZeneca,Pfizer/BioNTech,Sinopharm,Sinovac,Sputnik V
0,Argentina,2020-12-29,,0,0,0,0,0,0,0,0,1
1,Argentina,2020-12-30,15656.0,0,0,0,0,0,0,0,0,1
2,Argentina,2020-12-31,15656.0,0,0,0,0,0,0,0,0,1
3,Argentina,2021-01-01,11070.0,0,0,0,0,0,0,0,0,1
4,Argentina,2021-01-02,8776.0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1497,Wales,2021-01-20,11105.0,0,0,0,0,1,1,0,0,0
1498,Wales,2021-01-21,12318.0,0,0,0,0,1,1,0,0,0
1499,Wales,2021-01-22,15148.0,0,0,0,0,1,1,0,0,0
1500,Wales,2021-01-23,17371.0,0,0,0,0,1,1,0,0,0


### <a id='toc1_2_2_'></a>[Fill the Missing Data](#toc0_)

#### <a id='toc1_2_2_1_'></a>[Using Polynomial Regression](#toc0_)

In [5]:
def polynomial_regression_impute(df):
    """Fill missing ``daily_vaccinations`` per country with a quadratic fit.

    For each country a degree-2 polynomial regression is fitted on the rows
    whose ``daily_vaccinations`` is known, using the row index label as the
    single regressor, and evaluated at the index labels of the missing rows.
    Negative predictions are clipped to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``country`` and ``daily_vaccinations`` columns and a numeric
        index (the index values are fed to the regression). The frame is
        modified in place; pass a copy to keep the original intact.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the gaps filled.
    """
    # NOTE(review): the regressor is the index label, not the date, so the fit
    # only follows time if index order matches date order -- confirm.

    # Iterate over countries
    for country in df['country'].unique():
        country_df = df[df['country'] == country]

        # Split data
        is_missing = country_df['daily_vaccinations'].isnull()
        missing_data = country_df[is_missing]
        non_missing_data = country_df[~is_missing]

        if missing_data.empty:
            continue

        if non_missing_data.empty:
            # Nothing to fit on: LinearRegression.fit would raise on zero
            # samples. Fall back to 0, the value this notebook uses elsewhere
            # for countries that never reported.
            df.loc[missing_data.index, 'daily_vaccinations'] = 0
            continue

        # Use Polynomial Regression
        X_train = non_missing_data.index.values.reshape(-1, 1)
        y_train = non_missing_data['daily_vaccinations']
        X_test = missing_data.index.values.reshape(-1, 1)

        poly_features = PolynomialFeatures(degree=2)
        X_train_poly = poly_features.fit_transform(X_train)
        X_test_poly = poly_features.transform(X_test)

        model = LinearRegression()
        model.fit(X_train_poly, y_train)
        predicted_values = model.predict(X_test_poly)

        # A vaccination count cannot be negative.
        predicted_values = np.maximum(predicted_values, 0)

        df.loc[missing_data.index, 'daily_vaccinations'] = predicted_values

    return df

#### <a id='toc1_2_2_2_'></a>[Using KNN](#toc0_)

In [6]:
def knn_impute(df):
    """Fill missing values with scikit-learn's ``KNNImputer`` (5 neighbours).

    The ``country`` column is replaced by one-hot indicator columns and
    ``date`` is converted to the number of days since the earliest date, so
    that every column handed to the imputer is numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``country`` column and a datetime ``date`` column; all other
        columns must already be numeric. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new all-numeric frame (no ``country`` column) that keeps the row
        index of ``df``, so results can be aligned back by index label.
    """
    # One Hot Countries
    country_dummies = pd.get_dummies(df['country'], prefix='country')
    features = pd.concat([df, country_dummies], axis=1).drop(columns=["country"])

    # Convert Date to Number
    reference_date = features['date'].min()
    features['date'] = (features['date'] - reference_date).dt.days

    # KNN
    # NOTE(review): features are not scaled, so the day offset may dominate
    # the 0/1 indicator columns in the neighbour distance -- confirm intended.
    imp = KNNImputer(n_neighbors=5)
    imputed = imp.fit_transform(features)

    # Keep the original index: rebuilding with a default RangeIndex would
    # misalign rows when the caller assigns the result back by index label.
    return pd.DataFrame(imputed, columns=features.columns, index=features.index)

#### <a id='toc1_2_2_3_'></a>[Using Mean](#toc0_)

In [7]:
def mean_impute(df):
    """Replace missing ``daily_vaccinations`` with the country's own mean.

    The frame is updated in place and also returned. A country with no
    observed value has a NaN mean, so its rows remain NaN.
    """
    target = 'daily_vaccinations'
    for name in df['country'].unique():
        rows = df['country'] == name
        observed = df.loc[rows, target]
        df.loc[rows, target] = observed.fillna(observed.mean())

    return df

### <a id='toc1_2_3_'></a>[Application](#toc0_)

In [8]:
# Run every imputation strategy on its own copy of the preprocessed frame and
# keep only the filled column, truncated to whole numbers.
polynomial_regression_daily_vaccinations = (
    polynomial_regression_impute(df.copy())["daily_vaccinations"].astype(int)
)
knn_daily_vaccinations = (
    knn_impute(df.copy())["daily_vaccinations"].astype(int)
)
mean_daily_vaccinations = (
    mean_impute(df.copy())["daily_vaccinations"].astype(int)
)

In [9]:
# Re-read the raw CSV: the comparison table is built on the original columns
# (string dates, `vaccines` text), not on the preprocessed frame.
df = pd.read_csv("country_vaccination_stats.csv")

In [10]:
# Add one column per imputation method; pandas aligns each Series to `df` by
# index label.
df = df.assign(
    polynomial_regression=polynomial_regression_daily_vaccinations,
    knn=knn_daily_vaccinations,
    mean=mean_daily_vaccinations,
)

In [11]:
# Save the comparison table; with the default arguments the row index is
# written as the first, unnamed column.
df.to_csv("country_vaccination_stats_other_methods.csv")

In [12]:
# Show the raw values next to the three imputed columns.
df

Unnamed: 0,country,date,daily_vaccinations,vaccines,polynomial_regression,knn,mean
0,Argentina,12/29/2020,,Sputnik V,10301,42269,11446
1,Argentina,12/30/2020,15656.0,Sputnik V,15656,15656,15656
2,Argentina,12/31/2020,15656.0,Sputnik V,15656,15656,15656
3,Argentina,1/1/2021,11070.0,Sputnik V,11070,11070,11070
4,Argentina,1/2/2021,8776.0,Sputnik V,8776,8776,8776
...,...,...,...,...,...,...,...
1497,Wales,1/20/2021,11105.0,"Oxford/AstraZeneca, Pfizer/BioNTech",11105,11105,11105
1498,Wales,1/21/2021,12318.0,"Oxford/AstraZeneca, Pfizer/BioNTech",12318,12318,12318
1499,Wales,1/22/2021,15148.0,"Oxford/AstraZeneca, Pfizer/BioNTech",15148,15148,15148
1500,Wales,1/23/2021,17371.0,"Oxford/AstraZeneca, Pfizer/BioNTech",17371,17371,17371
