## Pre-process data for training

In this notebook, we will:
- Read the Dataset
- Handle NaN values by
    - Filling in a country's missing regional indicator from its other rows (or a placeholder label when none is available)
    - Replacing each missing numeric value with the country-level mean
- Encode the year as an ordinal categorical variable starting from 0
- Standardize the numeric feature columns (`Ladder score` and `year` are left unscaled)
- Create a one-hot encoding for the regional indicator variable
- Save the processed data to a CSV file for use in training models

In [15]:
# Locally defined imports
from src.utils import view_all_df

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import MissingIndicator

### Load and Visualize

In [27]:
# Read the raw dataset from its (relative) CSV path into a DataFrame.
raw_data_path = '../data/final_data.csv'
df = pd.read_csv(raw_data_path)

In [28]:
# Show every row of one country to get a feel for the raw records.
is_united_states = df['Country name'] == 'United States'
view_all_df(df[is_united_states])

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual,rank,year,Positive affect,Negative affect
1977,United States,,7.182,,,,10.924,0.965,68.06,0.911,,0.6,,,,,,,,,5,2006,0.827,0.261
1978,United States,,7.513,,,,10.933,,68.22,0.872,0.197,0.633,,,,,,,,,3,2007,0.829,0.232
1979,United States,,7.28,,,,10.922,0.953,68.38,0.878,0.255,0.668,,,,,,,,,10,2008,0.872,0.227
1980,United States,,7.158,,,,10.888,0.912,68.54,0.831,0.201,0.665,,,,,,,,,8,2009,0.843,0.262
1981,United States,,7.164,,,,10.905,0.926,68.7,0.828,0.244,0.69,,,,,,,,,14,2010,0.861,0.231
1982,United States,,7.115,,,,10.913,0.922,68.68,0.863,0.161,0.697,,,,,,,,,13,2011,0.836,0.273
1983,United States,,7.026,,,,10.928,0.903,68.66,0.823,0.215,0.71,,,,,,,,,17,2012,0.834,0.26
1984,United States,,7.249,,,,10.939,0.925,68.64,0.792,0.274,0.747,,,,,,,,,12,2013,0.814,0.26
1985,United States,,7.151,,,,10.956,0.902,68.62,0.866,0.221,0.702,,,,,,,,,12,2014,0.834,0.281
1986,United States,,6.864,,,,10.977,0.904,68.6,0.849,0.219,0.698,,,,,,,,,15,2015,0.814,0.275


In [29]:
# Columns kept for modelling.
# The ORDER matters: later cells address these columns by position
# (position 1 is removed before the groupby, `iloc[:, 2:]` receives the
# filled values, and `columns[3:9]` selects the columns to standardize).
cols_for_prediction = [
    'Country name',
    'Regional indicator',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'year',
]

In [30]:
# Restrict the raw frame to the modelling columns.
X = df.loc[:, cols_for_prediction]

# Re-base the year so that the earliest year in the data becomes 0.
min_year = X['year'].min()
X.loc[:, 'year'] = X['year'].sub(min_year)

# Summary of every column (numeric and non-numeric) after the re-basing.
X.describe(include='all')

Unnamed: 0,Country name,Regional indicator,Ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,year
count,2098,149,2098.0,2062.0,2085.0,2043.0,2066.0,2009.0,1988.0,2098.0
unique,166,10,,,,,,,,
top,Ukraine,Sub-Saharan Africa,,,,,,,,
freq,16,36,,,,,,,,
mean,,,5.471402,9.37306,0.812709,63.478503,0.746094,-0.001027,0.74565,8.768827
std,,,1.112676,1.154247,0.118203,7.46878,0.140766,0.1614,0.186261,4.486449
min,,,2.375,6.635,0.29,32.3,0.258,-0.335,0.035,0.0
25%,,,4.65225,8.4705,0.75,58.7045,0.652,-0.115,0.68875,5.0
50%,,,5.392,9.462,0.835,65.28,0.767,-0.027,0.801,9.0
75%,,,6.2825,10.36075,0.905,68.66,0.859,0.089,0.869,13.0


### Handle NAN values

#### Fill Regional indicator

In [31]:
# Distinct country names, in order of first appearance in X.
country_names = pd.unique(X['Country name'])
print(f'Total countries = {len(country_names)}')

Total countries = 166


In [32]:
# For each country, copy its most frequent non-null region onto all of its
# rows. Countries whose rows never carry a region are collected for reporting.
countries_without_region = []
for country in country_names:
    in_country = X['Country name'] == country
    region_modes = X.loc[in_country, 'Regional indicator'].mode(dropna=True)

    # An empty mode means every row of this country has a missing region.
    if len(region_modes) == 0:
        countries_without_region.append(country)
        continue

    X.loc[in_country, 'Regional indicator'] = region_modes.iloc[0]

print(f'{len(countries_without_region)} countries without region. Complete List: \n {countries_without_region}')

17 countries without region. Complete List: 
 ['Angola', 'Belize', 'Bhutan', 'Central African Republic', 'Congo (Kinshasa)', 'Cuba', 'Djibouti', 'Guyana', 'Oman', 'Qatar', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago']


In [33]:
# Countries that still have no region get a placeholder label. The label text
# is kept exactly as-is because it becomes part of a one-hot column name
# ('Regional indicator_Unkown') in the saved data.
region_filled = X['Regional indicator'].fillna('Unkown')
X.loc[:, 'Regional indicator'] = region_filled

#### Fill numeric values with country level mean

In [34]:
# Positions of every column except 'Regional indicator' (position 1); the
# grouping key 'Country name' (position 0) must stay in the selection.
all_but_region_col_index = [pos for pos in range(X.shape[1]) if pos != 1]


def _fill_with_group_mean(col):
    """Return `col` with its NaNs replaced by the mean of `col` itself."""
    return col.fillna(col.mean())


# Fill with country level mean: the transform is applied per country, so each
# gap is filled from the same country's other rows. The result has one column
# per non-key column, matching positions 2 onwards of X.
country_filled = (
    X.iloc[:, all_but_region_col_index]
    .groupby('Country name')
    .transform(_fill_with_group_mean)
)
X.iloc[:, 2:] = country_filled

#### Drop any remaining NA values that can't be filled

In [35]:
# Discard every row that still holds at least one NaN after the fills above,
# then renumber the rows from 0 without keeping the old index as a column.
X = X.dropna(axis=0, how='any').reset_index(drop=True)
X

Unnamed: 0,Country name,Regional indicator,Ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,year
0,Afghanistan,South Asia,3.724,7.370,0.451,50.800,0.718,0.168,0.882,3
1,Afghanistan,South Asia,4.402,7.540,0.552,51.200,0.679,0.190,0.850,4
2,Afghanistan,South Asia,4.758,7.647,0.539,51.600,0.600,0.121,0.707,5
3,Afghanistan,South Asia,3.832,7.620,0.521,51.920,0.496,0.162,0.731,6
4,Afghanistan,South Asia,3.783,7.705,0.521,52.240,0.531,0.236,0.776,7
...,...,...,...,...,...,...,...,...,...,...
2080,Zimbabwe,Sub-Saharan Africa,3.638,8.016,0.754,55.000,0.753,-0.098,0.751,12
2081,Zimbabwe,Sub-Saharan Africa,3.616,8.049,0.775,55.600,0.763,-0.068,0.844,13
2082,Zimbabwe,Sub-Saharan Africa,2.694,7.950,0.759,56.200,0.632,-0.064,0.831,14
2083,Zimbabwe,Sub-Saharan Africa,3.160,7.829,0.717,56.800,0.643,-0.009,0.789,15


### Scaling and encoding

#### Standardize Numeric columns in the dataset

In [36]:
# Columns at positions 3..8 ('Logged GDP per capita' through
# 'Perceptions of corruption'); 'Ladder score' (position 2) and 'year'
# (position 9) are deliberately outside this slice and stay unscaled.
numeric_cols = X.columns[3:9]

# NOTE(review): the scaler is fit on all rows of X; no train/test split is
# visible in this notebook — confirm this is acceptable for the later models.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X.loc[:, numeric_cols])
X.loc[:, numeric_cols] = scaled_values
X

Unnamed: 0,Country name,Regional indicator,Ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,year
0,Afghanistan,South Asia,3.724,-1.742158,-3.085132,-1.725207,-0.207794,1.052682,0.751257,3
1,Afghanistan,South Asia,4.402,-1.594696,-2.225499,-1.671500,-0.486326,1.189766,0.580125,4
2,Afghanistan,South Asia,4.758,-1.501881,-2.336145,-1.617792,-1.050533,0.759821,-0.184622,5
3,Afghanistan,South Asia,3.832,-1.525302,-2.489347,-1.574826,-1.793286,1.015295,-0.056273,6
4,Afghanistan,South Asia,3.783,-1.451570,-2.489347,-1.531860,-1.543321,1.476396,0.184382,7
...,...,...,...,...,...,...,...,...,...,...
2080,Zimbabwe,Sub-Saharan Africa,3.638,-1.181801,-0.506232,-1.161279,0.042171,-0.604787,0.050684,12
2081,Zimbabwe,Sub-Saharan Africa,3.616,-1.153176,-0.327496,-1.080718,0.113590,-0.417854,0.548038,13
2082,Zimbabwe,Sub-Saharan Africa,2.694,-1.239051,-0.463676,-1.000157,-0.821993,-0.392930,0.478515,14
2083,Zimbabwe,Sub-Saharan Africa,3.160,-1.344009,-0.821147,-0.919596,-0.743433,-0.050220,0.253904,15


#### Encode regional indicator

In [37]:
# One-hot encode the region: 'Regional indicator' is replaced by one
# 'Regional indicator_<region>' column per distinct region.
# The dtype is pinned so the indicators are always numeric 0/1 (as in the
# recorded output). Without it, pandas >= 2.0 defaults to bool and the saved
# CSV would contain True/False instead of 0/1.
X = pd.get_dummies(X, columns=['Regional indicator'], dtype=np.uint8)
X

Unnamed: 0,Country name,Ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,year,Regional indicator_Central and Eastern Europe,Regional indicator_Commonwealth of Independent States,Regional indicator_East Asia,Regional indicator_Latin America and Caribbean,Regional indicator_Middle East and North Africa,Regional indicator_North America and ANZ,Regional indicator_South Asia,Regional indicator_Southeast Asia,Regional indicator_Sub-Saharan Africa,Regional indicator_Unkown,Regional indicator_Western Europe
0,Afghanistan,3.724,-1.742158,-3.085132,-1.725207,-0.207794,1.052682,0.751257,3,0,0,0,0,0,0,1,0,0,0,0
1,Afghanistan,4.402,-1.594696,-2.225499,-1.671500,-0.486326,1.189766,0.580125,4,0,0,0,0,0,0,1,0,0,0,0
2,Afghanistan,4.758,-1.501881,-2.336145,-1.617792,-1.050533,0.759821,-0.184622,5,0,0,0,0,0,0,1,0,0,0,0
3,Afghanistan,3.832,-1.525302,-2.489347,-1.574826,-1.793286,1.015295,-0.056273,6,0,0,0,0,0,0,1,0,0,0,0
4,Afghanistan,3.783,-1.451570,-2.489347,-1.531860,-1.543321,1.476396,0.184382,7,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080,Zimbabwe,3.638,-1.181801,-0.506232,-1.161279,0.042171,-0.604787,0.050684,12,0,0,0,0,0,0,0,0,1,0,0
2081,Zimbabwe,3.616,-1.153176,-0.327496,-1.080718,0.113590,-0.417854,0.548038,13,0,0,0,0,0,0,0,0,1,0,0
2082,Zimbabwe,2.694,-1.239051,-0.463676,-1.000157,-0.821993,-0.392930,0.478515,14,0,0,0,0,0,0,0,0,1,0,0
2083,Zimbabwe,3.160,-1.344009,-0.821147,-0.919596,-0.743433,-0.050220,0.253904,15,0,0,0,0,0,0,0,0,1,0,0


### Save data to CSV

In [38]:
# Persist the processed frame for the training notebooks; the row index is
# not written to the file.
training_data_path = '../data/final_training_data.csv'
X.to_csv(training_data_path, index=False)