# Preprocessing

Log-transformation where necessary, scaling, and model-based imputation

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
#from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge

In [2]:
# ISO3 country codes of the OECD members the analysis is restricted to.
oecd = [
    "AUS", "AUT", "BEL", "CAN", "CHL", "CZE", "DNK", "EST", "FIN", "FRA",
    "DEU", "GRC", "HUN", "ISL", "IRL", "ISR", "ITA", "JPN", "KOR", "LUX",
    "MEX", "NLD", "NZL", "NOR", "POL", "PRT", "SVK", "SVN", "ESP", "SWE",
    "CHE", "TUR", "GBR", "USA",
]

# Read the full table, keep only the OECD rows and renumber the index from 0
# so later positional concatenation lines up.
data = (
    pd.read_csv("../data/final/full.csv")
    .loc[lambda frame: frame["iso3"].isin(oecd)]
    .reset_index(drop=True)
)
data.shape

(2176, 13)

## Engineering additional features

One-hot encoding countries and years:

In [4]:
# 0/1 indicator columns: one per country code and one per year.
country_dummies = pd.get_dummies(data["iso3"], dtype=int)
year_dummies = pd.get_dummies(data["year"], dtype=int)

# NOTE: this appends to `data`, so re-running the cell adds the dummy columns again.
data = pd.concat([data, country_dummies, year_dummies], axis=1)

# The year dummies are labelled with the raw year values (presumably ints);
# cast every label to str so all column names share one type, which
# scikit-learn's feature-name handling expects.
data.columns = data.columns.astype(str)

## Transformations

In [15]:
# Columns that get a log1p transform before scaling.
log_features = [
    "conflict_deaths",
    "gdp",
    "disaster_deaths_per_100k",
    "population",
]

# Every column that is min-max scaled; the log-transformed ones come first.
numeric_features = log_features + [
    "gdp_growth",
    "internet_usage",
    "liberal_dem",
    "pop_growth",
    "unemployment_youth",
]

# Columns fed to the pipeline: everything except the two identifiers and
# net_migration (presumably the target, which is left un-imputed).
excluded_columns = ["net_migration", "iso3", "year"]
to_use = data.drop(columns=excluded_columns).columns.tolist()

In [16]:
def _log1p_selected(X):
    """Return a copy of ``X`` where each column in ``log_features`` is replaced by its ``log1p``.

    ``X`` must be a DataFrame containing every name in ``log_features``;
    all other columns are returned unchanged.
    """
    logged = {col: np.log1p(X[col]) for col in log_features}
    return X.assign(**logged)


# validate=False keeps the input as a DataFrame so columns can be addressed by name.
log_transformer = FunctionTransformer(func=_log1p_selected, validate=False)

# Numeric columns: log1p where requested, then rescale each column to [0, 1].
numeric_pipeline = Pipeline(steps=[
    ("log", log_transformer),
    ("scaler", MinMaxScaler()),
])

# Only `numeric_features` are transformed. All remaining columns (the dummies)
# pass through untouched and are placed AFTER the numeric block in the output.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_pipeline, numeric_features)],
    remainder="passthrough",
)

# Model-based imputation: each feature with missing values is regressed on the
# others with a Bayesian ridge model, for at most 100 rounds.
imputer = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=100,
    random_state=42,
    verbose=2,
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("imputer", imputer),
])

In [20]:
# Fit the preprocessing + imputation pipeline on the selected columns and
# transform them in one pass.
pipeline_input = data[to_use]
transformed = pipeline.fit_transform(pipeline_input)

[IterativeImputer] Completing matrix with shape (2176, 108)
[IterativeImputer] Ending imputation round 1/500, elapsed time 2.32
[IterativeImputer] Change: 1.0624056545265685, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 2/500, elapsed time 4.98
[IterativeImputer] Change: 0.1276755512673873, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 3/500, elapsed time 7.88
[IterativeImputer] Change: 0.025764768238444383, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 4/500, elapsed time 10.90
[IterativeImputer] Change: 0.004373384141189915, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 5/500, elapsed time 13.23
[IterativeImputer] Change: 0.0009405761011839348, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Early stopping criterion reached.


In [22]:
# The ColumnTransformer does NOT preserve the input column order: it emits the
# transformed `numeric_features` first and appends the passthrough remainder
# (in its original order) afterwards. Labelling the array with `to_use`
# directly would attach names to the wrong columns, so rebuild the labels in
# the order the pipeline actually produced them.
passthrough_features = [col for col in to_use if col not in numeric_features]
transformed_columns = [*numeric_features, *passthrough_features]

# Guard against the imputer silently dropping columns (e.g. all-NaN features).
assert transformed.shape[1] == len(transformed_columns), (
    f"pipeline returned {transformed.shape[1]} columns, "
    f"expected {len(transformed_columns)}"
)

transformed_frame = pd.DataFrame(
    transformed,
    columns=transformed_columns,
    index=data.index,
)

# Re-attach the identifier columns and net_migration, and restore the original
# `to_use` column order so the saved file keeps the same layout as before.
final = pd.concat([
    data[["iso3", "year", "net_migration"]],
    transformed_frame[to_use]
], axis=1)

In [24]:
# Persist the preprocessed table. With the default settings the DataFrame
# index is written as an extra, unnamed first column.
preprocessed_path = "../data/final/preprocessed.csv"
final.to_csv(preprocessed_path)