# Preprocessing

Log-transformation where necessary, scaling, and model-based imputation.

In [9]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
#from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge

In [10]:
# Load the full dataset (one row per iso3/year combination is assumed -- TODO confirm).
data = pd.read_csv("../data/final/full.csv")

# ISO3 codes of the OECD countries, kept for the optional restriction below.
oecd = [
    "AUS", "AUT", "BEL", "CAN", "CHL", "CZE",
    "DNK", "EST", "FIN", "FRA", "DEU", "GRC",
    "HUN", "ISL", "IRL", "ISR", "ITA", "JPN",
    "KOR", "LUX", "MEX", "NLD", "NZL", "NOR",
    "POL", "PRT", "SVK", "SVN", "ESP", "SWE",
    "CHE", "TUR", "GBR", "USA",
]

# NOTE: the OECD restriction is currently disabled, so all countries are kept.
# Uncomment the next line to restrict the data to the codes listed above.
#data = data[data["iso3"].isin(oecd)].reset_index(drop=True)
data.shape

(13760, 13)

## Engineering additional features

One-hot encoding countries and years:

In [11]:
# One-hot encode country (iso3) and year.
country_dummies = pd.get_dummies(data["iso3"], dtype=int)
year_dummies = pd.get_dummies(data["year"], dtype=int)

# Dummy labels derived from `year` may be non-string (e.g. integers); cast them
# so that all column names are strings, which downstream scikit-learn steps expect.
year_dummies.columns = year_dummies.columns.astype(str)

# Drop dummy columns left over from a previous run of this cell, so that
# re-running it does not append duplicate columns to `data`.
dummy_columns = [*country_dummies.columns, *year_dummies.columns]
data = pd.concat([
    data.drop(columns=dummy_columns, errors="ignore"),
    country_dummies,
    year_dummies
], axis=1)

# Ensure every column label (original and dummy) is a string.
data.columns = data.columns.astype(str)

## Transformations

In [12]:
# Columns that receive a log1p transform before scaling.
log_features = [
    "conflict_deaths",
    "gdp",
    "disaster_deaths_per_100k",
    "population",
]

# Columns routed through the numeric (log + min-max scaling) pipeline;
# the log-transformed columns come first.
numeric_features = log_features + [
    "gdp_growth",
    "internet_usage",
    "liberal_dem",
    "pop_growth",
    "unemployment_youth",
]

# Pipeline inputs: every column except net_migration and the raw iso3/year columns.
excluded = ["net_migration", "iso3", "year"]
to_use = list(data.drop(columns=excluded).columns)

In [13]:
def _log1p_log_features(X):
    """Return a copy of ``X`` with ``np.log1p`` applied to the ``log_features`` columns.

    ``X`` must be a DataFrame containing every column in ``log_features``;
    all other columns are returned unchanged.

    Defined as a named function rather than a lambda so that the transformer
    (and any pipeline containing it) remains picklable.
    """
    return X.assign(**{col: np.log1p(X[col]) for col in log_features})


log_transformer = FunctionTransformer(
    func=_log1p_log_features,
    validate=False
)

# Numeric columns: log1p on the selected columns, then scale each column to [0, 1].
numeric_pipeline = Pipeline(steps=[
    ('log', log_transformer),
    ('scaler', MinMaxScaler())
])

# NOTE: the ColumnTransformer output places the `num` columns first (in
# `numeric_features` order), followed by the passthrough columns in their
# original order -- i.e. NOT necessarily the order of the input columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features)
    ],
    remainder='passthrough'
)

# Model-based imputation: each feature with missing values is regressed on
# the other features using BayesianRidge, for up to `max_iter` rounds.
imputer = IterativeImputer(
    estimator=BayesianRidge(),
    random_state=42,
    verbose=2,
    max_iter=100
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', imputer)
])

In [14]:
# Fit the preprocessing + imputation pipeline on the selected columns and
# transform them in one call. With verbose=2 the imputer logs every round.
model_inputs = data[to_use]
transformed = pipeline.fit_transform(model_inputs)

[IterativeImputer] Completing matrix with shape (13760, 289)
[IterativeImputer] Ending imputation round 1/100, elapsed time 148.51
[IterativeImputer] Change: 2.210938100254739, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 2/100, elapsed time 307.81
[IterativeImputer] Change: 0.5334706867582633, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 3/100, elapsed time 535.70
[IterativeImputer] Change: 0.507622791482098, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 4/100, elapsed time 690.81
[IterativeImputer] Change: 0.1007107703745424, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 5/100, elapsed time 833.17
[IterativeImputer] Change: 0.059585537263403834, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 6/100, elapsed time 975.07
[IterativeImputer] Change: 0.0451479189394003, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 7/100, elapsed time 1220.43
[IterativeImpute

In [15]:
# The ColumnTransformer emits the `num` columns first (in `numeric_features`
# order), followed by the passthrough remainder in its original order. The
# array must therefore be labelled in that order -- labelling it with `to_use`
# directly would attach the wrong names to the values whenever the input
# column order differs from the output order.
output_columns = [
    *numeric_features,
    *(col for col in to_use if col not in numeric_features),
]

# Label the array correctly, then restore the `to_use` column order so the
# layout of `final` stays the same as before. Reusing `data.index` keeps the
# rows aligned in the concat even if `data` does not have a default RangeIndex.
transformed_df = pd.DataFrame(
    transformed, columns=output_columns, index=data.index
)[to_use]

final = pd.concat([
    data[["iso3", "year", "net_migration"]],
    transformed_df
], axis=1)

In [16]:
# Persist the preprocessed table. With to_csv's default settings the row index
# is written too, as the first (unnamed) column.
output_path = "../data/final/preprocessed_full.csv"
final.to_csv(output_path)