# Preprocessing

Log transformation where necessary, scaling, and imputation using Bayesian ridge regression.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
#from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge

## Prep

In [2]:
# Load the full dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="../data/full.csv")

Restrict to OECD destinations:

In [3]:
# ISO 3166-1 alpha-3 codes of the 34 destination countries that are kept.
# NOTE(review): LVA, LTU, COL and CRI are not in this list -- confirm that
# leaving them out is intended.
oecd = (
    "AUS AUT BEL CAN CHL CZE DNK "
    "EST FIN FRA DEU GRC HUN ISL "
    "IRL ISR ITA JPN KOR LUX MEX "
    "NLD NZL NOR POL PRT SVK SVN "
    "ESP SWE CHE TUR GBR USA"
).split()

# Keep only rows whose destination is in the list, then renumber the rows
# 0..n-1 so that later positional results line up with `data` again.
data = (
    data
    .loc[lambda frame: frame["destination_iso3"].isin(oecd)]
    .reset_index(drop=True)
)
data.shape

(167400, 31)

## Transformations

I would like to one-hot encode countries to pass that information along as well, but it unfortunately pushes the computation time for model-based imputation far beyond reasonable bounds, so we are just rolling with the numeric features.

In [4]:
# Model inputs: every numeric column except those whose name starts with
# "total" (these are kept out of scaling and imputation). The prefix test is
# applied to the numeric columns only, so a non-numeric "total*" column cannot
# make this step fail.
numeric_features = [
    col
    for col in data.select_dtypes(include=["number"]).columns
    if not str(col).startswith("total")
]

# Columns that receive a log1p transform: conflict/disaster death counts,
# GDP columns (the negative lookahead skips names where "gdp" is followed by
# "_g" -- presumably the growth-rate columns) and population.
# The pattern is matched against `numeric_features` rather than all of `data`,
# which guarantees that every log feature is actually present in the frame the
# log transformer receives (otherwise the column lookup there raises KeyError).
log_features = list(
    data[numeric_features]
    .filter(regex="conflict_deaths|gdp(?!_g)|disaster_deaths|population")
    .columns
)

In [5]:
def log1p_columns(X, columns):
    """Return a copy of ``X`` with ``np.log1p`` applied to selected columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to transform. It must contain every name in ``columns``;
        ``DataFrame.assign`` is used, so a plain array will not work.
    columns : list of str
        Names of the columns to replace with their ``log1p`` values.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; ``X`` itself is
        not modified.
    """
    return X.assign(**{col: np.log1p(X[col]) for col in columns})


# A named function with explicit `kw_args` (instead of a lambda closing over
# the global `log_features`) binds the column list when the transformer is
# built and keeps the fitted pipeline picklable.
log_transformer = FunctionTransformer(
    func=log1p_columns,
    kw_args={"columns": log_features},
    validate=False,  # keep the DataFrame as-is so columns can be addressed by name
)

# Log-transform first, then rescale every feature with min-max scaling.
numeric_pipeline = Pipeline(steps=[
    ('log', log_transformer),
    ('scaler', MinMaxScaler())
])

# Only the numeric features are transformed; any other column handed to the
# preprocessor is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features)
    ],
    remainder='passthrough'
)

# Imputation runs on the scaled features. `max_iter` is an upper bound on the
# number of imputation rounds; `verbose=2` prints progress after every round.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=42, verbose=2, max_iter=500))
    # Alternative estimator (disabled):
    #('imputer', IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, random_state=42), random_state=42, verbose=2))
])

In [6]:
# Fit the preprocessing + imputation pipeline on the numeric features and
# transform them in the same pass. This is the expensive cell: each imputation
# round took roughly 6-7 seconds in the recorded run.
processed = pipeline.fit_transform(data.loc[:, numeric_features])

[IterativeImputer] Completing matrix with shape (167400, 26)
[IterativeImputer] Ending imputation round 1/500, elapsed time 7.01
[IterativeImputer] Change: 2.9688210778880895, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 2/500, elapsed time 13.28
[IterativeImputer] Change: 1.8640915006901981, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 3/500, elapsed time 19.80
[IterativeImputer] Change: 4.280985559274475, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 4/500, elapsed time 26.52
[IterativeImputer] Change: 1.5666651583245632, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 5/500, elapsed time 33.01
[IterativeImputer] Change: 1.1776346688626254, scaled tolerance: 0.0010000000000000005 
[IterativeImputer] Ending imputation round 6/500, elapsed time 39.59
[IterativeImputer] Change: 0.7512913320652665, scaled tolerance: 0.0010000000000000005 

In [10]:
# Re-attach the identifier columns and `total_linear` (taken from `data`, so
# untouched by the pipeline) to the scaled and imputed features.
# The imputed array is given `data`'s own index so the column-wise concat
# aligns row by row explicitly, instead of relying on `data` happening to have
# a default 0..n-1 index.
# The scaled copy of "year" is dropped because the original "year" is already
# carried over from `data`.
final = pd.concat(
    [
        data[["origin_iso3", "destination_iso3", "year", "total_linear"]],
        pd.DataFrame(processed, columns=numeric_features, index=data.index).drop(columns="year"),
    ],
    axis=1,
)

In [12]:
# Persist the processed table. The row index is written as the first
# (unnamed) column -- this is pandas' default, spelled out here so readers of
# the file know to expect it.
final.to_csv(path_or_buf="../data/processed_data.csv", index=True)