# Preprocessing

Log transformation where necessary, scaling, and random forest imputation.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

## Prep

In [2]:
# Load the full dataset from CSV; the path is relative to the notebook's
# working directory.
data = pd.read_csv("../data/full.csv")

Restrict to OECD destinations:

In [3]:
# ISO3 codes of the destination countries that are kept.
oecd = [
    "AUS", "AUT", "BEL", "CAN", "CHL", "CZE", "DNK",
    "EST", "FIN", "FRA", "DEU", "GRC", "HUN", "ISL",
    "IRL", "ISR", "ITA", "JPN", "KOR", "LUX", "MEX",
    "NLD", "NZL", "NOR", "POL", "PRT", "SVK", "SVN",
    "ESP", "SWE", "CHE", "TUR", "GBR", "USA",
]

# Keep only rows whose destination is in the list above, then renumber the
# index from zero so later positional operations line up with the rows.
data = (
    data
    .loc[lambda frame: frame["destination_iso3"].isin(oecd)]
    .reset_index(drop=True)
)
data.shape

(167400, 31)

One-hot encoding for origin & destination country:

In [4]:
# Encode both country columns in a single pass; the mapping assigns each
# source column its own prefix for the generated indicator columns.
data = pd.get_dummies(
    data,
    columns=["origin_iso3", "destination_iso3"],
    prefix={"origin_iso3": "origin", "destination_iso3": "dest"},
)

## Transformations

In [5]:
# Columns to log-transform: any column whose name contains "conflict_deaths",
# "disaster_deaths", "population", or "gdp" not immediately followed by "_g".
log_features = (
    data
    .filter(regex="conflict_deaths|gdp(?!_g)|disaster_deaths|population")
    .columns
    .tolist()
)

# Every numeric or boolean column except "total" is scaled.
numeric_boolean_features = (
    data
    .select_dtypes(include=["number", "boolean"])
    .drop(columns=["total"])
    .columns
    .tolist()
)

In [6]:
class LogTransformer(FunctionTransformer):
    """Apply ``np.log1p`` to a chosen subset of columns.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to transform. When ``None``, ``transform`` returns an
        unmodified copy of its input.
    **kwargs
        Forwarded to ``FunctionTransformer.__init__``.

    Notes
    -----
    ``transform`` selects and assigns columns by label, so ``X`` is
    presumably expected to be a pandas DataFrame.
    """

    def __init__(self, columns=None, **kwargs):
        super().__init__(func=np.log1p, validate=False, **kwargs)
        self.columns = columns

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` replaced by their log1p."""
        result = X.copy()
        if self.columns is None:
            return result

        # Only the selected columns go through the parent transform; all
        # other columns are carried over from the copy untouched.
        selected = result[self.columns]
        result[self.columns] = super().transform(selected)
        return result

In [7]:
# Log-transform and scaling have to run one after the other on the same
# columns. ColumnTransformer applies each of its entries independently to the
# raw input and concatenates the results, so listing 'log' and 'scaler' as two
# separate entries over the same columns would emit every numeric column twice
# (once logged but unscaled, once scaled but not logged). Chaining them in an
# inner Pipeline yields a single copy of each column that is logged (where
# requested) and then min-max scaled.
numeric_pipeline = Pipeline(steps=[
    ('log', LogTransformer(columns=log_features)),
    ('scaler', MinMaxScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, numeric_boolean_features)
    ],
    # Columns not listed above are forwarded unchanged to the imputer.
    remainder='passthrough'
)

# Preprocess first, then impute missing values iteratively with a random
# forest as the per-feature regressor. Both seeds are fixed for repeatable runs.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', IterativeImputer(estimator=RandomForestRegressor(n_estimators=100, random_state=42), random_state=42))
])

In [None]:
# Fit the preprocessing + imputation pipeline on the full frame and keep the
# transformed result.
# NOTE(review): this cell has no execution count in the saved notebook —
# presumably it is long-running or was never completed; confirm.
processed = pipeline.fit_transform(data)