# Preprocessing

Log-transformation where necessary, scaling & model-based imputation

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

In [2]:
# Read the full dataset from disk; the trailing expression displays (rows, columns).
data = pd.read_csv(filepath_or_buffer="../data/final/full.csv")
data.shape

(13760, 13)

## Engineering additional features

One-hot encoding countries and decade:

In [3]:
# Bucket every year into the decade it belongs to (year minus its last digit).
data["decade"] = data["year"].sub(data["year"].mod(10))

# One indicator column per country code and one per decade.
country_dummies = pd.get_dummies(data["iso3"], dtype=int)
decade_dummies = pd.get_dummies(data["decade"], dtype=int)
data = pd.concat([data, country_dummies, decade_dummies], axis=1)

# The decade dummies are labelled with the decade values themselves, so cast
# every column label to str to keep the column names uniformly strings.
data.columns = data.columns.astype(str)
data.shape

(13760, 236)

## Transformations

In [4]:
# Columns that receive a log1p transform inside the preprocessing pipeline.
log_features = [
    "conflict_deaths",
    "gdp",
    "disaster_deaths_per_100k",
    "population",
]

# Everything except the identifier/helper columns is fed to the pipeline.
to_use = data.columns.drop(["iso3", "year", "decade"]).tolist()

Train/validation/test split by year:

In [5]:
# Temporal split: up to 2015 -> train, 2016-2018 -> validation, after 2018 -> test.
split_year = data["year"]
train = data.loc[split_year.lt(2016)]
val = data.loc[split_year.isin([2016, 2017, 2018])]
test = data.loc[split_year.gt(2018)]

Preprocessing pipeline:

In [6]:
def _log1p_columns(X, columns):
    """Return a copy of ``X`` with ``np.log1p`` applied to selected columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; must contain every name listed in ``columns``.
    columns : list of str
        Names of the columns to replace by ``log1p`` of their values.

    Returns
    -------
    pandas.DataFrame
        A new frame (``DataFrame.assign`` does not modify ``X``) in which the
        listed columns are log-transformed and all others are unchanged.
    """
    return X.assign(**{col: np.log1p(X[col]) for col in columns})


# A named function (rather than a lambda) keeps the fitted pipeline picklable,
# so it can be persisted and reused without refitting.
log_transformer = FunctionTransformer(
    func=_log1p_columns,
    kw_args={"columns": log_features},
    validate=False,  # pass the DataFrame through untouched; the function needs column names
)

# Model-based imputation: each feature is regressed on the others with BayesianRidge.
imputer = IterativeImputer(
    estimator=BayesianRidge(),
    random_state=42,  # fixed seed for reproducibility
    verbose=2,        # logs one line per imputation round
    max_iter=100      # upper bound on imputation rounds
)

# Order matters: log-transform first, then scale, then impute on the
# log-transformed and scaled features.
pipeline = Pipeline(steps=[
    ("log", log_transformer),
    ("scaler", MinMaxScaler()),
    ("imputer", imputer)
])

Transform separately to avoid leakage (fit pipeline on training data, then transform training, validation and testing data separately):

In [7]:
# Fit all pipeline steps on the training rows only; the fitted pipeline is the cell output.
pipeline.fit(train.loc[:, to_use])

[IterativeImputer] Completing matrix with shape (12040, 233)
[IterativeImputer] Ending imputation round 1/100, elapsed time 46.60
[IterativeImputer] Change: 2.0538119989009656, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Ending imputation round 2/100, elapsed time 92.14
[IterativeImputer] Change: 0.4740281780746819, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Ending imputation round 3/100, elapsed time 138.58
[IterativeImputer] Change: 0.07125727070652091, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Ending imputation round 4/100, elapsed time 185.59
[IterativeImputer] Change: 0.01706779592117126, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Ending imputation round 5/100, elapsed time 232.60
[IterativeImputer] Change: 0.005717355611634292, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Ending imputation round 6/100, elapsed time 282.58
[IterativeImputer] Change: 0.0019738977190201235, scaled tolerance: 0.0010000

In [31]:
def _transform_split(split, label):
    """Run one data split through the fitted pipeline and re-attach identifiers.

    Parameters
    ----------
    split : pandas.DataFrame
        Untransformed split containing ``iso3``, ``year`` and every column in ``to_use``.
    label : str
        Value stored in the new ``set`` column (e.g. ``"train"``).

    Returns
    -------
    pandas.DataFrame
        ``iso3`` and ``year`` (index reset to 0..n-1), followed by the transformed
        ``to_use`` columns and a constant ``set`` column.
    """
    # Reset the index so the identifiers line up row-by-row with the fresh
    # RangeIndex of the frame built from the pipeline's array output.
    ids = split[["iso3", "year"]].reset_index(drop=True)
    features = pd.DataFrame(pipeline.transform(split[to_use]), columns=to_use)
    return pd.concat([ids, features], axis=1).assign(set=label)


# NOTE: this cell rebinds train/val/test to their transformed versions. Re-running
# it without first re-running the split cell would transform already-transformed data.
train = _transform_split(train, "train")
val = _transform_split(val, "val")
test = _transform_split(test, "test")

[IterativeImputer] Completing matrix with shape (12040, 233)
[IterativeImputer] Ending imputation round 1/7, elapsed time 0.26
[IterativeImputer] Ending imputation round 2/7, elapsed time 0.36
[IterativeImputer] Ending imputation round 3/7, elapsed time 0.46
[IterativeImputer] Ending imputation round 4/7, elapsed time 0.57
[IterativeImputer] Ending imputation round 5/7, elapsed time 0.68
[IterativeImputer] Ending imputation round 6/7, elapsed time 0.77
[IterativeImputer] Ending imputation round 7/7, elapsed time 0.89
[IterativeImputer] Completing matrix with shape (645, 233)
[IterativeImputer] Ending imputation round 1/7, elapsed time 0.01
[IterativeImputer] Ending imputation round 2/7, elapsed time 0.02
[IterativeImputer] Ending imputation round 3/7, elapsed time 0.02
[IterativeImputer] Ending imputation round 4/7, elapsed time 0.03
[IterativeImputer] Ending imputation round 5/7, elapsed time 0.04
[IterativeImputer] Ending imputation round 6/7, elapsed time 0.06
[IterativeImputer] End

In [33]:
# Stack the three splits back into one frame. Each split carries its own 0..n-1
# index, so build a fresh index instead of keeping duplicate labels.
full = pd.concat([train, val, test], ignore_index=True)

Construct outcome:

In [34]:
# Target: the same country's net_migration one row later (t+1).
# NOTE(review): net_migration at this point is the pipeline output (log/scale/impute
# steps applied to all `to_use` columns), so the target is on that scale — confirm intended.

# shift(-1) is positional: it only means "next year" if each country's rows are
# in consecutive yearly order. Check that instead of assuming it.
year_steps = full.groupby("iso3")["year"].diff().dropna()
assert year_steps.eq(1).all(), "rows must be in consecutive yearly order within each iso3"

# Select the column before shifting so only one column is shifted, not the whole frame.
full["net_migration_tp1"] = full.groupby("iso3")["net_migration"].shift(-1)

# The last row of every country has no t+1 value; drop those rows.
full = full.dropna(subset=["net_migration_tp1"])

Export:

In [36]:
# Persist the preprocessed panel; the DataFrame index is written as the first column.
full.to_csv(path_or_buf="../data/final/preprocessed_full.csv")