## Import

In [62]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib
import config

from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [38]:
data_path = config.DATA_PATH

# Load the datasets.
# low_memory=False makes pandas parse each file in one pass instead of in
# chunks, so column dtypes are inferred consistently and the mixed-type
# DtypeWarning is not raised on the wide data files.
df = pd.read_csv(data_path + '/training_data/PreFer_train_data.csv', low_memory=False)
df_outcome = pd.read_csv(data_path + '/training_data/PreFer_train_outcome.csv')
df_bg = pd.read_csv(data_path + '/other_data/PreFer_train_background_data.csv', low_memory=False)
df_fk = pd.read_csv(data_path + '/other_data/PreFer_fake_data.csv', low_memory=False)
df_fko = pd.read_csv(data_path + '/other_data/PreFer_fake_outcome.csv')

  df = pd.read_csv(data_path + '/PreFer_train_data.csv')


## Clean data

In [40]:
def clean_df(df, background_df=None, outcome_df=None):
    """
    Preprocess the input dataframe to feed the model.
    # If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command

    Parameters:
    df (pd.DataFrame): The input dataframe containing the raw data (e.g., from PreFer_train_data.csv or PreFer_fake_data.csv).
    background_df (pd.DataFrame): Optional input dataframe containing background data (e.g., from PreFer_train_background_data.csv or PreFer_fake_background_data.csv). Currently unused.
    outcome_df (pd.DataFrame): Optional dataframe with the columns "nomem_encr" and "new_child". When given, it is
        merged onto df and only rows with a known outcome are kept. When omitted, no merge is done; rows are
        filtered on "new_child" only if df already carries that column.

    Returns:
    pd.DataFrame: The cleaned dataframe with only the necessary columns and processed variables.
        "new_child" is included only when it is available.
    """
    outcome_col = "new_child"

    # Merge the outcome data with the df (only when outcome data was supplied)
    if outcome_df is not None:
        df = df.merge(outcome_df, on="nomem_encr")

    # Select rows that have a non-missing value for new_child, if that column exists
    has_outcome = outcome_col in df.columns
    if has_outcome:
        df = df[df[outcome_col].notnull()]

    # Work on an explicit copy so the imputation below neither writes into a
    # slice of the caller's frame nor triggers SettingWithCopyWarning
    df = df.copy()

    # Impute variable migration_background_bg with the mode
    # (skipped when the column is entirely missing and therefore has no mode)
    modes = df["migration_background_bg"].mode()
    if not modes.empty:
        df["migration_background_bg"] = df["migration_background_bg"].fillna(modes.iloc[0])

    # Select variables for modelling
    keepcols = [
        "nomem_encr",               # ID variable required for predictions,
        "migration_background_bg",  # Origin [imputed by PreFer organisers]
        'age_bg',                   # Age of the household member on December 2020 [imputed by PreFer organisers]
    ]
    if has_outcome:
        keepcols.append(outcome_col)  # Whether respondent had child in 2021-2023 [outcome variable]

    # Keeping data with variables selected
    return df[keepcols]

### Background analysis for clean_df function

In [4]:
# Share of missing values per column, restricted to columns that are
# missing in fewer than half of the rows
missing_values = df.isna().mean().loc[lambda share: share < 0.5]
missing_values

nomem_encr                 0.000000
outcome_available          0.000000
birthyear_bg               0.000000
gender_bg                  0.000000
migration_background_bg    0.153007
age_bg                     0.011530
belbezig_2014              0.485510
belbezig_2017              0.489405
burgstat_2014              0.485510
burgstat_2017              0.489405
oplcat_2014                0.488470
oplcat_2017                0.491742
oplmet_2014                0.485510
oplmet_2017                0.489405
oplzon_2014                0.485510
oplzon_2017                0.489405
partner_2014               0.485510
partner_2017               0.489405
sted_2014                  0.493300
woning_2014                0.491275
woning_2017                0.499065
woonvorm_2014              0.485510
woonvorm_2017              0.489405
dtype: float64

In [61]:
# Among rows with outcome_available == 1: share of missing values per column,
# restricted to columns that are missing in fewer than 10% of those rows
missing_values = (
    df.loc[df["outcome_available"] == 1]
    .isna()
    .mean()
    .loc[lambda share: share < 0.1]
)
missing_values

nomem_encr           0.000000
outcome_available    0.000000
cf20m_m              0.061803
cf20m001             0.061803
cf20m003             0.062817
                       ...   
oplzon_2020          0.004053
partner_2020         0.004053
sted_2020            0.012158
woning_2020          0.011145
woonvorm_2020        0.004053
Length: 274, dtype: float64

In [5]:
# Frequency of each migration_background_bg value, counting NaN as its own category
df.loc[:, "migration_background_bg"].value_counts(dropna=False)

0.0      4001
NaN       982
202.0     483
102.0     369
201.0     346
101.0     237
Name: migration_background_bg, dtype: int64

### Test the function

In [25]:
# clean_df merges the outcome data, so the outcome frame must be passed explicitly
clean_df(df, outcome_df=df_outcome)

Unnamed: 0,nomem_encr,migration_background_bg,age_bg,new_child
4,715619,0.0,30.0,0.0
8,716711,0.0,31.0,1.0
18,717188,0.0,38.0,0.0
19,712090,0.0,39.0,0.0
35,709537,101.0,35.0,0.0
...,...,...,...,...
5975,704088,0.0,30.0,1.0
5981,701213,101.0,31.0,0.0
5984,730518,0.0,37.0,1.0
6105,709923,0.0,23.0,0.0


## Model

In [7]:
# Create a machine learning pipeline that trains a model on the data

def model_pipeline():
    """Build the unfitted preprocessing + logistic-regression pipeline.

    Numerical column ('age_bg'): median imputation followed by standardisation.
    Categorical column ('migration_background_bg'): most-frequent imputation
    followed by one-hot encoding with the first level dropped.

    Returns:
    sklearn.pipeline.Pipeline: Pipeline with the steps 'preprocessor' and 'model'.
    """
    # Each entry: (name, per-column-type preprocessing pipeline, columns it applies to)
    column_branches = [
        (
            'num',
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]),
            ['age_bg'],
        ),
        (
            'cat',
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(drop='first')),
            ]),
            ['migration_background_bg'],
        ),
    ]

    # Preprocessing first, then the classifier
    pipeline_steps = [
        ('preprocessor', ColumnTransformer(transformers=column_branches)),
        ('model', logreg(random_state=42)),
    ]
    return Pipeline(steps=pipeline_steps)

In [41]:
# Build the (still unfitted) model pipeline
pipeline = model_pipeline()

# Clean the training data and attach the training outcome
dfclean = clean_df(df, outcome_df=df_outcome)

# First rows, then the (rows, columns) shape of the cleaned training data
print(dfclean.head(), dfclean.shape, sep="\n")

    nomem_encr  migration_background_bg  age_bg  new_child
4       715619                      0.0    30.0        0.0
8       716711                      0.0    31.0        1.0
18      717188                      0.0    38.0        0.0
19      712090                      0.0    39.0        0.0
35      709537                    101.0    35.0        0.0
(987, 4)


In [53]:
# Columns the model is trained on. The ID (nomem_encr) and the target
# (new_child) must not be part of the matrix being resampled: they would
# drive the nearest-neighbour search and be interpolated themselves.
feature_cols = ["age_bg", "migration_background_bg"]

# Create a SMOTENC object: migration_background_bg holds category codes
# (0, 101, 102, ...), so synthetic rows must take an existing code rather
# than a value interpolated between two codes.
smote = SMOTENC(
    categorical_features=[feature_cols.index("migration_background_bg")],
    random_state=42,
)

# Apply the oversampling to the feature columns only
X_train, y_train = smote.fit_resample(dfclean[feature_cols], dfclean["new_child"])

### Save the trained model

In [54]:
# Train the pipeline on the resampled data and persist the fitted object.
# Pipeline.fit returns the pipeline itself, so its result can be dumped directly.
joblib.dump(pipeline.fit(X_train, y_train), 'model.joblib')

['model.joblib']

## Predict

In [44]:
def predict_outcomes(df, background_df=None, outcome_df=df_outcome, model_path="model.joblib"):
    """Generate predictions using the saved model and the input dataframe.

    The predict_outcomes function accepts a Pandas DataFrame as an argument
    and returns a new DataFrame with two columns: nomem_encr and
    prediction. The nomem_encr column in the new DataFrame replicates the
    corresponding column from the input DataFrame. The prediction
    column contains predictions for each corresponding nomem_encr. Each
    prediction is represented as a binary value: '0' indicates that the
    individual did not have a child during 2021-2023, while '1' implies that
    they did.

    Parameters:
    df (pd.DataFrame): The input dataframe for which predictions are to be made.
    background_df (pd.DataFrame): The background dataframe for which predictions are to be made.
    outcome_df (pd.DataFrame): Outcome dataframe forwarded to clean_df, which merges it onto df.
    model_path (str): The path to the saved model file (which is the output of training.py).

    Returns:
    pd.DataFrame: A dataframe containing the identifiers and their corresponding predictions.

    Raises:
    KeyError: If the identifier column 'nomem_encr' is missing from df.
    """
    # NOTE(review): the default for outcome_df is evaluated once, when this
    # function is defined, so it is whatever df_outcome was at that moment.
    # Pass outcome_df explicitly when predicting on any other dataset.

    ## This script contains a bare minimum working example
    # Fail fast: everything below (merge on the ID, output frame) needs the ID column
    if "nomem_encr" not in df.columns:
        raise KeyError("The identifier variable 'nomem_encr' should be in the dataset")

    # Load the model
    # NOTE: joblib.load unpickles the file; only load model files from a trusted source
    model = joblib.load(model_path)

    # Preprocess the fake / holdout data
    df = clean_df(df, background_df, outcome_df)

    # Exclude the ID and the outcome itself: neither is a predictor
    features = df.drop(columns=["nomem_encr", "new_child"], errors="ignore")

    # Generate predictions from model, should be 0 (no child) or 1 (had child)
    predictions = model.predict(features)

    # Output file should be DataFrame with two columns, nomem_encr and predictions
    df_predict = pd.DataFrame(
        {"nomem_encr": df["nomem_encr"], "prediction": predictions}
    )

    # Return only dataset with predictions and identifier
    return df_predict

In [55]:
# Predict on the fake data, using the fake outcome file for the merge done in clean_df
df_pred = predict_outcomes(df_fk, background_df=None, outcome_df=df_fko)

In [58]:
# (rows, columns) of the fake dataset loaded from PreFer_fake_data.csv
print(df_fk.shape)

(30, 31634)


In [56]:
# First predictions, followed by the first rows of the fake outcome file
print(df_pred.head(), df_fko.head(), sep="\n")

   nomem_encr  prediction
0      700001         0.0
1      700002         0.0
2      700003         1.0
3      700004         0.0
4      700005         0.0
   nomem_encr  new_child
0      700001          1
1      700002          0
2      700003          0
3      700004          0
4      700005          0


In [57]:
# Evaluate the model
# Align predictions and observed outcomes on the respondent ID instead of
# relying on both frames having the same rows in the same order; respondents
# without an observed outcome are left out of the evaluation.
df_eval = df_pred.merge(df_fko, on="nomem_encr", how="inner").dropna(subset=["new_child"])
y_pred = df_eval['prediction']
y_val = df_eval['new_child'].astype(float)

# Classification report
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

         0.0       0.74      0.87      0.80        23
         1.0       0.00      0.00      0.00         7

    accuracy                           0.67        30
   macro avg       0.37      0.43      0.40        30
weighted avg       0.57      0.67      0.61        30

Confusion Matrix:
[[20  3]
 [ 7  0]]
