## Import

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib
import re
import config

from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [20]:
data_path = config.DATA_PATH

# Load the datasets
df = pd.read_csv(data_path + '/training_data/PreFer_train_data.csv')
df_outcome = pd.read_csv(data_path + '/training_data/PreFer_train_outcome.csv')
df_bg = pd.read_csv(data_path + '/other_data/PreFer_train_background_data.csv')
df_fk = pd.read_csv(data_path + '/other_data/PreFer_fake_data.csv')
df_fko = pd.read_csv(data_path + '/other_data/PreFer_fake_outcome.csv')

  df = pd.read_csv(data_path + '/PreFer_train_data.csv')


## Clean data

In [15]:
def clean_df(df, background_df=None, outcome_df=None):
    """
    Preprocess the input dataframe to feed the model.
    # If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command

    Parameters:
    df (pd.DataFrame): The input dataframe containing the raw data (e.g., from PreFer_train_data.csv or PreFer_fake_data.csv).
    background (pd.DataFrame): Optional input dataframe containing background data (e.g., from PreFer_train_background_data.csv or PreFer_fake_background_data.csv).

    Returns:
    pd.DataFrame: The cleaned dataframe with only the necessary columns and processed variables.
    """

    # Merge the outcome data with the df
    df = df.merge(outcome_df, on="nomem_encr")

    # Select rows that have a non-missing value for new_child
    df = df[df["new_child"].notnull()]

    # Select only those variables that have under 10% missing values
    keepcols = [col for col in df.columns if df[col].isnull().mean() < 0.1]

    # Exclude those columns whose name is the format of 'LLNNL_L' where L is a 
    # lowercase letter and N is a number OR whose data type is object
    date_columns = [col for col in df.columns if re.match(r'[a-z]{2}\d{2}[a-z]_[a-z]', col)]
    object_columns = [col for col in df.columns if df[col].dtype == 'object']
    keepcols = [col for col in keepcols if col not in date_columns and col not in object_columns]
    
    # Keeping data with variables selected
    df = df[keepcols]

    return df

### Background analysis for clean_df function

In [8]:
# Display df.columns that have less than 10% missing values for rows where outcome_available = 1
missing_values = df[df["outcome_available"] == 1].isnull().mean()
missing_values = missing_values[missing_values < 0.1]
missing_values

nomem_encr           0.000000
outcome_available    0.000000
cf20m_m              0.061803
cf20m001             0.061803
cf20m003             0.062817
                       ...   
oplzon_2020          0.004053
partner_2020         0.004053
sted_2020            0.012158
woning_2020          0.011145
woonvorm_2020        0.004053
Length: 274, dtype: float64

### Test the function

In [16]:
cleaned = clean_df(df, None, df_outcome)
cleaned

Unnamed: 0,nomem_encr,outcome_available,cf20m001,cf20m003,cf20m004,cf20m024,cf20m128,cf20m388,cf20m389,cf20m390,...,nettoink_2020,nettoink_f_2020,oplcat_2020,oplmet_2020,oplzon_2020,partner_2020,sted_2020,woning_2020,woonvorm_2020,new_child
4,715619,1,16.0,1.0,30.0,1.0,3.0,3.0,3.0,3.0,...,,,4.0,4.0,5.0,1.0,3.0,1.0,2.0,0.0
8,716711,1,16.0,2.0,31.0,1.0,1.0,1.0,5.0,3.0,...,2166.0,2166.0,5.0,5.0,5.0,1.0,2.0,1.0,2.0,1.0
18,717188,1,16.0,2.0,37.0,2.0,3.0,1.0,5.0,3.0,...,,,5.0,5.0,5.0,1.0,4.0,1.0,3.0,0.0
19,712090,1,2.0,2.0,39.0,1.0,2.0,1.0,5.0,5.0,...,1700.0,1700.0,5.0,5.0,5.0,1.0,5.0,1.0,2.0,0.0
35,709537,1,16.0,2.0,35.0,2.0,3.0,2.0,3.0,3.0,...,,,4.0,4.0,5.0,1.0,3.0,1.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5975,704088,1,,,,,,,,,...,6250.0,6250.0,4.0,4.0,4.0,1.0,5.0,1.0,2.0,1.0
5981,701213,1,,,,,,,,,...,1800.0,1800.0,6.0,6.0,6.0,1.0,2.0,1.0,2.0,0.0
5984,730518,1,,,,,,,,,...,2150.0,2150.0,4.0,4.0,4.0,1.0,4.0,2.0,3.0,1.0
6105,709923,1,,,,,,,,,...,350.0,350.0,3.0,3.0,5.0,1.0,4.0,2.0,2.0,0.0


In [17]:
# Show the distribution of data types for variables in cleaned dataset
cleaned.dtypes.value_counts()

float64    251
int64        4
dtype: int64

In [62]:
# Create a function that first creates a dataframe where first column has 'var_name' 
# of each column in a given dataframe (second parameter) as values, and second column 
# is the corresponding value of 'type_var' in the codebook (third parameter is path to that codebook). 
# It then returns all values of 'var_name' for given 'type_var' in the codebook (first parameter).
# Make sure the function does not return 'new_child' as one of the variables in the output.

def get_var_labels(type_var, df, codebook_path=data_path + '/codebook'):
    """
    Get the variable labels for the given type of variable from the codebook.

    Parameters:
    codebook_path (str): The path to the codebook (e.g., 'PreFer_codebook.csv').
    df (pd.DataFrame): The dataframe containing the data.
    type_var (str): The type of variable to get the labels for (e.g., 'categorical', 'continuous', 'binary').

    Returns:
    list: A list of variable labels for the given type of variable.
    """
    codebook = pd.read_csv(codebook_path + '/PreFer_codebook.csv')
    var_labels = codebook[codebook["type_var"] == type_var]
    var_labels = var_labels[var_labels["var_name"].isin(df.columns)]
    var_names =  var_labels["var_name"].tolist()

    # Remove 'new_child' and 'outcome_available' from the list
    if 'new_child' in var_names:
        var_names.remove('new_child')
    if 'outcome_available' in var_names:
        var_names.remove('outcome_available')
    
    return var_names

get_var_labels('numeric', cleaned)

['nomem_encr',
 'cf20m004',
 'cf20m397',
 'ca20g075',
 'ca20g082',
 'cr20m120',
 'cs20m124',
 'cs20m131',
 'cs20m415',
 'birthyear_bg',
 'age_bg',
 'brutoink_f_2020',
 'nettoink_2020',
 'nettoink_f_2020']

## Model

In [57]:
# Create a machine learning pipeline that trains a model on the data

def model_pipeline(df):    
    # Define numerical and categorical features
    numerical_features = get_var_labels('numeric', df)
    categorical_features = get_var_labels('numeric', df)

    # Preprocessing for numerical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first'))
    ])

    # Bundle preprocessing for numerical and categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Define model
    model = logreg(random_state=42)

    # Create and return the pipeline
    return Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [52]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

def model_pipeline(df):    
    # Drop columns 'nomem_encr', 'outcome_available'
    df = df.drop(columns=['outcome_available'])
    
    # Define numerical and categorical features
    numerical_features = get_var_labels('numeric', df)
    categorical_features = get_var_labels('categorical', df)

    # Preprocessing for numerical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first'))
    ])

    # Bundle preprocessing for numerical and categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Define model
    model = logreg(random_state=42)

    # Define resampling
    resampling = SMOTE(random_state=42)

    # Create and return the pipeline
    return ImbPipeline(steps=[('preprocessor', preprocessor),
                              ('SMOTE', resampling),
                              ('model', model)])

In [63]:
# Define the model pipeline
pipeline = model_pipeline(cleaned)

### Save the trained model

In [67]:
# Define the features and target variable
X = cleaned.drop(columns=['outcome_available', 'new_child'])
y = cleaned['new_child']

# Fit the pipeline on the data
pipeline.fit(X, y)

# Save the model
joblib.dump(pipeline, 'model.joblib')

['model.joblib']

## Predict

In [60]:
def predict_outcomes(df, background_df=None, outcome_df=df_outcome, model_path="model.joblib"):
    """Generate predictions using the saved model and the input dataframe.

    The predict_outcomes function accepts a Pandas DataFrame as an argument
    and returns a new DataFrame with two columns: nomem_encr and
    prediction. The nomem_encr column in the new DataFrame replicates the
    corresponding column from the input DataFrame. The prediction
    column contains predictions for each corresponding nomem_encr. Each
    prediction is represented as a binary value: '0' indicates that the
    individual did not have a child during 2021-2023, while '1' implies that
    they did.

    Parameters:
    df (pd.DataFrame): The input dataframe for which predictions are to be made.
    background_df (pd.DataFrame): The background dataframe for which predictions are to be made.
    model_path (str): The path to the saved model file (which is the output of training.py).

    Returns:
    pd.DataFrame: A dataframe containing the identifiers and their corresponding predictions.
    """

    ## This script contains a bare minimum working example
    if "nomem_encr" not in df.columns:
        print("The identifier variable 'nomem_encr' should be in the dataset")

    # Load the model
    model = joblib.load(model_path)

    # Preprocess the fake / holdout data
    df = clean_df(df, background_df, outcome_df)

    # Exclude the variable nomem_encr if this variable is NOT in your model
    vars_without_id = df.columns[df.columns != 'nomem_encr']

    # Generate predictions from model, should be 0 (no child) or 1 (had child)
    predictions = model.predict(df[vars_without_id])

    # Output file should be DataFrame with two columns, nomem_encr and predictions
    df_predict = pd.DataFrame(
        {"nomem_encr": df["nomem_encr"], "prediction": predictions}
    )

    # Return only dataset with predictions and identifier
    return df_predict

In [68]:
df_pred = predict_outcomes(df_fk, None, df_fko)

KeyError: "['nomem_encr'] not in index"

In [None]:
print(df_fk.shape)

(30, 31634)


In [56]:
print(df_pred.head()) 
print(df_fko.head())

   nomem_encr  prediction
0      700001         0.0
1      700002         0.0
2      700003         1.0
3      700004         0.0
4      700005         0.0
   nomem_encr  new_child
0      700001          1
1      700002          0
2      700003          0
3      700004          0
4      700005          0


In [57]:
# Evaluate the model
y_pred = df_pred['prediction']
y_val = df_fko['new_child'].astype(float)

# Classification report
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

         0.0       0.74      0.87      0.80        23
         1.0       0.00      0.00      0.00         7

    accuracy                           0.67        30
   macro avg       0.37      0.43      0.40        30
weighted avg       0.57      0.67      0.61        30

Confusion Matrix:
[[20  3]
 [ 7  0]]
