In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

import joblib

In [2]:
# Raw PreFer training features, one row per respondent.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): absolute, machine-specific path; confirm whether this should
# come from a configurable data directory so the notebook runs elsewhere.
train = pd.read_csv(
    "C:\\Users\\lisch\\3D Objects\\ODISSEI Summer School 2023 - Gert & Lisa\\ODISSEI Summer School 2023 - Gert Stulp\\1PreFer data for Eyra\\training_FOR PARTICIPANTS\\PreFer_train_data.csv",
    low_memory=False,
)

In [3]:
# Outcome file for the training respondents (read alongside the training
# features and later merged with them on "nomem_encr").
# NOTE(review): absolute, machine-specific path; confirm whether this should
# come from a configurable data directory so the notebook runs elsewhere.
outcome = pd.read_csv(
    "C:\\Users\\lisch\\3D Objects\\ODISSEI Summer School 2023 - Gert & Lisa\\ODISSEI Summer School 2023 - Gert Stulp\\1PreFer data for Eyra\\training_FOR PARTICIPANTS\\PreFer_train_outcome.csv",
    low_memory=False,
)

In [33]:
def clean_df(df, background_df=None):
    """
    Preprocess the input dataframe to feed the model.
    # If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command

    Derives an ``age`` feature from ``birthyear_bg`` (missing ages are replaced
    by the mean age of the rows in ``df``) and keeps only the identifier and the
    model features. The input dataframe is never modified: the result is a new,
    independent dataframe.

    Parameters:
    df (pd.DataFrame): The input dataframe containing the raw data (e.g., from PreFer_train_data.csv or PreFer_fake_data.csv). Must contain the columns "nomem_encr", "birthyear_bg" and "gender_bg".
    background_df (pd.DataFrame): Optional input dataframe containing background data (e.g., from PreFer_train_background_data.csv or PreFer_fake_background_data.csv). Currently unused.

    Returns:
    pd.DataFrame: A new dataframe with the columns "nomem_encr", "age" and "gender_bg", in that order.

    Raises:
    KeyError: If a required column is missing from df.
    """

    ## This script contains a bare minimum working example
    # Year that birth years are subtracted from to obtain an age
    reference_year = 2024

    # Compute age as a standalone Series so the caller's dataframe is not
    # mutated (and no column is inserted into the full-width raw frame)
    age = reference_year - df["birthyear_bg"]

    # Imputing missing values in age with the mean
    age = age.fillna(age.mean())

    # Selecting variables for modelling
    keepcols = [
        "nomem_encr",  # ID variable required for predictions
        "age",         # newly created variable
        "gender_bg",   # <--------ADDED VARIABLE
    ]

    # Take the raw columns from the input, attach the derived one, then
    # restore the column order of keepcols (the model relies on feature order)
    raw_cols = [col for col in keepcols if col != "age"]
    cleaned = df[raw_cols].assign(age=age)

    # .copy() hands back an independent frame rather than a slice
    return cleaned[keepcols].copy()

In [34]:
def train_save_model(cleaned_df, outcome_df, model_path="model.joblib"):
    """
    Trains a model using the cleaned dataframe and saves the model to a file.

    Every column of cleaned_df except the identifier "nomem_encr" is used as a
    feature, which is the same rule predict_outcomes applies when selecting
    the columns to predict on. Rows whose outcome "new_child" is missing are
    excluded from training.

    Parameters:
    cleaned_df (pd.DataFrame): The cleaned data from clean_df function to be used for training the model. Must contain "nomem_encr".
    outcome_df (pd.DataFrame): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv). Must contain "nomem_encr" and "new_child".
    model_path (str): Where the fitted model is written. Defaults to "model.joblib".

    Returns:
    None. The fitted model is written to model_path.
    """

    ## This script contains a bare minimum working example
    #random.seed(1) # not useful here because logistic regression deterministic

    # Features are all cleaned columns but the ID, in their original order, so
    # training stays in sync with clean_df and with predict_outcomes
    feature_cols = [col for col in cleaned_df.columns if col != "nomem_encr"]

    # Combine cleaned_df and outcome_df
    model_df = pd.merge(cleaned_df, outcome_df, on="nomem_encr")

    # Filter cases for whom the outcome is not available
    model_df = model_df.dropna(subset=["new_child"])

    # Logistic regression model
    model = LogisticRegression()

    # Fit the model
    model.fit(model_df[feature_cols], model_df["new_child"])

    # Save the model
    joblib.dump(model, model_path)

In [35]:
# preprocessing
# Step 1: reduce the raw training data to the identifier plus model features
train_cleaned = clean_df(df=train)
# Step 2: fit the model on the labelled rows and persist it to disk
train_save_model(cleaned_df=train_cleaned, outcome_df=outcome)

In [37]:
# Fake dataset read from the working directory; passed to predict_outcomes
# further down to check the prediction step end to end
fake = pd.read_csv(filepath_or_buffer="PreFer_fake_data.csv")

def predict_outcomes(df, background_df=None, model_path="model.joblib"):
    """Generate predictions using the saved model and the input dataframe.

    The predict_outcomes function accepts a Pandas DataFrame as an argument
    and returns a new DataFrame with two columns: nomem_encr and
    prediction. The nomem_encr column in the new DataFrame replicates the
    corresponding column from the input DataFrame. The prediction
    column contains predictions for each corresponding nomem_encr. Each
    prediction is represented as a binary value: '0' indicates that the
    individual did not have a child during 2021-2023, while '1' implies that
    they did.

    Parameters:
    df (pd.DataFrame): The input dataframe for which predictions are to be made.
    background_df (pd.DataFrame): The background dataframe for which predictions are to be made.
    model_path (str): The path to the saved model file (which is the output of training.py).

    Returns:
    pd.DataFrame: A dataframe containing the identifiers and their corresponding predictions.

    Raises:
    KeyError: If the identifier column 'nomem_encr' is missing from df.
    """

    ## This script contains a bare minimum working example
    # Fail fast: without the identifier no valid output can be built, and
    # continuing would only surface a less helpful KeyError further down
    if "nomem_encr" not in df.columns:
        raise KeyError("The identifier variable 'nomem_encr' should be in the dataset")

    # Load the model
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only point model_path at model files you trust
    model = joblib.load(model_path)

    # Preprocess the fake / holdout data
    df = clean_df(df, background_df)

    # Exclude the variable nomem_encr if this variable is NOT in your model
    vars_without_id = df.columns[df.columns != 'nomem_encr']

    # Generate predictions from model, should be 0 (no child) or 1 (had child)
    predictions = model.predict(df[vars_without_id])

    # Output file should be DataFrame with two columns, nomem_encr and predictions
    df_predict = pd.DataFrame(
        {"nomem_encr": df["nomem_encr"], "prediction": predictions}
    )

    # Return only dataset with predictions and identifier
    return df_predict


In [38]:
# Run the saved model on the fake data; the returned frame is the cell output
predict_outcomes(df=fake)

Unnamed: 0,nomem_encr,prediction
0,700001,0.0
1,700002,0.0
2,700003,0.0
3,700004,0.0
4,700005,0.0
5,700006,0.0
6,700007,0.0
7,700008,0.0
8,700009,0.0
9,700010,0.0
