# Setup

In [90]:
# Import libraries
import pandas as pd
import yaml
from sklearn.ensemble import HistGradientBoostingClassifier
from joblib import dump, load
import numpy as np

# Load the run configuration from the YAML file next to the notebook.
# The encoding is pinned to UTF-8 (the conventional encoding for YAML) so the
# file is decoded the same way regardless of the platform's default locale.
with open("config.yaml", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [83]:
# Construct the output filenames.
# Every filename carries the same tag, which embeds the data version and the
# boosting hyperparameters read from the configuration.
version_name = config["input"]["version_name"]

num_iterations = config["boosting_parameters"]["num_iterations"]
max_depth = config["boosting_parameters"]["max_depth"]
learning_rate = config["boosting_parameters"]["learning_rate"]

tag = f"{version_name}(m={num_iterations},d={max_depth},a={learning_rate})"

# The "output" section is looked up once and indexed with single-quoted keys:
# reusing the enclosing double quote inside an f-string replacement field is a
# SyntaxError on Python versions before 3.12.
output_config = config["output"]

model_df_filename = f"{output_config['model_dump_filename_header']}_{tag}.joblib"
train_pred_prob_df_filename = f"{output_config['train_pred_prob_df_filename_header']}_{tag}.csv"
test_pred_prob_df_filename = f"{output_config['test_pred_prob_df_filename_header']}_{tag}.csv"

train_log_loss_filename = f"{output_config['train_log_loss_filename_header']}_{tag}.csv"
test_log_loss_filename = f"{output_config['test_log_loss_filename_header']}_{tag}.csv"

# Import the data

In [51]:
# Read the train and test frames (train first, then test) from the paths given
# in the configuration. index_col=0 reuses the first CSV column as the index
# instead of generating a fresh one.
train_df, test_df = (
    pd.read_csv(config["input"][filename_key], index_col=0)
    for filename_key in ("train_df_filename", "test_df_filename")
)

In [52]:
# Restore the categorical dtypes, which do not survive a round trip through CSV.
CATEGORY_COLUMN_NAMES = [
    "workclass",
    "education",
    "marital.status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native.country",
    "income",
]
# fnlwgt is technically numeric as well, but it is not present in the frame.
NUMBER_COLUMN_NAMES = [
    "age",
    "education.num",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
]


def categorize(df):
    """Cast every column named in CATEGORY_COLUMN_NAMES to the 'category' dtype.

    The frame is modified in place and nothing is returned. A KeyError is
    raised if one of the listed columns is missing from ``df``.
    """
    for name in CATEGORY_COLUMN_NAMES:
        as_category = df[name].astype('category')
        df[name] = as_category

# Apply the in-place conversion to the train frame first, then the test frame.
for frame in (train_df, test_df):
    categorize(frame)

In [53]:
# Split into X and y
def get_X_y (df, target_column = "income"):
    """Separate a frame into its feature columns and its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and the target. It is not modified.
    target_column : str, default "income"
        Name of the column to use as the target.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``df`` without ``target_column`` and ``y`` is
        that column as a Series.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    features = df.drop(columns = target_column)
    target = df[target_column]
    return (features, target)

# Split the train frame, then the test frame, into (features, target) pairs.
(train_X, train_y), (test_X, test_y) = map(get_X_y, (train_df, test_df))

# Train boosted classifier

In [55]:
# Build the gradient-boosting classifier; the three tunable hyperparameters
# are the values read from the configuration file.
classifier = HistGradientBoostingClassifier(
    # Tunable hyperparameters
    learning_rate=learning_rate,
    max_depth=max_depth,
    max_iter=num_iterations,
    # Fixed settings
    early_stopping=False,
    categorical_features='from_dtype',  # use the columns with a category dtype
    loss='log_loss',
)

In [85]:
# Fit on the training split, then persist the fitted model with joblib.
# dump() is the last expression so the written path(s) show as the cell output.
classifier.fit(X=train_X, y=train_y)
dump(value=classifier, filename=model_df_filename)

['output-data/model_dump_v1(m=1000,d=1,a=0.1).joblib']

# Evaluate boosted classifier

In [59]:
# Print the mean accuracy of the fitted classifier on each split, one per line.
print(
    f"Train accuracy = {classifier.score(train_X, train_y)}",
    f"Test accuracy = {classifier.score(test_X, test_y)}",
    sep="\n",
)

Train accuracy = 0.8711428571428571
Test accuracy = 0.8606666666666667


In [102]:
# Collect the class probabilities predicted after every boosting stage.
# Axis 0 of each array is the stage; the remaining axes are whatever
# staged_predict_proba yields per stage (presumably (num_samples, 2), one
# column per income class -- TODO confirm).
staged_train_pred_prob = np.array(list(classifier.staged_predict_proba(train_X)))
staged_test_pred_prob = np.array(list(classifier.staged_predict_proba(test_X)))

In [None]:
def save_staged_pred_prob(filename, staged_pred_prob):
    """Write a stack of staged class probabilities to a comma-separated file.

    np.savetxt only accepts 1D or 2D input, so all axes after the first are
    flattened (C order). The file holds one row per index along axis 0 (the
    boosting stage); within a row, the values belonging to one sample are
    adjacent. For a 3D input the array can be restored with
    ``np.loadtxt(filename, delimiter=",").reshape(num_stages, -1, num_classes)``.

    Parameters
    ----------
    filename : str
        Path of the file to write.
    staged_pred_prob : array_like
        Array with at least 2 dimensions whose first axis is the stage.

    Raises
    ------
    ValueError
        If ``staged_pred_prob`` has fewer than 2 dimensions.
    """
    stacked = np.asarray(staged_pred_prob)
    if stacked.ndim < 2:
        raise ValueError(
            f"expected an array with at least 2 dimensions, got shape {stacked.shape}"
        )

    # Computed explicitly (rather than with -1) so zero-length axes still reshape.
    values_per_stage = int(np.prod(stacked.shape[1:]))
    flattened = stacked.reshape(stacked.shape[0], values_per_stage)
    np.savetxt(filename, flattened, delimiter=",")


# Each split is written from its own array (the test file previously received
# the train predictions).
save_staged_pred_prob(train_pred_prob_df_filename, staged_train_pred_prob)
save_staged_pred_prob(test_pred_prob_df_filename, staged_test_pred_prob)