# Binary Classification Model Pipeline

In [1]:
from typing import Any, Dict, Tuple
from datetime import datetime, timezone
import random
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import preprocessing
import matplotlib.pyplot as plt
import yaml

from utils import data_prep as prep
from utils import model

# Show every column when a DataFrame is rendered (None lifts the column cap).
pd.set_option("display.max_columns", None)

In [2]:
# Point the Google client libraries at the service-account key file.
# The path is relative, so it resolves against the notebook's working directory.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="creds.json")

# Current user's home directory (not used by the cells visible in this notebook).
home = os.path.expanduser("~")

## Data Load

In [20]:
# Pull the raw dataset from BigQuery into a DataFrame.
# NOTE(review): the SQL text below stops right after FROM with no table
# reference, so it cannot run as written -- presumably the table name was
# redacted; restore it before re-running this cell.
# NOTE(review): pandas deprecated pd.read_gbq (2.2+) in favour of
# pandas_gbq.read_gbq -- confirm against the pinned pandas version.
raw_data = pd.read_gbq("SELECT * FROM")

In [21]:
# Share of each class in the target column (proportions rather than counts).
raw_data["output"].value_counts(normalize=True)

0    0.953811
1    0.046189
Name: output, dtype: float64

## Data Prep

In [22]:
# Load the data-prep parameters from the YAML config.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default (open() otherwise uses the locale encoding).
with open("config/data-prep-params.yml", "r", encoding="utf-8") as file:
    prep_params = yaml.safe_load(file)

In [23]:
# Feature names are the keys of the feature-engineering map in the prep config;
# keep only those columns of the raw data.
features = [*prep_params["feature_engineering_map"]]
raw_features = raw_data.loc[:, features]

In [None]:
# Step 1: drop sparse columns from the selected raw features via the project helper.
# NOTE(review): the original note cited an 80% NaN threshold; the actual cutoff
# is presumably read from prep_params -- confirm against the config file.
step_1 = prep.remove_sparse_columns(raw_features, prep_params)
step_1.head()

In [None]:
# Step 2: transform the continuous variables of step_1 (standard scaling and
# mean imputation, per the original note). The helper's result is unpacked
# into the transformed data and the scaler object.
# NOTE(review): this runs on the full dataset, before model.split_data is
# called further down; if the helper fits the scaler/imputer here, test rows
# influence the fitted statistics -- confirm whether that is intended.
# NOTE(review): the meaning of the positional True flag is not visible here --
# presumably "fit a new scaler"; verify in utils.data_prep.
step_2, scaler = prep.continuous_variables(step_1, prep_params, True)
step_2.head()

In [None]:
# Step 3: encode the categorical variables (label encoding, per the original
# note), using the "categorical_label" entry of the prep config.
# The input is step_1 (the output of sparse-column removal), not step_2, so
# the continuous and categorical transforms both start from the same frame.
step_3 = prep.categorical_variables(step_1, prep_params, prep_params["categorical_label"])
step_3.head()

In [None]:
# Combine features and response variable
feature_table = pd.concat([raw_data["output"], step_2, step_3], axis = 1)
feature_table.head()

## Model Build

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Split the feature table into train and test sets, with "output" as the
# response column; the helper's result is unpacked into X/y train/test parts.
# NOTE(review): no random seed is passed here -- confirm that model.split_data
# fixes one internally so the split is reproducible.
X_train, X_test, y_train, y_test = model.split_data(feature_table, "output")

In [30]:
# Rebalance the classes by random oversampling.
# Only the training split is passed in, so the test set keeps its original
# class distribution.
X_resampled, y_resampled = model.random_over_sample(X_train, y_train)

In [31]:
# Train on the oversampled training data.
# RandomForestClassifier is handed over as a class (not an instance), so the
# helper is presumably responsible for constructing it -- hyperparameters are
# not set in this notebook; check model.train_model for the values used.
model_fit = model.train_model(X_resampled, y_resampled, RandomForestClassifier)

## Evaluation

In [32]:
# Predict labels for the held-out test set and score them with the project
# accuracy helper.
# NOTE(review): in the saved run the target is ~95% class 0 (see the
# value_counts output in the Data Load section), so always predicting 0 would
# score about 0.95 on accuracy alone; read this number alongside the
# per-class report rather than on its own.
y_pred = model_fit.predict(X_test)
model.eval_accuracy(y_test, y_pred)

0.7707428200296202

In [33]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 / support as a dict, rendered as a table.
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
pd.DataFrame(data=report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.964389,0.084923,0.770743,0.524656,0.923521
recall,0.788694,0.402393,0.770743,0.595543,0.770743
f1-score,0.867737,0.140248,0.770743,0.503992,0.833931
support,581384.0,28333.0,0.770743,609717.0,609717.0


In [34]:
# Persist the classification report as CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs("figures", exist_ok=True)
pd.DataFrame(report).to_csv("figures/classification_report_120day_conversion.csv")