# Bike experiment notebook

## Import packages

In [None]:
import json
import math
import os
import pickle
import sys

import numpy as np
import sklearn.metrics as metrics
import yaml
from sklearn.ensemble import RandomForestClassifier

## Get params

In [None]:
print("Works")

# Read the "train" section of params.yaml. The context manager closes the
# file handle, which a bare open() passed straight to safe_load would leave
# open until garbage collection.
with open("params.yaml") as params_fd:
    params = yaml.safe_load(params_fd)["train"]

# Data directory and destination of the pickled model.
# NOTE(review): `input` shadows the builtin of the same name; the name is
# kept because the data-loading cell reads it.
input = "./data/"
output = "./models/model.pkl"

# Hyperparameters for the classifier built in the training cell.
seed = params["seed"]
n_est = params["n_est"]
min_split = params["min_split"]

## Load training data

In [None]:
# Unpickle the training table from the data directory.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by a trusted pipeline stage.
train_path = os.path.join(input, "train.pkl")
with open(train_path, "rb") as train_fh:
    matrix = pickle.load(train_fh)

# Columns 1..10 become the feature array and column 11 the label vector
# (the table is indexed with .iloc, so presumably a pandas DataFrame).
x = matrix.iloc[:, 1:11].values
labels = matrix.iloc[:, 11].values

## Train model

In [None]:
# Build the random forest from the hyperparameters set in the params cell,
# using two worker jobs, then fit it on the training features and labels.
forest_options = {
    "n_estimators": n_est,
    "min_samples_split": min_split,
    "n_jobs": 2,
    "random_state": seed,
}
clf = RandomForestClassifier(**forest_options)

clf.fit(x, labels)

## Save model

In [None]:
# Persist the fitted classifier to `output` as a pickle.
# open(..., "wb") does not create missing parent directories, so make sure
# the target directory exists first; otherwise a fresh checkout without
# ./models fails with FileNotFoundError after training has already finished.
model_dir = os.path.dirname(output)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

with open(output, "wb") as fd:
    pickle.dump(clf, fd)

## Set test variables

In [None]:
# Evaluation inputs: the pickled model and the pickled test table.
test_file = os.path.join("./data", "test.pkl")
model_file = "./models/model.pkl"

# Evaluation outputs: summary scores, precision-recall points, ROC points.
scores_file, prc_file, roc_file = "scores.json", "prc.json", "roc.json"

## Load model and test data

In [None]:
# Restore the pickled classifier from model_file.
with open(model_file, "rb") as model_fh:
    model = pickle.load(model_fh)

# Load the test table; this rebinds `matrix`, replacing the training table.
with open(test_file, "rb") as test_fh:
    matrix = pickle.load(test_fh)

## Get model predictions

In [None]:
# Feature columns of the test table (same column slice as used for training).
x = matrix.iloc[:, 1:11].values

# Replace NaN entries with 0 before handing the features to the model.
nan_mask = np.isnan(x)
cleaned_x = np.where(nan_mask, 0, x)

# Hard class predictions for every test row.
labels_pred = model.predict(cleaned_x)

# Per-class probabilities; column 1 is kept as the score used for the curves.
predictions_by_class = model.predict_proba(cleaned_x)
predictions = predictions_by_class[:, 1]

## Calculate model performance metrics

In [None]:
# Ground-truth labels of the test set. Every metric below must be computed
# against these: passing model.predict() output as y_true would score the
# model against its own hard predictions and report self-consistency instead
# of real performance.
# NOTE(review): assumes test.pkl shares the training layout (label in
# column 11) — confirm against the data-preparation step.
labels_true = matrix.iloc[:, 11].values

# Precision-recall and ROC curves for the positive class (label 1).
precision, recall, prc_thresholds = metrics.precision_recall_curve(
    labels_true, predictions, pos_label=1
)

fpr, tpr, roc_thresholds = metrics.roc_curve(labels_true, predictions, pos_label=1)

# Scalar summaries of the two curves.
avg_prec = metrics.average_precision_score(labels_true, predictions)
roc_auc = metrics.roc_auc_score(labels_true, predictions)

# Thin the PR curve to at most 1000 points. max() keeps the step >= 1 so an
# empty threshold array cannot produce a zero slice step (ValueError).
nth_point = max(1, math.ceil(len(prc_thresholds) / 1000))
prc_points = list(zip(precision, recall, prc_thresholds))[::nth_point]

## Save model performance metrics

In [None]:
# Summary scores as a flat JSON object.
summary = {"avg_prec": avg_prec, "roc_auc": roc_auc}
with open(scores_file, "w") as scores_fh:
    json.dump(summary, scores_fh, indent=4)

# Thinned precision-recall curve: one record per retained point.
prc_records = [
    {"precision": p, "recall": r, "threshold": t} for p, r, t in prc_points
]
with open(prc_file, "w") as prc_fh:
    json.dump({"prc": prc_records}, prc_fh, indent=4)

# ROC curve: one record per (fpr, tpr, threshold) triple.
# NOTE(review): depending on the scikit-learn version the first ROC
# threshold may be inf, which json.dump writes as the non-standard token
# Infinity — confirm downstream readers accept it.
roc_records = [
    {"fpr": fp, "tpr": tp, "threshold": t}
    for fp, tp, t in zip(fpr, tpr, roc_thresholds)
]
with open(roc_file, "w") as roc_fh:
    json.dump({"roc": roc_records}, roc_fh, indent=4)