In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

import torch
from torch import nn, optim
from torchvision import transforms, utils
from torch.utils.data import TensorDataset, DataLoader
import time

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    RandomizedSearchCV
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import loguniform, randint

%matplotlib inline

In [2]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarise cross-validation results as "mean (+/- std)" strings.

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Forwarded unchanged to ``cross_validate`` (e.g. ``cv``, ``scoring``,
        ``return_train_score``).

    Returns
    -------
    pandas Series
        One entry per key returned by ``cross_validate``; each value is the
        string ``"<mean> (+/- <std>)"`` with three decimals, computed across
        folds.
    """
    # Build the per-fold table once and derive both statistics from it.
    fold_scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    # Both Series are labelled by score name, so pair their values by
    # iteration instead of using integer positions with `[]`, which relies on
    # pandas' deprecated positional fallback for label-indexed Series.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]
    return pd.Series(data=out_col, index=mean_scores.index)

In [3]:
# Load the two pickled objects this notebook works with.
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open("data/processed/training_arm.pickle", "rb") as arm_file:
    training_arm = pickle.load(arm_file)

with open("data/processed/output_models.pickle", "rb") as models_file:
    output_models = pickle.load(models_file)

In [5]:
# Hold out 30% of the samples for testing; `output_models` is used as the
# inputs (X) and `training_arm` as the targets (y). The seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    output_models,
    training_arm,
    random_state=2022,
    test_size=0.3,
)

# Sanity check: shapes of the training inputs and targets, one per line.
print(X_train.shape, y_train.shape, sep="\n")

In [None]:
# Fit a random forest on the training split using all available cores.
# The forest is seeded so that the reported RMSE and the example plots are
# reproducible on a fresh "Restart & Run All"; 2022 matches the seed already
# used for the train/test split.
final_model = RandomForestRegressor(n_jobs=-1, random_state=2022)
final_model.fit(X_train, y_train)

In [None]:
# Root-mean-squared error of the fitted model over the whole test split
# (averaged over every element of the target array).
prediction = final_model.predict(X_test)
residuals = y_test - prediction
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)
print(rmse)

In [None]:
# Draw up to MAX_EXAMPLES well-predicted and up to MAX_EXAMPLES
# poorly-predicted test samples side by side: target in red, prediction in
# blue.
MAX_EXAMPLES = 30  # cap on the number of samples drawn per panel
GOOD_RMSE = 5      # per-sample RMSE below this counts as a "good" prediction
BAD_RMSE = 30      # per-sample RMSE above this counts as a "bad" prediction

fig, (ax_good, ax_bad) = plt.subplots(1, 2)

# Predict the whole test split in one call: scikit-learn estimators require
# 2-D input (a single 1-D row is rejected), and one batched call avoids
# re-dispatching the forest for every sample.
y_true_all = np.asarray(y_test)
y_pred_all = final_model.predict(X_test)
sample_rmses = np.sqrt(np.mean((y_pred_all - y_true_all) ** 2, axis=1))

good_examples = 0
bad_examples = 0

# Walk the test samples once, advancing to the next sample on every step, and
# stop only when both panels are full. Loop-local names are used so the
# notebook-level `prediction` / `rmse` from the evaluation cell are preserved.
for sample_y, sample_pred, sample_rmse in zip(y_true_all, y_pred_all, sample_rmses):
    if good_examples >= MAX_EXAMPLES and bad_examples >= MAX_EXAMPLES:
        break

    if sample_rmse < GOOD_RMSE and good_examples < MAX_EXAMPLES:
        good_examples += 1
        target_ax = ax_good
    elif sample_rmse > BAD_RMSE and bad_examples < MAX_EXAMPLES:
        bad_examples += 1
        target_ax = ax_bad
    else:
        # Neither clearly good nor clearly bad (or that panel is full).
        continue

    # The first 1000 values are plotted against the next 1000.
    # NOTE(review): presumably x- and y-coordinates of a trajectory - confirm.
    target_ax.plot(sample_y[:1000], sample_y[1000:2000], color="r")
    target_ax.plot(sample_pred[:1000], sample_pred[1000:2000], color="b")

# Same axis limits on both panels so they are directly comparable.
for panel, panel_title in ((ax_good, "Good predictions"), (ax_bad, "Bad predictions")):
    panel.set_title(panel_title)
    panel.set_xlim([-150, 150])
    panel.set_ylim([-100, 100])
plt.show()