In [None]:
import json
import os
import pickle

import numpy as np
import pandas as pd

from exp.features import create_train_features
from exp.mappings import alg_map
from exp.run import run_experiment
from exp.train import train_model

### Create Training Features

In [None]:
# Build the training features, or reload them from the on-disk cache.
#
# Cache files are read from / written to the notebook's working directory.
X_save = "X_tr.csv"
y_save = "y_tr.csv"
X_save_scaled = "X_tr_scaled.csv"
scale_params_pickle = "scale_params.pickle"
other_params_json = "other.json"
# Raw training data; only read when no cached features exist.
# NOTE(review): machine-specific absolute path -- consider a configurable data directory.
train_csv = r'C:\Users\arvin\dev\lanl\train.csv'

# These stay None unless the branch taken below produces or restores them.
tr_scaler = None
classic_sta_lta5_mean_fill = None
classic_sta_lta7_mean_fill = None

# Every cached variant needs the targets file, so test for it once.
have_targets = os.path.exists(y_save)
have_scaled = have_targets and os.path.exists(X_save_scaled)
have_unscaled = have_targets and os.path.exists(X_save)

if have_scaled:
    # Fully cached: the scaled features are read back directly.
    # tr_scaler is not needed here and is deliberately left as None.
    X_train_scaled = pd.read_csv(X_save_scaled, index_col=0)
    y_tr = pd.read_csv(y_save, index_col=0)
elif have_unscaled:
    # Only the unscaled features are cached: re-apply the persisted scaler
    # and cache the scaled result for the next run.
    X_tr = pd.read_csv(X_save, index_col=0)
    y_tr = pd.read_csv(y_save, index_col=0)

    # SECURITY: pickle.load can execute arbitrary code. Only load a scaler
    # file that this notebook wrote itself.
    with open(scale_params_pickle, "rb") as scaler_file:
        tr_scaler = pickle.load(scaler_file)

    X_train_scaled = pd.DataFrame(tr_scaler.transform(X_tr), columns=X_tr.columns)
    X_train_scaled.to_csv(X_save_scaled)
else:
    # Nothing cached: compute everything from the raw data and persist it.
    (X_tr, X_train_scaled, y_tr, tr_scaler,
     classic_sta_lta5_mean_fill,
     classic_sta_lta7_mean_fill) = create_train_features(train_csv)
    X_tr.to_csv(X_save)
    y_tr.to_csv(y_save)
    X_train_scaled.to_csv(X_save_scaled)

    with open(scale_params_pickle, "wb") as scaler_file:
        pickle.dump(tr_scaler, scaler_file)

    with open(other_params_json, 'w') as fp:
        json.dump({"classic_sta_lta5_mean_fill": classic_sta_lta5_mean_fill,
                   "classic_sta_lta7_mean_fill": classic_sta_lta7_mean_fill}, fp)

# A fresh run persists the two fill values to other_params_json. Restore them
# on cached runs so they are not silently left as None.
if (have_scaled or have_unscaled) and os.path.exists(other_params_json):
    with open(other_params_json) as fp:
        cached_params = json.load(fp)
    classic_sta_lta5_mean_fill = cached_params.get("classic_sta_lta5_mean_fill")
    classic_sta_lta7_mean_fill = cached_params.get("classic_sta_lta7_mean_fill")

### Hyper-parameter Experiments

In [None]:
"""
Example of Cartesian Product of Hyper-parameters for Linear Regression

"lr": {"fit_intercept": [False, True], "normalize": [False, True]}

Cartesian Product: {fit_intercept} x {normalize}

Hyper-parameter choices:
"fit_intercept": False, "normalize": False
"fit_intercept": True, "normalize": False
"fit_intercept": False, "normalize": True
"fit_intercept": True, "normalize": True
"""
# Search space: algorithm key -> {hyper-parameter name -> list of candidate values}.
# Several lists repeat a value on purpose; the repeats are written as `[v] * k`.
# NOTE(review): presumably the repeats weight that value under random search --
# confirm against run_experiment.


def _rbf_svr_grid():
    """Return a fresh copy of the candidate grid used by both "svr" and "nsvr"."""
    return {
        "C": [.001, .01, .1] + [1.0] * 3 + [10.0, 100.0],
        "shrinking": [True] * 4 + [False],
        "kernel": ["rbf"],
        "gamma": ["auto"] * 4 + [.001, .0001, .01, .1],
    }


def _forest_grid():
    """Return a fresh copy of the candidate grid used by both "rfreg" and "etreg"."""
    return {
        "n_estimators": [5, 10, 50, 100, 200],
        "criterion": ["mse"],
        "max_depth": [None] * 3 + [5, 10, 20, 50],
    }


params = {
    "svr": _rbf_svr_grid(),
    "nsvr": _rbf_svr_grid(),
    "lsvr": {
        "C": [.001, .01, .1] + [1.0] * 3 + [10.0, 100.0],
        "epsilon": [0],
        "loss": ["epsilon_insensitive"] * 3 + ["squared_epsilon_insensitive"],
        "fit_intercept": [True] * 3 + [False],
        "intercept_scaling": [1] * 10 + [.3, .6, 1.3, 1.6, 2.0],
        "dual": [True],
    },
    "knreg": {
        "n_neighbors": [1, 3, 5, 10, 20],
        "weights": ["uniform", "distance"],
        "leaf_size": [5, 10, 20, 30, 40, 50, 60, 100],
        "p": [2] * 4 + [1] * 2 + [3, 5],
    },
    "gpreg": {
        "alpha": [1e-10] * 5 + [1e-8, 1e-6, 1e-4, 1e-2, .1],
        "normalize_y": [False] * 4 + [True],
    },
    "plsreg": {
        "n_components": [2] * 3 + [1, 5, 10],
        "scale": [True] * 4 + [False],
    },
    "dtreg": {
        "criterion": ["mse", "friedman_mse", "mae"],
        "splitter": ["best"] * 2 + ["random"],
        "max_depth": [None] * 3 + [5, 10, 20, 50],
    },
    "bagreg": {
        "n_estimators": [5, 10, 50, 100, 200],
        "max_samples": [1.0] * 2 + [.2, .5],
    },
    "rfreg": _forest_grid(),
    "etreg": _forest_grid(),
    "abreg": {
        "n_estimators": [5, 10, 50, 100, 200],
        "learning_rate": [1, .9, .5, .1],
        "loss": ["linear", "square", "exponential"],
    },
    "gbreg": {
        "n_estimators": [5, 10, 50, 100, 200],
        "learning_rate": [.5, .1, .01, .001],
        "loss": ["ls", "lad", "huber", "quantile"],
        "subsample": [.1, .2, .5, 1.0],
        "criterion": ["mse", "friedman_mse", "mae"],
        "max_depth": [3] * 3 + [2, 5, 10],
    },
    "mlpreg": {
        "hidden_layer_sizes": [(100,), (50,), (50, 50), (100, 100),
                               (50, 50, 50), (100, 100, 100)],
        "activation": ['logistic', 'tanh'] + ["relu"] * 4,
        "solver": ["lbfgs", "sgd"] + ["adam"] * 2,
        "batch_size": [16, 32, 64, 128, 256, 512],
        "learning_rate": ["constant", "invscaling", "adaptive"],
        "learning_rate_init": [.1, .01, .001, .0001, .00001],
    },
}

### Run Experiment

In [None]:
# Settings shared by the experiment and model-training cells.
num_searches = 20          # forwarded to run_experiment(num_searches=...)
n_fold = 10                # forwarded as n_fold to run_experiment and train_model
save_results = "exp2.csv"  # forwarded to run_experiment(save_results=...); read back with pd.read_csv

In [None]:
# Run a random search for every algorithm in the search space, in dict order.
# NOTE(review): score_df is rebound on every iteration, so after the loop it
# holds whatever the last run_experiment call returned -- confirm whether that
# return value is cumulative or per-algorithm.
for alg, alg_params in params.items():
    print(alg)
    score_df = run_experiment(
        X=X_train_scaled,
        Y=y_tr,
        n_fold=n_fold,
        alg=alg,
        alg_params=alg_params,
        search_type="random",
        num_searches=num_searches,
        save_results=save_results,
    )

### Display models ranked by CV scores

In [None]:
# Order the result rows by their "score" column (ascending) and render the table.
score_df = score_df.sort_values("score")
display(score_df)

### Load results from CSV file and reproduce the models ranked by CV scores

In [None]:
# Read the saved results back from disk and rank the rows by "score" (ascending).
score_df = pd.read_csv(save_results).sort_values(by="score", axis=0)

In [None]:
# Render the results table held in score_df.
display(score_df)

### Load best model from CSV File

In [None]:
# Rebuild and retrain the best model recorded in the results table.

# Take the first row of the ranked table. score_df is sorted ascending by
# "score", so position 0 is the lowest score (position 1 would skip it).
# NOTE(review): assumes a lower score is better (e.g. an error metric) --
# confirm against run_experiment.
best = score_df.iloc[0]
display(best)

# Pull the algorithm key and its JSON-encoded hyper-parameters out of the row.
alg = best["alg"]
params_json = best["params_json"]
print("alg: {}".format(alg))
print("params_json: {}".format(params_json))

# Resolve the estimator class and decode its keyword arguments.
# Kept in `best_params` so the search-space dict `params` is not overwritten;
# otherwise re-running the experiment loop would iterate over one model's kwargs.
alg_cls = alg_map[alg]
best_params = json.loads(params_json)

# Initialize the model with the selected hyper-parameters.
model = alg_cls(**best_params)

# Train it on the scaled training features.
train_model(X=X_train_scaled, Y=y_tr, n_fold=n_fold, model=model)