Machine learning prototyping notebook. Data preprocessing has already been tested and implemented in data_preproc.pu (samples/). 

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

import plotly.express as px
import matplotlib.pyplot as plt
import yfinance as yf

# ML - preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils import shuffle

# metrics - classification
from sklearn.metrics import PredictionErrorDisplay, accuracy_score, f1_score, precision_score, roc_auc_score

# metrics - regression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# model comparison
from dieboldmariano import dm_test


# baseline models
from sklearn import linear_model

# models

from sklearn import tree
from sklearn.multioutput import RegressorChain
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn import ensemble

# model persistence
import pickle

Decision tree regressorGet the data, split into exo/endo and perform train/test split. Get the data, split into exo/endo and perform train/test split.

In [2]:
import os
import sys

# IMPORT FUNCTIONS
sys.path.insert(0, '../sample')
import data_preproc

# LOAD FINANCIAL RATIOS AND ASSET PRICES
test_merge = pd.read_excel('../jupyter-notebooks/test_manual.xlsx')
test_merge = test_merge.loc[:, test_merge.columns != 'Unnamed: 0']
test_assets = pd.read_excel('../jupyter-notebooks/asset_prices.xlsx',index_col='Date')

# PREPROCESS FINANCIAL RATIOS DATA, REPLACE STRINGS WITH FLOATS
ML_data = test_merge.map(data_preproc.convert_placeholder_text_to_num)

# ENSURE THE TWO DATAFRAMES CONTAINING FINANCIAL RATIOS (ML_DATA) AND RETURNS (TEST_ASSETS) HAVE THE SAME ASSETS/TICKERS
ML_final = data_preproc.filter_ratios_returns(ML_data,test_assets)
# print(ML_final.head())

# RESAMPLE THE RETURNS FROM MONTHLY TO QUARTERLY, THEN BFILL AND FFILL
asset_prices = test_assets # MAKE A COPY
asset_prices.index = pd.to_datetime(asset_prices.index)
asset_prices = asset_prices.resample('Q').last()
asset_prices = asset_prices.bfill(axis=1)
asset_prices = asset_prices.ffill(axis=1)


# 
test = data_preproc.FRatioMLdata(ML_final,asset_prices,sector=None,returns_lead_by=2)#-1)
#test.transform()
#print(test.train.head())

In [3]:
# transform the data into ML compatible format

test.transform()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
2,-0.026975,-0.004681,-0.004744,-0.004536,0.480392,0.000000,0.000000,0.000000,0.050000
3,0.258930,-2.478155,4.193577,-0.806291,1.000000,-0.309524,-0.600000,-0.427141,0.041667
4,-0.475836,-0.002556,-0.002633,-0.002629,-1.864407,0.000000,0.000000,0.000000,-0.153732
5,-0.293669,-0.008364,-0.541817,0.229405,-0.662857,-0.475000,4.000000,0.163001,0.155340
6,0.825410,-0.002780,-0.002607,-0.002793,-0.064171,0.000000,0.000000,0.000000,-0.036178
...,...,...,...,...,...,...,...,...,...
7,0.049659,-1.467892,-0.094140,-0.572862,0.232558,0.146119,0.000000,-0.341260,0.100000
8,-0.026540,3.187525,0.038230,0.513707,0.653846,0.531469,0.000000,0.034440,0.030928
9,-0.029439,-0.762979,0.221575,0.009796,0.000000,-0.089172,0.000000,-0.117264,0.010417
10,-0.051483,-4.752607,0.241513,0.375513,-0.037037,0.154412,0.100000,0.075935,0.185185


In [4]:
# test the dataframe shuffling procedure. Ultimately, probably better to do this by invoking shuffle directly, rather than as a method of the object.
# test.shuffle()

In [5]:
# visualise the dataframe after shuffling

#test = test.train)
data_rg = shuffle(test.train,random_state=0)

In [6]:
data_rg

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.087670,-0.133416,-2.333333,0.175000,-0.166667,0.333333,0.575757
10,-0.209946,0.005086,0.005019,0.005256,-0.243590,0.000000,0.000000,0.000000,0.050450
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.050000,-0.666667,0.000000
11,0.000000,-0.756467,-0.024635,-0.002249,-0.220290,0.085039,0.000000,-0.098667,-0.024042
5,0.129530,-0.949155,-3.442810,0.033613,-0.025641,0.461538,0.444444,-0.124476,0.000000
...,...,...,...,...,...,...,...,...,...
5,0.018908,-0.055898,-1.656752,-0.117811,0.227273,-1.259259,2.333333,0.063901,0.333333
10,-0.163981,0.005127,0.005139,0.005117,-0.224490,0.000000,0.000000,0.000000,0.184783
9,0.011868,-1.302406,-1.609848,-0.074476,-0.097561,-0.581081,0.000000,-0.075949,-0.016667
7,-0.003843,-0.464031,-0.222056,-0.209207,0.229167,0.041397,-0.363636,0.117647,-0.001678


## Tools for converting between returns and trend prediction

In [7]:
def convert_returns_to_category(element):
    if element>= 0:
        element = 1
    if element < 0:
        element = 0
    return element

def convert_regression_to_classification(dataframe):
    '''
    Given a FRatioMLdata object i.e. [ratio_1 ... ratio_n returns], convert the returns column to:
    1 - if return >= 0
    0 - if return < 0
    '''

    df = dataframe.copy()

    df['Returns'] = df['Returns'].map(convert_returns_to_category)
    return df

def gen_train_test(dataframe,regression=True):
    '''
    Need to account for different cases of regression vs classification
    dataframe - 
    regression - 
    '''

    X = dataframe.iloc[:,:-1]
    y = dataframe.iloc[:,-1]
    
    # scale the data
    data_scaler_x = StandardScaler()
    X = data_scaler_x.fit_transform(X.values)

    if regression is True:
        data_scaler_y = StandardScaler()
        y = data_scaler_y.fit_transform(y.values.reshape(-1,1))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 0)
    return X_train, X_test, y_train, y_test

In [8]:
data_clf = convert_regression_to_classification(data_rg)

In [9]:
data_clf.head()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.08767,-0.133416,-2.333333,0.175,-0.166667,0.333333,1
10,-0.209946,0.005086,0.005019,0.005256,-0.24359,0.0,0.0,0.0,1
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.05,-0.666667,1
11,0.0,-0.756467,-0.024635,-0.002249,-0.22029,0.085039,0.0,-0.098667,0
5,0.12953,-0.949155,-3.44281,0.033613,-0.025641,0.461538,0.444444,-0.124476,1


In [10]:
data_clf.iloc[:,-1].head()

9     1
10    1
5     1
11    0
5     1
Name: Returns, dtype: int64

# ML methods

## Useful functions

In [12]:
def persist_model(model,filename):
    '''
    Given an sklearn model object, save the resulting model to file filename.
    Store models in directory ../models.
    '''

    with open(filename, 'wb') as f:
        pickle.dump(model,f)
    
    # only useful to load models for testing
    #with open('../models/test_lasso.pickle','rb') as f:
    #    test_lasso_2 = pickle.load(f)

In [13]:
def return_regress_metrics(y_test,y_pred):
    '''
    Given a regression type problem model (sklearn), return the following metrics as a list:
    Mean Absolute Error (MAE)
    Mean Squared Error (MSE)
    R^2 error
    Mean Absolute Percentage Error (MAPE)
    '''

    # Mean Absolute Error (MAE)
    MAE = mean_absolute_error(y_test, y_pred)
    print(f'Mean Absolute Error (MAE): {np.round(MAE, 2)}')

    # Mean Squared Error (MSE)
    MSE = mean_squared_error(y_test,y_pred)
    print(f'Mean Squared Error (MSE): {np.round(MSE, 2)}')

    # R^2 error
    R2 = r2_score(y_test, y_pred)
    print(f'R^2 error (test): {np.round(R2, 2)}')

    # Mean Absolute Percentage Error (MAPE)
    MAPE = mean_absolute_percentage_error(y_test,y_pred)
    print(f'Mean Absolute Percentage Error (MAPE): {np.round(MAPE, 2)}')

    return [R2, MAE, MSE, MAPE]

In [14]:
def return_class_metrics(y_test,y_pred):
    '''
    Given a regression type problem model (sklearn), return the following metrics as a list:
    F1 Score
    Precision Score
    AUC
    Accuracy Score
    '''
    
    # Accuracy Score
    AS = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score (test): {np.round(AS, 2)}')
    
    # F1 score (best 1 - worst 0)
    f1 = f1_score(y_test,y_pred)
    print(f'F1: {np.round(f1, 2)}')

    # precision_score (the ability of the classifier not to label as positive a sample that is negative, best 1 - worst 0)
    PS = precision_score(y_test,y_pred)
    print(f'Precision Score: {np.round(PS, 2)}')

    # roc_auc_score
    AUC = roc_auc_score(y_test,y_pred)
    print(f'Reciever Operating Curve (Area Under Curve): {np.round(AUC, 2)}')

    return [AS, f1, PS, AUC]

## Implement baseline (linear) models

In [15]:
# gen_train_test(test,regression=True)
X_train, X_test, y_train, y_test =  gen_train_test(data_rg,regression=True)

### LASSO regression

In [19]:
def lasso_run(X_train, X_test, y_train, y_test):
    '''
    Fit LASSO to training data and perform 5-fold CV (grid search). Return:
    [0] - model as an object
    [1] - metrics [R2_train, R2_test, MAE, MSE, MAPE]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''
    
    grid = {
        'alpha': list(np.logspace(-2, 3, 6))
    }

    reg_cv = GridSearchCV(estimator=linear_model.Lasso(), param_grid=grid,cv=5)
    reg_cv.fit(X_train, y_train)

    reg = linear_model.Lasso(alpha=reg_cv.best_params_['alpha']).fit(X_train,y_train)

    y_pred_scaled = reg.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # R^2 train error
    R2_train = reg.score(X_train, y_train)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    # I don't need this line because it's given in the return_regress_metrics function.
    # print(f'R^2 error (test): {np.round(reg.score(X_test, y_test),5)}')

    metrics = return_regress_metrics(y_test,y_pred)
    metrics.insert(0,R2_train)

    return reg_cv, metrics, y_pred, reg


In [20]:
test_lasso = lasso_run(X_train, X_test, y_train, y_test)
persist_model(test_lasso,"../models/proto/LASSO.pickle")

R^2 error (train): 0.0
Mean Absolute Error (MAE): 0.61
Mean Squared Error (MSE): 1.01
R^2 error (test): -0.0
Mean Absolute Percentage Error (MAPE): 1.0


In [18]:
# Example of how to persist the model manually
# persist_model(test_lasso[0],'test.pickle')

In [43]:
# implemented Diebold-Mariano test

In [21]:
dm_test(y_test, y_test, test_lasso[2], one_sided=False)

(array([-2.27316741]), array([0.02539584]))

In [22]:
test_lasso[2]

array([-0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00056127,
       -0.00056127, -0.00056127, -0.00056127, -0.00056127, -0.00

### Logistic regression

In [23]:
# check data balance
number_down_days = data_clf['Returns'][data_clf['Returns'] == 0].count()
number_up_days = data_clf['Returns'][data_clf['Returns'] == 1].count()

In [24]:
number_down_days

211

In [25]:
number_up_days

244

In [26]:
Xclf_train, Xclf_test, yclf_train, yclf_test =  gen_train_test(data_clf,regression=False)

In [27]:
def logistic_run(X_train, X_test, y_train, y_test):
    '''
    Fit Logistic Regression model to training data and perform 5-fold CV (grid search). Return:
    [0] - model as an object
    [1] - metrics [AS train, AS test, f1, PS]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''
    
    grid = [
        {
        'penalty': ['l1', 'l2'],
        'C': list(np.logspace(-2, 3, 6)),
            'solver':['saga']
        
    },
        {
         'penalty': ['elasticnet'],
            'C': list(np.logspace(-2, 3, 6)),
            'l1_ratio': list(np.linspace(0,1,5)),
            'solver':['saga']
        }
    ]

    log_cv = GridSearchCV(estimator=linear_model.LogisticRegression(), param_grid=grid,cv=5)
    log_cv.fit(X_train, y_train) # fit(X_train,np.ravel(y_train))

    log = linear_model.LogisticRegression(penalty=log_cv.best_params_['penalty'],\
                                          C=log_cv.best_params_['C'],l1_ratio=log_cv.best_params_['l1_ratio'],\
                                         solver='saga').fit(X_train,y_train)

    y_pred_scaled = log.predict(X_test)
    y_pred = y_pred_scaled

    # Accuracy score on training set
    AS_train = log.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    metrics = return_class_metrics(y_test,y_pred)
    metrics.insert(0,AS_train)

    return log_cv, metrics, y_pred, log


In [28]:
test_logistic = logistic_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
persist_model(test_logistic,"../models/proto/Logistic.pickle")



Accuracy Score (train): 0.5467
Accuracy Score (test): 0.49
F1: 0.66
Precision Score: 0.49
Reciever Operating Curve (Area Under Curve): 0.5




## Regression

### SVM regression

In [65]:
def SVR_run(X_train, X_test, y_train, y_test):
    '''
    Fit SVM regression to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [R2_train, R2_test, MAE, MSE, MAPE]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'kernel': ['linear','poly','rbf','sigmoid'],
        'C': list(np.logspace(-2, 3, 6)), 
        'epsilon': [0.01,0.1,1,10]
    }

    svr_cv = GridSearchCV(estimator=svm.SVR(), param_grid=grid,cv=5,n_jobs=4)
    svr_cv.fit(X_train, np.ravel(y_train))

    svr = svm.SVR(kernel=svr_cv.best_params_['kernel'],C=svr_cv.best_params_['C'],epsilon=svr_cv.best_params_['epsilon']).fit(X_train,y_train)

    y_pred_scaled = svr.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # R^2 train error
    R2_train = svr.score(X_train, y_train)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    metrics = return_regress_metrics(y_test,y_pred)
    metrics.insert(0,R2_train)

    return svr_cv, metrics, y_pred, svr


In [61]:
ml_svr = SVR_run(X_train, X_test, y_train, y_test)
persist_model(ml_svr,"../models/proto/ml_svr.pickle")

R^2 error (train): -0.00771
Mean Absolute Error (MAE): 0.57
Mean Squared Error (MSE): 1.03
R^2 error (test): -0.02
Mean Absolute Percentage Error (MAPE): 1.93


  y = column_or_1d(y, warn=True)


### Decision Tree Regressor

In [74]:
def DTR_run(X_train, X_test, y_train, y_test):
    '''
    Fit Decision Tree regression to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [R2_train, R2_test, MAE, MSE, MAPE]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'criterion': ['squared_error','friedman_mse','absolute_error','poisson'],
        'splitter': ['best','random'],
        'max_features': ['sqrt', 'log2',None],
        'max_depth' : [3,4,5,6,7,8, None],
        'ccp_alpha': list(np.logspace(-2, 3, 6)),
    }

    dtr_cv = GridSearchCV(estimator=tree.DecisionTreeRegressor(), param_grid=grid,cv=5,n_jobs=4)
    dtr_cv.fit(X_train, y_train)#np.ravel(y_train))

    dtr = tree.DecisionTreeRegressor(criterion=dtr_cv.best_params_['criterion'],\
                                     splitter=dtr_cv.best_params_['splitter'],\
                                     max_features=dtr_cv.best_params_['max_features'],\
                                     max_depth=dtr_cv.best_params_['max_depth'],\
                                     ccp_alpha=dtr_cv.best_params_['ccp_alpha']).fit(X_train,y_train)

    y_pred_scaled = dtr.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # R^2 train error
    R2_train = dtr.score(X_train, y_train)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    metrics = return_regress_metrics(y_test,y_pred)
    metrics.insert(0,R2_train)

    return dtr_cv, metrics, y_pred, dtr

In [75]:
ml_dtr = DTR_run(X_train, X_test, y_train, y_test)
persist_model(ml_dtr,"../models/proto/ml_dtr.pickle")

R^2 error (train): 0.0
Mean Absolute Error (MAE): 0.61
Mean Squared Error (MSE): 1.01
R^2 error (test): -0.0
Mean Absolute Percentage Error (MAPE): 1.0


1260 fits failed out of a total of 5040.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1260 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 1320, in fit
    super()._fit(
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 259, in _fit
    raise ValueError(
ValueError: S

### Ada Boost Regressor

In [None]:
def ABR_run(X_train, X_test, y_train, y_test):
    '''
    Fit Ada Boost Regressor using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [R2_train, R2_test, MAE, MSE, MAPE]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'estimator': [None], # None = DecisionTreeRegressor with max_depth=3
        'n_estimators': list(np.arange(1,201,5)),
        'learning_rate': list(np.arange(0,200,5)),
        'loss': ['linear','square','exponential']
    }

    abr_cv = GridSearchCV(estimator=ensemble.AdaBoostRegressor(), param_grid=grid,cv=5,n_jobs=4)
    abr_cv.fit(X_train, np.ravel(y_train))

    abr = ensemble.AdaBoostRegressor(estimator=abr_cv.best_params_['estimator'],\
                                     n_estimators=abr_cv.best_params_['n_estimators'],\
                                     learning_rate=abr_cv.best_params_['learning_rate'],\
                                     loss=abr_cv.best_params_['loss']).fit(X_train,y_train)

    y_pred_scaled = abr.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # R^2 train error
    R2_train = abr.score(X_train, y_train)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    metrics = return_regress_metrics(y_test,y_pred)
    metrics.insert(0,R2_train)

    return abr_cv, metrics, y_pred, abr

In [None]:
ml_abr = ABR_run(X_train, X_test, y_train, y_test)
persist_model(ml_abr,"../models/proto/ml_abr.pickle")

### Bagging Regressor

In [142]:
def BR_run(X_train, X_test, y_train, y_test):
    '''
    Fit Bagging Regressor using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [R2_train, R2_test, MAE, MSE, MAPE]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'estimator': [None], # None = DecisionTreeRegressor with max_depth=3
        'n_estimators': list(np.arange(1,50,5)),
        'max_samples': list(np.arange(1,50,5)),
        'max_features': [1.0], # default, I don't want to mess with the bootstrapping for the moment
        'bootstrap': [True], # default, I don't want to mess with the bootstrapping for the moment
        'bootstrap_features': [False] # default, I don't want to mess with the bootstrapping for the moment
    }

    br_cv = GridSearchCV(estimator=ensemble.BaggingRegressor(), param_grid=grid,cv=5,n_jobs=4)
    br_cv.fit(X_train, np.ravel(y_train))

    br = ensemble.BaggingRegressor(estimator=br_cv.best_params_['estimator'],\
                                   n_estimators=br_cv.best_params_['n_estimators'],\
                                   max_samples=br_cv.best_params_['max_samples'],\
                                   max_features=br_cv.best_params_['max_features'],\
                                   bootstrap=br_cv.best_params_['bootstrap'],\
                                   bootstrap_features=br_cv.best_params_['bootstrap_features']).fit(X_train,y_train)

    y_pred_scaled = br.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # R^2 train error
    R2_train = br.score(X_train, y_train)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    metrics = return_regress_metrics(y_test,y_pred)
    metrics.insert(0,R2_train)

    return br_cv, metrics, y_pred, br

In [143]:
ml_br = BR_run(X_train, X_test, y_train, y_test)
persist_model(ml_br,"../models/proto/ml_br.pickle")

R^2 error (train): -0.03054
Mean Absolute Error (MAE): 0.63
Mean Squared Error (MSE): 0.97
R^2 error (test): 0.03
Mean Absolute Percentage Error (MAPE): 1.88


  return column_or_1d(y, warn=True)


### Random Forest Regressor

In [144]:
def RFR_run(X_train, X_test, y_train, y_test):
    '''
    Fit Random Forest Regressor using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [R2_train, R2_test, MAE, MSE, MAPE]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'n_estimators': list(np.arange(1,50,5)),
        'criterion': ['squared_error','friedman_mse','absolute_error','poisson'], # most of these parameters are the same as for decision tree regressor
        'max_depth': [3,4,5,6,7,8, None],
        'max_features': ['sqrt', 'log2',None],
        'max_samples': [None],
        'bootstrap': [True] # default, I don't want to mess with the bootstrapping for the moment
    }

    rfr_cv = GridSearchCV(estimator=ensemble.RandomForestRegressor(), param_grid=grid,cv=5,n_jobs=4)
    rfr_cv.fit(X_train, np.ravel(y_train))

    rfr = ensemble.RandomForestRegressor(n_estimators=rfr_cv.best_params_['n_estimators'],\
                                         criterion=rfr_cv.best_params_['criterion'],\
                                         max_depth=rfr_cv.best_params_['max_depth'],\
                                         max_samples=rfr_cv.best_params_['max_samples'],\
                                         max_features=rfr_cv.best_params_['max_features'],\
                                         bootstrap=rfr_cv.best_params_['bootstrap']).fit(X_train,y_train)

    y_pred_scaled = rfr.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # R^2 train error
    R2_train = rfr.score(X_train, y_train)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    metrics = return_regress_metrics(y_test,y_pred)
    metrics.insert(0,R2_train)

    return rfr_cv, metrics, y_pred, rfr

In [145]:
ml_rfr = RFR_run(X_train, X_test, y_train, y_test)
persist_model(ml_rfr,"../models/proto/ml_rfr.pickle")

R^2 error (train): 0.30575
Mean Absolute Error (MAE): 0.55
Mean Squared Error (MSE): 0.93
R^2 error (test): 0.08
Mean Absolute Percentage Error (MAPE): 1.7


1050 fits failed out of a total of 4200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1050 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 378, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 -0.1628022  -0.06861438 -0.07081743 -0.0

### Gradient Boosting Regressor

In [163]:
def XGB_run(X_train, X_test, y_train, y_test):
    '''
    Fit Gradient Boosting Regressor using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [R2_train, R2_test, MAE, MSE, MAPE]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = { 
        'loss': ['squared_error','absolute_error','huber'],
        'learning_rate': list(np.logspace(-2, 3, 6)),
        'n_estimators': list(np.arange(1,201,5)), # same for 
        'criterion': ['squared_error','friedman_mse'], # most of these parameters are the same as for decision tree regressor
        'max_depth': [1,3,5,7, None],
        'max_features': ['sqrt', 'log2',None]
    }

    xgb_cv = GridSearchCV(estimator=ensemble.GradientBoostingRegressor(), param_grid=grid,cv=5,n_jobs=4)
    xgb_cv.fit(X_train, np.ravel(y_train))

    xgb = ensemble.GradientBoostingRegressor(loss=xgb_cv.params_['loss'],\
                                             learning_rate=xgb_cv.params_['learning_rate'],\
                                             n_estimators=xgb_cv.params_['n_estimators'],\
                                             criterion=xgb_cv.params_['criterion'],\
                                             max_depth=xgb_cv.params_['max_depth'],\
                                             max_features=xgb_cv.params_['max_features']).fit(X_train,y_train)

    y_pred_scaled = xgb.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # R^2 train error
    R2_train = xgb.score(X_train, y_train)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    metrics = return_regress_metrics(y_test,y_pred)
    metrics.insert(0,R2_train)

    return xgb_cv, metrics, y_pred, xgb

In [None]:
ml_xgb = XGB_run(X_train, X_test, y_train, y_test)
persist_model(ml_xgb,"ml_xgb.pickle")

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sampl

## Support vector classification

In [29]:
Xclf_train, Xclf_test, yclf_train, yclf_test =  gen_train_test(data_clf,regression=False)

In [168]:
# functionalise SVC

def SVC_run(X_train, X_test, y_train, y_test):
    '''
    Fit SVC to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [AS_train, AS_test, F1, PS, AUC]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''
    # define grid of parameters to search
    grid = {
        'kernel': ['linear','poly','rbf','sigmoid'],
        'C': list(np.logspace(-2, 3, 6)), 
        'degree': [3]
    }
    
    svc_cv = GridSearchCV(estimator=svm.SVC(), param_grid=grid,cv=5,n_jobs=4)
    svc_cv.fit(X_train,np.ravel(y_train))

    svc = svm.SVC(C=svc_cv.best_params_['C'], kernel=svc_cv.best_params_['kernel']).fit(X_train,y_train)

    # get predicted values (out of sample performance)
    y_pred_scaled = svc.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # Accuracy score on training set
    AS_train = svc.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    metrics = return_class_metrics(y_test,y_pred)
    metrics.insert(0,AS_train)

    return svc_cv, metrics, y_pred, svc

In [169]:
ml_svc = SVC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
persist_model(ml_svc,"../models/proto/ml_svc.pickle")

Accuracy Score (train): 0.49176
Accuracy Score (test): 0.45
F1: 0.48
Precision Score: 0.45
Reciever Operating Curve (Area Under Curve): 0.45


## Decision Tree Classifier

In [170]:
# functionalise decision tree classifier

def DTC_run(X_train, X_test, y_train, y_test):
    '''
    Fit Decision Tree Classifier to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [AS_train, AS_test, F1, PS, AUC]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''
    grid = {
        'criterion': ['gini','entropy','log_loss'],
        'splitter': ['best','random'],
        'max_features': ['sqrt', 'log2',None],
        'max_depth' : [3,4,5,6,7,8, None],
        'ccp_alpha': list(np.logspace(-2, 3, 6)),
    }
    
    dtc_cv = GridSearchCV(estimator=tree.DecisionTreeClassifier(), param_grid=grid,cv=5,n_jobs=4)
    dtc_cv.fit(X_train,y_train)

    dtc = tree.DecisionTreeClassifier(criterion=dtc_cv.best_params_['criterion'],\
                                      splitter=dtc_cv.best_params_['splitter'],\
                                      ccp_alpha=dtc_cv.best_params_['ccp_alpha'],\
                                      max_depth=dtc_cv.best_params_['max_depth'],\
                                      max_features=dtc_cv.best_params_['max_features']).fit(X_train,y_train)
    
    # get predicted values (out of sample performance)
    y_pred_scaled = dtc.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # Accuracy score on training set
    AS_train = dtc.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    metrics = return_class_metrics(y_test,y_pred)
    metrics.insert(0,AS_train)

    return dtc_cv, metrics, y_pred, dtc

In [171]:
ml_dtc = DTC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
persist_model(ml_dtc,"../models/proto/ml_dtc.pickle")

Accuracy Score (train): 0.66484
Accuracy Score (test): 0.63
F1: 0.65
Precision Score: 0.61
Reciever Operating Curve (Area Under Curve): 0.63


### Random Forest Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

In [30]:
def RFC_run(X_train, X_test, y_train, y_test):
    '''
    Fit Random Forest Classifier using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [AS_train, AS_test, F1, PS, AUC]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'n_estimators': list(np.arange(1,200,5)),
        'criterion': ['gini','entropy','log_loss'], # most of these parameters are the same as for decision tree regressor
        'max_depth': [3,4,5,6,7,8, None],
        'max_features': ['sqrt', 'log2',None],
        'max_samples': [None], # default
        'bootstrap': [True] # default, I don't want to mess with the bootstrapping for the moment
    }

    rfc_cv = GridSearchCV(estimator=ensemble.RandomForestClassifier(), param_grid=grid,cv=5,n_jobs=4)
    rfc_cv.fit(X_train, np.ravel(y_train))

    rfc= ensemble.RandomForestClassifier(n_estimators=rfc_cv.best_params_['n_estimators'],\
                                         criterion=rfc_cv.best_params_['criterion'],\
                                         max_depth=rfc_cv.best_params_['max_depth'],\
                                         max_samples=rfc_cv.best_params_['max_samples'],\
                                         max_features=rfc_cv.best_params_['max_features'],\
                                         bootstrap=rfc_cv.best_params_['bootstrap']).fit(X_train,y_train)

    # get predicted values (out of sample performance)
    y_pred_scaled = rfc.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # Accuracy score on training set
    AS_train = rfc.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    metrics = return_class_metrics(y_test,y_pred)
    metrics.insert(0,AS_train)

    return rfc_cv, metrics, y_pred, rfc

In [None]:
ml_rfc = RFC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
persist_model(ml_rfc,"../models/proto/ml_rfc.pickle")

### Bagging Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier

In [174]:
def BC_run(X_train, X_test, y_train, y_test):
    '''
    Fit Bagging Classifier using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [AS_train, AS_test, F1, PS, AUC]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'estimator': [None], # None = DecisionTreeClassifier with max_depth=3
        'n_estimators': list(np.arange(1,50,5)),
        'max_samples': list(np.arange(1,50,5)),
        'max_features': [1.0], # default, I don't want to mess with the bootstrapping for the moment
        'bootstrap': [True], # default, I don't want to mess with the bootstrapping for the moment
        'bootstrap_features': [False] # default, I don't want to mess with the bootstrapping for the moment
    }

    bc_cv = GridSearchCV(estimator=ensemble.BaggingClassifier(), param_grid=grid,cv=5,n_jobs=4)
    bc_cv.fit(X_train, np.ravel(y_train))

    bc = ensemble.BaggingClassifier(estimator=bc_cv.best_params_['estimator'],\
                                   n_estimators=bc_cv.best_params_['n_estimators'],\
                                   max_samples=bc_cv.best_params_['max_samples'],\
                                   max_features=bc_cv.best_params_['max_features'],\
                                   bootstrap=bc_cv.best_params_['bootstrap'],\
                                   bootstrap_features=bc_cv.best_params_['bootstrap_features']).fit(X_train,y_train)

    y_pred_scaled = bc.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # Accuracy score on training set
    AS_train = bc.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    metrics = return_class_metrics(y_test,y_pred)
    metrics.insert(0,AS_train)

    return bc_cv, metrics, y_pred, bc

In [175]:
ml_bc = BC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
persist_model(ml_bc,"../models/proto/ml_bc.pickle")

Accuracy Score (train): 0.67308
Accuracy Score (test): 0.66
F1: 0.7
Precision Score: 0.62
Reciever Operating Curve (Area Under Curve): 0.66


### AdaBoost Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier

In [None]:
def ABC_run(X_train, X_test, y_train, y_test):
    '''
    Fit Ada Boost Classifier using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [AS_train, AS_test, F1, PS, AUC]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = {
        'estimator': [None], # None = DecisionTreeRegressor with max_depth=1
        'n_estimators': list(np.arange(1,201,5)),
        'learning_rate': list(np.arange(0,200,5)),
        'algorithm': ['SAMME.R'] # default
    }

    abc_cv = GridSearchCV(estimator=ensemble.AdaBoostClassifier(), param_grid=grid,cv=5,n_jobs=4)
    abc_cv.fit(X_train, np.ravel(y_train))

    abc = ensemble.AdaBoostClassifier(estimator=abc_cv.best_params_['estimator'],\
                                     n_estimators=abc_cv.best_params_['n_estimators'],\
                                     learning_rate=abc_cv.best_params_['learning_rate'],\
                                     algorithm=abc_cv.best_params_['algorithm']).fit(X_train,y_train)

    y_pred_scaled = abc.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # Accuracy score on training set
    AS_train = abc.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    metrics = return_class_metrics(y_test,y_pred)
    metrics.insert(0,AS_train)

    return abc_cv, metrics, y_pred, abc

In [None]:
ml_abc = ABC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
persist_model(ml_abc,"../models/proto/ml_abc.pickle")

### XGB Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier

In [None]:
def XGBC_run(X_train, X_test, y_train, y_test):
    '''
    Fit Gradient Boosting Classifier using decision trees as the estimator to training data and perform 5-fold CV (grid search). Return:
    [0] - model_cv as an object
    [1] - metrics [AS_train, AS_test, F1, PS, AUC]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    grid = { 
        'loss': ['log_loss','exponential'],
        'learning_rate': list(np.logspace(-2, 3, 6)),
        'n_estimators': list(np.arange(1,201,5)), # same for 
        'criterion': ['squared_error','friedman_mse'], # most of these parameters are the same as for decision tree regressor
        'max_depth': [1,3,5,7, None],
        'max_features': ['sqrt', 'log2',None]
    }

    xgbc_cv = GridSearchCV(estimator=ensemble.GradientBoostingClassifier(), param_grid=grid,cv=5,n_jobs=4)
    xgbc_cv.fit(X_train, np.ravel(y_train))

    xgbc = ensemble.GradientBoostingClassifier(loss=xgbc_cv.params_['loss'],\
                                             learning_rate=xgbc_cv.params_['learning_rate'],\
                                             n_estimators=xgbc_cv.params_['n_estimators'],\
                                             criterion=xgbc_cv.params_['criterion'],\
                                             max_depth=xgbc_cv.params_['max_depth'],\
                                             max_features=xgbc_cv.params_['max_features']).fit(X_train,y_train)

    y_pred_scaled = xgbc.predict(X_test)
    y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

    # Accuracy score on training set
    AS_train = xgbc.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    metrics = return_class_metrics(y_test,y_pred)
    metrics.insert(0,AS_train)

    return xgbc_cv, metrics, y_pred, xgbc

In [None]:
ml_xgbc = XGBC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
persist_model(ml_xgbc,"../models/proto/ml_xgbc.pickle")

## Functions to generate results

In [256]:
rg_models_dict = {
    'LASSO Regression': test_lasso[1],
    'SVM Regression': ml_svr[1],
    'Decision Tree Regression': ml_dtr[1]
}

clf_models_dict = {
    'Logistic Regression': test_logistic[1],
    'SVM Classification': ml_svc[1],
    'Decision Tree Classification': ml_dtc[1]
}

In [257]:
def from_models_return_metrics(models_dict,regression):
    '''
    Input - a dictionary with keys as model names / descr and values as ML metrics, 
          - regression = True [R2_train, R2_test, MAE, MSE, MAPE], False [AS train, AS test, f1, PS]
    Output - dataframe summarising ML metrics with column names defined according to regression
    '''
    if regression is True:
        df = pd.DataFrame.from_dict(models_dict,orient='index',columns=['R^2 Score Train', 'R^2 Score Test', 'MAE', 'MSE', 'MAPE'])
    else:
        df = pd.DataFrame.from_dict(models_dict,orient='index',columns=['Accuracy Train','Accuracy Test', 'F1 Score', 'Precision Score', 'ROC AUC'])
    return df

In [258]:
from_models_return_metrics(rg_models_dict,regression=True)

Unnamed: 0,R^2 Score Train,R^2 Score Test,MAE,MSE,MAPE
LASSO Regression,0.0,-8e-06,0.607139,1.005501,0.997761
SVM Regression,-0.007707,-0.022358,0.570794,1.027974,1.932022
Decision Tree Regression,0.0,-8e-06,0.607139,1.005501,0.997761


In [259]:
from_models_return_metrics(clf_models_dict,regression=False)

Unnamed: 0,Accuracy Train,Accuracy Test,F1 Score,Precision Score,ROC AUC
Logistic Regression,0.546703,0.494505,0.661765,0.494505,0.5
SVM Classification,0.491758,0.450549,0.479167,0.45098,0.451208
Decision Tree Classification,0.664835,0.626374,0.645833,0.607843,0.627053


In [260]:
list(rg_models_dm_dict.keys())

['LASSO Regression', 'SVM Regression', 'Decision Tree Regression']

In [261]:
rg_models_dm_dict = {
    'LASSO Regression': test_lasso[2],
    'SVM Regression': ml_svr[2],
    'Decision Tree Regression': ml_dtr[2]
}

clf_models_dm_dict = {
    'Logistic Regression': test_logistic[2],
    'SVM Classification': ml_svc[2],
    'Decision Tree Classification': ml_dtc[2]
}

In [276]:
def from_models_return_diebold_mariano(models_dict,y_test):
    '''
    Only applies to regression type models.
    Input - a dictionary with keys as model names / descr and values as y_pred. 
          - y_test, the test set common to all models
    Output - dataframe showing p-value of DM test
    '''
    labels = list(models_dict.keys())
    X = list(models_dict.values())
    Y = X

    result = np.zeros((len(X),len(Y)))
    for i in range(len(X)):
        for j in range(len(Y)):
            if i != j:
                result[i,j] = dm_test(y_test, X[i], Y[j], one_sided=False)[1][0]
            else:
                pass
                
    df = pd.DataFrame(result)
    df.index = labels
    df.columns = labels
    return df

In [277]:
from_models_return_diebold_mariano(rg_models_dm_dict,y_test)

Unnamed: 0,LASSO Regression,SVM Regression,Decision Tree Regression
LASSO Regression,0.0,0.443741,0.140301
SVM Regression,0.443741,0.0,0.443741
Decision Tree Regression,0.140301,0.443741,0.0


# (obsolete) Tensorflow tests

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
split_fraction = 0.715
train_split = int(split_fraction * int(df.shape[0]))
step = 6

past = 2
future = 1
learning_rate = 0.001
batch_size = 256
epochs = 10

In [None]:
print(
    "The selected parameters are:",
    ", ".join([titles[i] for i in [0, 1, 5, 7, 8, 10, 11]]),
)
selected_features = [feature_keys[i] for i in [0, 1, 5, 7, 8, 10, 11]]
features = df[selected_features]
features.index = df[date_time_key]
features.head()

features = normalize(features.values, train_split)
features = pd.DataFrame(features)
features.head()

train_data = features.loc[0 : train_split - 1]
val_data = features.loc[train_split:]

In [None]:
start = past + future
end = start + train_split

x_train = train_data[[i for i in range(7)]].values
y_train = features.iloc[start:end][[1]]

sequence_length = int(past / step)

In [None]:
dataset_train = keras.preprocessing.timeseries_dataset_from_array(
    x_train,
    y_train,
    sequence_length=sequence_length,
    sampling_rate=step,
    batch_size=batch_size,
)

In [87]:
test.train[test.train.index == 0]

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
0,-0.360803,3.447233,6.393835,2.463991,-0.617978,-0.194444,0.0,0.0,0.041667
0,-0.16773,2.071429,1.383396,2.139292,-0.034483,0.196581,0.0,0.0,-0.105882
0,-0.118467,-1.077758,3.32293,3.002712,-0.160714,-0.015,0.0,0.0,0.269761
0,0.168897,0.0,2.828341,2.425314,-0.166667,0.105485,0.0,0.0,-0.018919
0,-0.137338,8.858668,0.814978,1.140352,1.397833,-0.107325,0.0,0.0,-0.289474
0,-0.215536,2.891626,2.661053,2.845256,0.221519,-0.020891,0.0,0.0,-0.085798
0,-0.235294,0.717772,2.678497,3.650943,-0.103448,-0.070866,0.0,0.0,0.000713
0,1.141936,1.516245,2.951967,2.931063,0.174757,-0.039634,0.0,0.0,0.214325
0,2.163367,1.601319,3.884052,2.301454,-0.113924,0.039002,0.0,0.0,0.735849
0,-0.341322,-66.739179,-7.620575,1.043916,-0.166667,0.091667,0.0,0.0,0.052632


# Prototype code, unused

In [28]:
# attempt SVM
grid = {
    'kernel': ['linear','poly','rbf','sigmoid'],
    'C': [0.01,0.1,1,10], 
}

SVC_cv = GridSearchCV(estimator=SVC(), param_grid=grid,cv=5)
SVC_cv.fit(X_train,np.ravel(y_train))

In [29]:
SVC_cv.best_params_

{'C': 10, 'kernel': 'rbf'}

In [30]:
SVC_cv.best_params_['C']

10

In [31]:
svc = SVC(C=SVC_cv.best_params_['C'], kernel=SVC_cv.best_params_['kernel']).fit(X_train,y_train)

# get predicted values (out of sample performance)
y_pred_scaled = svc.predict(X_test)
y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

print(f'R^2 error (train): {np.round(svc.score(X_train, y_train),5)}')
print(f'R^2 error (test): {np.round(svc.score(X_test, y_test),5)}')

AS = accuracy_score(y_test, y_pred)
print(f'Accuracy score: {np.round(AS, 2)}')


R^2 error (train): 0.58716
R^2 error (test): 0.52727
Accuracy score: 0.53


In [14]:
# attempt decision tree classifier

grid = {
    'max_features': ['sqrt', 'log2',None],
    'max_depth' : [3,4,5,6,7,8, None],
    'ccp_alpha': list(np.logspace(-2, 3, 6)),
    'random_state' : [0]
}

DTR_cv = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=grid,cv=5)
DTR_cv.fit(X_train,y_train)

In [47]:
np.logspace(-2, 3, 6)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])