Machine learning prototyping notebook. Data preprocessing has already been implemented and tested in data_preproc.py (located in the sample/ directory, which is added to the import path below).

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# IMPORT FUNCTIONS
sys.path.insert(0, '../sample')
import data_preproc
import ML_routines
import models

# LOAD FINANCIAL RATIOS AND ASSET PRICES
test_merge = pd.read_excel('../jupyter-notebooks/test_manual.xlsx')
# DROP THE 'Unnamed: 0' COLUMN (PRESUMABLY AN INDEX THAT WAS SAVED INTO THE SHEET - CONFIRM)
test_merge = test_merge.loc[:, test_merge.columns != 'Unnamed: 0']
test_assets = pd.read_excel('../jupyter-notebooks/asset_prices.xlsx', index_col='Date')

# PREPROCESS FINANCIAL RATIOS DATA, REPLACE STRINGS WITH FLOATS
ML_data = test_merge.map(data_preproc.convert_placeholder_text_to_num)

# ENSURE THE TWO DATAFRAMES CONTAINING FINANCIAL RATIOS (ML_DATA) AND RETURNS (TEST_ASSETS) HAVE THE SAME ASSETS/TICKERS
ML_final = data_preproc.filter_ratios_returns(ML_data, test_assets)

# RESAMPLE THE ASSET PRICES TO QUARTERLY FREQUENCY (LAST OBSERVATION PER QUARTER), THEN BFILL AND FFILL
# WORK ON A REAL COPY: A PLAIN ASSIGNMENT ONLY ALIASES THE FRAME, SO THE INDEX
# ASSIGNMENT BELOW WOULD OTHERWISE ALSO REWRITE test_assets.index
asset_prices = test_assets.copy()
asset_prices.index = pd.to_datetime(asset_prices.index)
asset_prices = asset_prices.resample('Q').last()
# NOTE(review): axis=1 FILLS GAPS ACROSS COLUMNS (I.E. FROM THE NEIGHBOURING TICKER ON THE
# SAME DATE), NOT THROUGH TIME. IF THE INTENT IS TO CARRY PRICES FORWARD/BACKWARD IN TIME,
# THIS SHOULD BE axis=0 - CONFIRM BEFORE CHANGING, AS IT ALTERS THE TRAINING DATA.
asset_prices = asset_prices.bfill(axis=1)
asset_prices = asset_prices.ffill(axis=1)


# BUILD THE ML DATASET OBJECT FROM THE FILTERED RATIOS AND THE QUARTERLY PRICES.
# NOTE(review): returns_lead_by=2 WAS PREVIOUSLY TRIED AS -1; THE EXACT MEANING OF THE
# SIGN IS DEFINED IN data_preproc.FRatioMLdata - CONFIRM THERE.
test = data_preproc.FRatioMLdata(ML_final, asset_prices, sector=None, returns_lead_by=2)

In [2]:
# transform the data into ML compatible format

test.transform()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
2,-0.026975,-0.004681,-0.004744,-0.004536,0.480392,0.000000,0.000000,0.000000,0.050000
3,0.258930,-2.478155,4.193577,-0.806291,1.000000,-0.309524,-0.600000,-0.427141,0.041667
4,-0.475836,-0.002556,-0.002633,-0.002629,-1.864407,0.000000,0.000000,0.000000,-0.153732
5,-0.293669,-0.008364,-0.541817,0.229405,-0.662857,-0.475000,4.000000,0.163001,0.155340
6,0.825410,-0.002780,-0.002607,-0.002793,-0.064171,0.000000,0.000000,0.000000,-0.036178
...,...,...,...,...,...,...,...,...,...
7,0.049659,-1.467892,-0.094140,-0.572862,0.232558,0.146119,0.000000,-0.341260,0.100000
8,-0.026540,3.187525,0.038230,0.513707,0.653846,0.531469,0.000000,0.034440,0.030928
9,-0.029439,-0.762979,0.221575,0.009796,0.000000,-0.089172,0.000000,-0.117264,0.010417
10,-0.051483,-4.752607,0.241513,0.375513,-0.037037,0.154412,0.100000,0.075935,0.185185


In [3]:
# test the dataframe shuffling procedure. Ultimately, probably better to do this by invoking shuffle directly, rather than as a method of the object.
# test.shuffle()

In [4]:
# Shuffle the rows of the transformed training frame. random_state is fixed so
# the row order (and everything derived from it) is reproducible across runs.

data_rg = shuffle(test.train,random_state=0)

In [5]:
data_rg

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.087670,-0.133416,-2.333333,0.175000,-0.166667,0.333333,0.575757
10,-0.209946,0.005086,0.005019,0.005256,-0.243590,0.000000,0.000000,0.000000,0.050450
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.050000,-0.666667,0.000000
11,0.000000,-0.756467,-0.024635,-0.002249,-0.220290,0.085039,0.000000,-0.098667,-0.024042
5,0.129530,-0.949155,-3.442810,0.033613,-0.025641,0.461538,0.444444,-0.124476,0.000000
...,...,...,...,...,...,...,...,...,...
5,0.018908,-0.055898,-1.656752,-0.117811,0.227273,-1.259259,2.333333,0.063901,0.333333
10,-0.163981,0.005127,0.005139,0.005117,-0.224490,0.000000,0.000000,0.000000,0.184783
9,0.011868,-1.302406,-1.609848,-0.074476,-0.097561,-0.581081,0.000000,-0.075949,-0.016667
7,-0.003843,-0.464031,-0.222056,-0.209207,0.229167,0.041397,-0.363636,0.117647,-0.001678


## Converting between returns and trend prediction

In [6]:
data_clf = ML_routines.convert_regression_to_classification(data_rg)

In [7]:
data_clf.head()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.08767,-0.133416,-2.333333,0.175,-0.166667,0.333333,1
10,-0.209946,0.005086,0.005019,0.005256,-0.24359,0.0,0.0,0.0,1
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.05,-0.666667,1
11,0.0,-0.756467,-0.024635,-0.002249,-0.22029,0.085039,0.0,-0.098667,0
5,0.12953,-0.949155,-3.44281,0.033613,-0.025641,0.461538,0.444444,-0.124476,1


In [8]:
data_clf.iloc[:,-1].head()

9     1
10    1
5     1
11    0
5     1
Name: Returns, dtype: int64

# ML methods

## Load pretrained models or run them

In [2]:
# Names of the regression models whose saved results are looked up in the next cell.
# The last entry is a deliberate non-existent name: it shows up under "Missing models"
# in that cell's output, demonstrating the missing-model report.
rg_models_list = [
    'LASSO',
    'ml_svr',
    'ml_dtr',
    'ml_br',
    'something that doesnt exist yet'
] # list containing desired models

# NOTE(review): `lag` is not referenced by any visible cell; the lookup call that
# follows passes the literal 1 instead - confirm which value is intended.
lag = 2

In [3]:
ML_routines.return_models_not_in_folder(rg_models_list,'../models/proto',1)

Missing models:

something that doesnt exist yet


{'LASSO': [0.0,
  -7.832587750167264e-06,
  0.6071385084750027,
  1.005501214101306,
  0.9977612129181752],
 'ml_svr': [-0.007707253711481732,
  -0.022357534377815735,
  0.5707938468251097,
  1.0279736903683765,
  1.9320220570680278],
 'ml_dtr': [0.13000956935558983,
  -0.000469106332092073,
  0.600229759414848,
  1.0059650217784604,
  1.0498505993306246],
 'ml_br': [0.14532471991691265,
  0.0977383112625887,
  0.566125415757059,
  0.907218117597047,
  1.7520858539592736]}

## Implement models

In [8]:
X_train, X_test, y_train, y_test =  ML_routines.gen_train_test(data_rg,regression=True)

In [10]:
# check data balance
# Class balance of the binary target: label 0 = "down", label 1 = "up".
# Summing the boolean mask counts the rows carrying each label.
number_down_days = data_clf['Returns'].eq(0).sum()
number_up_days = data_clf['Returns'].eq(1).sum()

In [11]:
number_down_days

211

In [12]:
number_up_days

244

In [9]:
Xclf_train, Xclf_test, yclf_train, yclf_test =  ML_routines.gen_train_test(data_clf,regression=False)

### MLP Classifier

In [16]:
import keras
import tensorflow as tf
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import GridSearchCV

# fix random seed for reproducibility
tf.random.set_seed(0)

In [48]:
def MLP_clf(hidden_layers, neurons, dropout,optimizer='adam',activation='relu',input_dim=8):
    '''
    Build and compile a feed-forward (MLP) binary classifier.

    Parameters:
    hidden_layers - number of Dense + Dropout blocks stacked after the input layer
    neurons       - number of units in every hidden Dense layer
    dropout       - rate passed to the Dropout layer that follows each hidden Dense layer
    optimizer     - optimizer handed to model.compile
    activation    - activation used by every hidden Dense layer
    input_dim     - number of input features; defaults to 8 (the 8 financial ratios)

    Return:
    a compiled keras Sequential model with a single sigmoid output unit
    '''
    model = keras.models.Sequential()

    # input layer: one value per feature (8 financial ratios by default)
    model.add(keras.layers.Input(shape=(input_dim,)))

    for _ in range(hidden_layers):
        model.add(keras.layers.Dense(units=neurons, activation=activation))
        model.add(keras.layers.Dropout(dropout))

    # binary classification: a single sigmoid unit yields the probability of class 1
    model.add(keras.layers.Dense(1, activation="sigmoid"))

    # The model is returned already compiled with this function's `optimizer` argument.
    # NOTE(review): confirm that the wrapper-level `optimizer` grid entry actually reaches
    # this argument; scikeras may require the `model__optimizer` routing prefix for that.
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])
    return model


In [122]:
def MLP_clf_run(X_train, X_test, y_train, y_test):
    '''
    Fit an MLP NN classifier to training data and perform 5-fold CV (grid search).

    This is a sequential model from the keras package. scikeras package is used to interface the keras
    objects with sklearn so that GridSearchCV can be performed.

    The final model is the grid search's own refit of the best parameter combination
    on the full training set (GridSearchCV refits by default), so no second training
    run is needed.

    Parameters:
    X_train, y_train - features / binary labels used for the grid search and the final fit
    X_test, y_test   - held-out features / labels used for the reported test metrics

    Return:
    [0] - model_cv as an object
    [1] - metrics [AS_train, AS_test, F1, PS, AUC]
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    clf = KerasClassifier(model=MLP_clf, verbose=False)

    # `model__*` keys are routed by scikeras to the arguments of MLP_clf.
    grid = {
        #'optimizer__learning_rate': [0.05, 0.1], # adam adapts its learning_rate automatically.
        'model__hidden_layers': [1, 2, 3],
        'model__neurons': [32],# 64, 128],
        'model__dropout': [0, 0.5],
        'model__activation': ['relu','softmax', 'tanh', 'sigmoid', 'linear'],
        'optimizer': ['Adam'],# 'SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam']
        #'batch_size': [10,20],#40,60,80,100],
        #'epochs': [10]#,50,100]
    }

    NN_cv = GridSearchCV(clf, grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=False)
    NN_cv.fit(X_train, y_train)

    # Reuse the estimator that the search already retrained with the winning
    # parameters instead of rebuilding and fitting an identical one by hand.
    NN = NN_cv.best_estimator_

    y_pred = NN.predict(X_test)

    # Accuracy score on training set
    AS_train = NN.score(X_train, y_train)

    print(f'Accuracy Score (train): {np.round(AS_train,5)}')

    # test-set metrics come from the project helper; the train accuracy is put first
    metrics = ML_routines.return_class_metrics(y_test, y_pred)
    metrics.insert(0, AS_train)

    return NN_cv, metrics, y_pred, NN

In [123]:
MLP_clf_run(Xclf_train, Xclf_test, yclf_train, yclf_test)

2023-10-02 09:11:22.536281: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 09:11:22.537057: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 09:11:22.538306: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 09:11:22.569729: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 09:11:22.592266: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 09:11:22.624786: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 09:11:22.634591: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 09:11:22.651805: I tensorflow/tsl/cud

Accuracy Score (train): 0.5467
Accuracy Score (test): 0.49
F1: 0.66
Precision Score: 0.49
Reciever Operating Curve (Area Under Curve): 0.5


(GridSearchCV(cv=5,
              estimator=KerasClassifier(model=<function MLP_clf at 0x7f773dba7d90>, verbose=False),
              n_jobs=-1,
              param_grid={'model__activation': ['relu', 'softmax', 'tanh',
                                                'sigmoid', 'linear'],
                          'model__dropout': [0, 0.5],
                          'model__hidden_layers': [1, 2, 3],
                          'model__neurons': [32], 'optimizer': ['Adam']},
              scoring='accuracy', verbose=False),
 [0.5467032967032966,
  0.4945054945054945,
  0.6617647058823529,
  0.4945054945054945,
  0.5],
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]),
 KerasClassifier(
 	model=<function MLP_clf at 0x7f773dba7d9

### MLP Regression

In [134]:
def MLP_rg(hidden_layers, neurons, dropout,optimizer='adam',activation='relu',input_dim=8):
    '''
    Build and compile a feed-forward (MLP) regressor.

    Parameters:
    hidden_layers - number of Dense + Dropout blocks stacked after the input layer
    neurons       - number of units in every hidden Dense layer
    dropout       - rate passed to the Dropout layer that follows each hidden Dense layer
    optimizer     - optimizer handed to model.compile
    activation    - activation used by every hidden Dense layer
    input_dim     - number of input features; defaults to 8 (the 8 financial ratios)

    Return:
    a compiled keras Sequential model with a single linear output unit, trained on MSE
    '''
    model = keras.models.Sequential()

    # input layer: one value per feature (8 financial ratios by default)
    model.add(keras.layers.Input(shape=(input_dim,)))

    for _ in range(hidden_layers):
        model.add(keras.layers.Dense(units=neurons, activation=activation))
        model.add(keras.layers.Dropout(dropout))

    # regression: a single unit with no activation, so the output is unbounded
    model.add(keras.layers.Dense(1))

    # NOTE(review): KerasRegressor.r_squared depends on the installed scikeras version
    # exposing that attribute - confirm before upgrading scikeras.
    model.compile(loss="mse", optimizer=optimizer, metrics=[KerasRegressor.r_squared])
    return model


In [137]:
def MLP_rg_run(X_train, X_test, y_train, y_test):
    '''
    Fit an MLP NN regressor to training data and perform 5-fold CV (grid search).

    This is a sequential model from the keras package. scikeras package is used to interface the keras
    objects with sklearn so that GridSearchCV can be performed.

    The final model is the grid search's own refit of the best parameter combination
    on the full training set (GridSearchCV refits by default), so no second training
    run is needed.

    Parameters:
    X_train, y_train - features / targets used for the grid search and the final fit
                       (y_train is flattened to 1-D before fitting)
    X_test, y_test   - held-out features / targets used for the reported test metrics

    Return:
    [0] - model_cv as an object
    [1] - metrics: train R^2 first, followed by whatever
          ML_routines.return_regress_metrics returns (test R^2, MAE, MSE, MAPE
          judging by the notebook output - confirm against that helper)
    [2] - predicted values on test set y_test
    [3] - model as an object
    '''

    rg = KerasRegressor(model=MLP_rg, verbose=False)

    # `model__*` keys are routed by scikeras to the arguments of MLP_rg.
    grid = {
        #'optimizer__learning_rate': [0.05, 0.1], # adam adapts its learning_rate automatically.
        'model__hidden_layers': [1, 2, 3],
        'model__neurons': [32],# 64, 128],
        'model__dropout': [0, 0.5],
        'model__activation': ['relu','softmax', 'tanh', 'sigmoid', 'linear'],
        'optimizer': ['Adam'],# 'SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam']
        #'batch_size': [10,20],#40,60,80,100],
        #'epochs': [10]#,50,100]
    }

    # flatten the target once; it is reused for the search and the train score
    y_train_flat = np.ravel(y_train)

    # no `scoring` given, so the search ranks candidates by the estimator's own score
    NN_cv = GridSearchCV(rg, grid, cv=5, n_jobs=-1, verbose=False)
    NN_cv.fit(X_train, y_train_flat)

    # Reuse the estimator that the search already retrained with the winning
    # parameters instead of rebuilding and fitting an identical one by hand.
    NN = NN_cv.best_estimator_

    y_pred = NN.predict(X_test)

    # R^2 score on training set
    R2_train = NN.score(X_train, y_train_flat)

    print(f'R^2 error (train): {np.round(R2_train,5)}')

    # test-set metrics come from the project helper; the train R^2 is put first
    metrics = ML_routines.return_regress_metrics(y_test, y_pred)
    metrics.insert(0, R2_train)

    return NN_cv, metrics, y_pred, NN

In [138]:
MLP_rg_run(X_train, X_test, y_train, y_test)

R^2 error (train): -0.00077
Mean Absolute Error (MAE): 0.62
Mean Squared Error (MSE): 1.01
R^2 error (test): -0.0
Mean Absolute Percentage Error (MAPE): 1.1


(GridSearchCV(cv=5,
              estimator=KerasRegressor(model=<function MLP_rg at 0x7f76f0766950>, verbose=False),
              n_jobs=-1,
              param_grid={'model__activation': ['relu', 'softmax', 'tanh',
                                                'sigmoid', 'linear'],
                          'model__dropout': [0, 0.5],
                          'model__hidden_layers': [1, 2, 3],
                          'model__neurons': [32], 'optimizer': ['Adam']},
              verbose=False),
 [-0.0007673158616139375,
  -0.003237019517524864,
  0.6159712666613331,
  1.0087481400479221,
  1.100504792313193],
 array([ 0.02017784,  0.01789396,  0.01676052,  0.01833169,  0.02047124,
         0.02132298,  0.01734638,  0.01833281,  0.02232034,  0.0175805 ,
         0.01365416,  0.02021439,  0.02525225,  0.01855992,  0.02038604,
         0.01987434,  0.02761941,  0.01641712,  0.02200782,  0.00482366,
         0.02029385,  0.01900963,  0.01537822, -0.02357827,  0.03400206,
         0.

## Functions to generate results

In [26]:
# Map a display name to element [1] of each fitted model's result tuple.
# Presumably [1] is the metrics list, following the (model_cv, metrics, y_pred, model)
# return convention of the *_run functions in this notebook - confirm.
# NOTE(review): test_lasso, ml_svr, ml_dtr, test_logistic, ml_svc and ml_dtc are not
# defined in any visible cell; this cell relies on kernel state and will fail on a
# fresh "Restart & Run All" unless those results are created or loaded first.
rg_models_dict = {
    'LASSO Regression': test_lasso[1],
    'SVM Regression': ml_svr[1],
    'Decision Tree Regression': ml_dtr[1]
}

clf_models_dict = {
    'Logistic Regression': test_logistic[1],
    'SVM Classification': ml_svc[1],
    'Decision Tree Classification': ml_dtc[1]
}

In [28]:
ML_routines.from_models_return_metrics(rg_models_dict,regression=True)

Unnamed: 0,R^2 Score Train,R^2 Score Test,MAE,MSE,MAPE
LASSO Regression,0.0,-8e-06,0.607139,1.005501,0.997761
SVM Regression,-0.007707,-0.022358,0.570794,1.027974,1.932022
Decision Tree Regression,0.13001,-0.000469,0.60023,1.005965,1.049851


In [29]:
ML_routines.from_models_return_metrics(clf_models_dict,regression=False)

Unnamed: 0,Accuracy Train,Accuracy Test,F1 Score,Precision Score,ROC AUC
Logistic Regression,0.546703,0.494505,0.661765,0.494505,0.5
SVM Classification,0.491758,0.450549,0.479167,0.45098,0.451208
Decision Tree Classification,0.681319,0.626374,0.645833,0.607843,0.627053


In [30]:
# Map a display name to element [2] of each fitted model's result tuple, used as input
# to the Diebold-Mariano comparison. Presumably [2] holds the test-set predictions,
# following the (model_cv, metrics, y_pred, model) return convention - confirm.
# NOTE(review): the result tuples referenced here are not defined in any visible cell;
# they must exist in the kernel before this cell is run.
rg_models_dm_dict = {
    'LASSO Regression': test_lasso[2],
    'SVM Regression': ml_svr[2],
    'Decision Tree Regression': ml_dtr[2]
}

clf_models_dm_dict = {
    'Logistic Regression': test_logistic[2],
    'SVM Classification': ml_svc[2],
    'Decision Tree Classification': ml_dtc[2]
}

In [32]:
ML_routines.from_models_return_diebold_mariano(rg_models_dm_dict,y_test)

Unnamed: 0,LASSO Regression,SVM Regression,Decision Tree Regression
LASSO Regression,0.0,0.443741,0.907901
SVM Regression,0.443741,0.0,0.38738
Decision Tree Regression,0.907901,0.38738,0.0
