Machine learning prototyping notebook. Data preprocessing has already been implemented and tested in `data_preproc.py` (in `../sample/`, which the first cell adds to `sys.path`).

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# IMPORT FUNCTIONS
sys.path.insert(0, '../sample')
import data_preproc
import ML_routines
import models

# LOAD FINANCIAL RATIOS AND ASSET PRICES
test_merge = pd.read_excel('../jupyter-notebooks/test_manual.xlsx')
# Drop the 'Unnamed: 0' column if the spreadsheet carries one.
test_merge = test_merge.loc[:, test_merge.columns != 'Unnamed: 0']
test_assets = pd.read_excel('../jupyter-notebooks/asset_prices.xlsx',index_col='Date')

# PREPROCESS FINANCIAL RATIOS DATA, REPLACE STRINGS WITH FLOATS
# (element-wise; DataFrame.map requires pandas >= 2.1)
ML_data = test_merge.map(data_preproc.convert_placeholder_text_to_num)

# ENSURE THE TWO DATAFRAMES CONTAINING FINANCIAL RATIOS (ML_DATA) AND RETURNS (TEST_ASSETS) HAVE THE SAME ASSETS/TICKERS
ML_final = data_preproc.filter_ratios_returns(ML_data,test_assets)

# RESAMPLE THE ASSET TABLE FROM MONTHLY TO QUARTERLY, THEN BFILL AND FFILL
# Work on a real copy: plain assignment only aliases `test_assets`, so the index
# assignment below would otherwise modify the frame loaded above.
asset_prices = test_assets.copy()
asset_prices.index = pd.to_datetime(asset_prices.index)
# Keep the last observation of each quarter.
# NOTE: pandas >= 2.2 warns that the 'Q' alias is deprecated in favour of 'QE'.
asset_prices = asset_prices.resample('Q').last()
# NOTE(review): axis=1 fills along each row, i.e. a gap is filled from the neighbouring
# column (a different asset, if columns are tickers), not from an adjacent quarter.
# If filling through time was intended this should be axis=0 — confirm before changing,
# since it alters every downstream number.
asset_prices = asset_prices.bfill(axis=1)
asset_prices = asset_prices.ffill(axis=1)

# BUILD THE ML DATASET OBJECT FROM THE FILTERED RATIOS AND THE QUARTERLY ASSET TABLE
# NOTE(review): returns_lead_by=2 is hard-coded here while a later cell defines `lag = 2`;
# presumably these are meant to be the same setting — confirm.
test = data_preproc.FRatioMLdata(ML_final,asset_prices,sector=None,returns_lead_by=2)

In [2]:
# transform the data into ML compatible format
# The frame rendered below has eight ratio columns plus a `Returns` column.
# Later cells read `test.train`, which is presumably populated by this call — confirm.

test.transform()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
2,-0.026975,-0.004681,-0.004744,-0.004536,0.480392,0.000000,0.000000,0.000000,0.050000
3,0.258930,-2.478155,4.193577,-0.806291,1.000000,-0.309524,-0.600000,-0.427141,0.041667
4,-0.475836,-0.002556,-0.002633,-0.002629,-1.864407,0.000000,0.000000,0.000000,-0.153732
5,-0.293669,-0.008364,-0.541817,0.229405,-0.662857,-0.475000,4.000000,0.163001,0.155340
6,0.825410,-0.002780,-0.002607,-0.002793,-0.064171,0.000000,0.000000,0.000000,-0.036178
...,...,...,...,...,...,...,...,...,...
7,0.049659,-1.467892,-0.094140,-0.572862,0.232558,0.146119,0.000000,-0.341260,0.100000
8,-0.026540,3.187525,0.038230,0.513707,0.653846,0.531469,0.000000,0.034440,0.030928
9,-0.029439,-0.762979,0.221575,0.009796,0.000000,-0.089172,0.000000,-0.117264,0.010417
10,-0.051483,-4.752607,0.241513,0.375513,-0.037037,0.154412,0.100000,0.075935,0.185185


In [3]:
# test the dataframe shuffling procedure. Ultimately, probably better to do this by invoking shuffle directly, rather than as a method of the object.
# test.shuffle()

In [4]:
# Shuffle the rows of the transformed training frame (displayed in the next cell).
# random_state=0 fixes the permutation, so re-running gives the same row order.

data_rg = shuffle(test.train,random_state=0)

In [5]:
# Display the shuffled regression frame (the rendering below elides the middle rows).
data_rg

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.087670,-0.133416,-2.333333,0.175000,-0.166667,0.333333,0.575757
10,-0.209946,0.005086,0.005019,0.005256,-0.243590,0.000000,0.000000,0.000000,0.050450
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.050000,-0.666667,0.000000
11,0.000000,-0.756467,-0.024635,-0.002249,-0.220290,0.085039,0.000000,-0.098667,-0.024042
5,0.129530,-0.949155,-3.442810,0.033613,-0.025641,0.461538,0.444444,-0.124476,0.000000
...,...,...,...,...,...,...,...,...,...
5,0.018908,-0.055898,-1.656752,-0.117811,0.227273,-1.259259,2.333333,0.063901,0.333333
10,-0.163981,0.005127,0.005139,0.005117,-0.224490,0.000000,0.000000,0.000000,0.184783
9,0.011868,-1.302406,-1.609848,-0.074476,-0.097561,-0.581081,0.000000,-0.075949,-0.016667
7,-0.003843,-0.464031,-0.222056,-0.209207,0.229167,0.041397,-0.363636,0.117647,-0.001678


## Converting between returns and trend prediction

In [6]:
# Derive the classification dataset from the shuffled regression frame.
# In the outputs recorded below, `Returns` becomes a 0/1 int64 label and rows whose
# return is displayed as 0.000000 come out as 1 — presumably the rule is
# "return >= 0 -> 1"; confirm in ML_routines.convert_regression_to_classification.
data_clf = ML_routines.convert_regression_to_classification(data_rg)

In [7]:
# Preview the first rows of the classification frame.
data_clf.head()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.08767,-0.133416,-2.333333,0.175,-0.166667,0.333333,1
10,-0.209946,0.005086,0.005019,0.005256,-0.24359,0.0,0.0,0.0,1
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.05,-0.666667,1
11,0.0,-0.756467,-0.024635,-0.002249,-0.22029,0.085039,0.0,-0.098667,0
5,0.12953,-0.949155,-3.44281,0.033613,-0.025641,0.461538,0.444444,-0.124476,1


In [8]:
# Preview the label only: the last column by position, which the output names `Returns`.
data_clf.iloc[:,-1].head()

9     1
10    1
5     1
11    0
5     1
Name: Returns, dtype: int64

# ML methods

## Load pretrained models or run them

In [2]:
# Names of the regression models to look for in the model folder (checked in the next cell).
# The first four match the stems of the .pickle files written by the training cells below.
rg_models_list = [
    'LASSO',
    'ml_svr',
    'ml_dtr',
    'ml_br',
    'something that doesnt exist yet'  # presumably a deliberate placeholder; it is what appears under "Missing models"
] # list containing desired models

# NOTE(review): `lag` is not referenced by any other cell visible in this notebook,
# while the first cell hard-codes returns_lead_by=2 — confirm whether they should be linked.
lag = 2

In [3]:
# Check rg_models_list against the contents of ../models/proto.
# In the recorded output the call prints the missing names and returns a dict mapping each
# model found to five numbers; these equal the R^2 (train), R^2 (test), MAE, MSE and MAPE
# values reported for the same models in the results tables at the end of the notebook.
# NOTE(review): the meaning of the third argument (1) is not visible from here — confirm.
ML_routines.return_models_not_in_folder(rg_models_list,'../models/proto',1)

Missing models:

something that doesnt exist yet


{'LASSO': [0.0,
  -7.832587750167264e-06,
  0.6071385084750027,
  1.005501214101306,
  0.9977612129181752],
 'ml_svr': [-0.007707253711481732,
  -0.022357534377815735,
  0.5707938468251097,
  1.0279736903683765,
  1.9320220570680278],
 'ml_dtr': [0.13000956935558983,
  -0.000469106332092073,
  0.600229759414848,
  1.0059650217784604,
  1.0498505993306246],
 'ml_br': [0.14532471991691265,
  0.0977383112625887,
  0.566125415757059,
  0.907218117597047,
  1.7520858539592736]}

## Implement baseline (linear) models

In [7]:
# Split the shuffled regression frame into train/test features and targets.
# NOTE(review): the split ratio and any seeding live inside ML_routines.gen_train_test
# and are not visible here.
X_train, X_test, y_train, y_test =  ML_routines.gen_train_test(data_rg,regression=True)

### LASSO regression

In [11]:
# Fit the LASSO baseline on the regression split and store the result under ../models/proto.
# In the recorded output both train and test R^2 are ~0, i.e. no better than predicting the mean.
test_lasso = models.lasso_run(X_train, X_test, y_train, y_test)
ML_routines.persist_model(test_lasso,"../models/proto/LASSO.pickle")

R^2 error (train): 0.0
Mean Absolute Error (MAE): 0.61
Mean Squared Error (MSE): 1.01
R^2 error (test): -0.0
Mean Absolute Percentage Error (MAPE): 1.0


### Logistic regression

In [12]:
# Class balance of the 0/1 `Returns` label: count the rows in each class by summing
# a boolean mask per class.
# (The names say "days", but the rows come from data resampled with 'Q' in the first cell.)
number_down_days = (data_clf['Returns'] == 0).sum()
number_up_days = (data_clf['Returns'] == 1).sum()

In [13]:
# Number of rows labelled 0.
number_down_days

211

In [14]:
# Number of rows labelled 1.
number_up_days

244

In [17]:
# Split the classification frame into train/test features and labels.
# NOTE(review): the same call is repeated in the support-vector classification section below.
Xclf_train, Xclf_test, yclf_train, yclf_test =  ML_routines.gen_train_test(data_clf,regression=False)

In [18]:
# Fit the logistic-regression baseline on the classification split and store the result.
# NOTE(review): in the recorded run test accuracy equals precision (0.494505) and ROC AUC is
# exactly 0.5, which is what a model predicting class 1 for every test row would score —
# check the predictions before reading anything into the F1 value.
test_logistic = models.logistic_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
ML_routines.persist_model(test_logistic,"../models/proto/Logistic.pickle")



Accuracy Score (train): 0.5467
Accuracy Score (test): 0.49
F1: 0.66
Precision Score: 0.49
Reciever Operating Curve (Area Under Curve): 0.5




## Regression

### SVM regression

In [20]:
# Fit the support-vector regressor on the regression split and store the result.
# Recorded R^2 is slightly negative on both train and test (worse than predicting the mean).
ml_svr = models.SVR_run(X_train, X_test, y_train, y_test)
ML_routines.persist_model(ml_svr,"../models/proto/ml_svr.pickle")

R^2 error (train): -0.00771
Mean Absolute Error (MAE): 0.57
Mean Squared Error (MSE): 1.03
R^2 error (test): -0.02
Mean Absolute Percentage Error (MAPE): 1.93


### Decision Tree Regressor

In [25]:
# Fit the decision-tree regressor on the regression split and store the result.
# NOTE(review): the recorded output reports 1260 of 5040 fits failing with a ValueError whose
# message is truncated. The Random Forest Regressor cell shows the same one-in-four failure
# rate with "Some value(s) of y are negative which is not allowed for Poisson regression",
# so the search grid in models.DTR_run presumably includes criterion='poisson' — confirm and
# drop it, since the targets here can be negative.
ml_dtr = models.DTR_run(X_train, X_test, y_train, y_test)
ML_routines.persist_model(ml_dtr,"../models/proto/ml_dtr.pickle")

R^2 error (train): 0.13001
Mean Absolute Error (MAE): 0.6
Mean Squared Error (MSE): 1.01
R^2 error (test): -0.0
Mean Absolute Percentage Error (MAPE): 1.05


1260 fits failed out of a total of 5040.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1260 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 1320, in fit
    super()._fit(
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 259, in _fit
    raise ValueError(
ValueError: S

### Ada Boost Regressor

In [9]:
# Fit the AdaBoost regressor on the regression split and store the result.
# NOTE(review): the recorded output reports 50 of 500 fits failing inside scikit-learn's
# parameter validation (traceback truncated); presumably one value in the search grid of
# models.ABR_run is outside the allowed range — confirm.
ml_abr = models.ABR_run(X_train, X_test, y_train, y_test)
ML_routines.persist_model(ml_abr,"../models/proto/ml_abr.pickle")

R^2 error (train): 0.133
Mean Absolute Error (MAE): 0.58
Mean Squared Error (MSE): 0.99
R^2 error (test): 0.02
Mean Absolute Percentage Error (MAPE): 2.03


50 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
 

### Bagging Regressor

In [9]:
# Fit the bagging regressor on the regression split and store the result.
# Recorded test R^2 is 0.1, the highest among the regressors shown in this notebook.
ml_br = models.BR_run(X_train, X_test, y_train, y_test)
ML_routines.persist_model(ml_br,"../models/proto/ml_br.pickle")

R^2 error (train): 0.14532
Mean Absolute Error (MAE): 0.57
Mean Squared Error (MSE): 0.91
R^2 error (test): 0.1
Mean Absolute Percentage Error (MAPE): 1.75


### Random Forest Regressor

In [145]:
# Fit the random-forest regressor on the regression split and store the result.
# The helpers are called through their modules, like every other cell: this notebook only
# does `import models` / `import ML_routines`, so the bare names RFR_run / persist_model
# are undefined on a fresh kernel.
# NOTE(review): the recorded output shows 1050 of 4200 fits failing with "Some value(s) of y
# are negative which is not allowed for Poisson regression" — the search grid presumably
# includes criterion='poisson', which cannot work with negative targets; confirm and drop it.
ml_rfr = models.RFR_run(X_train, X_test, y_train, y_test)
ML_routines.persist_model(ml_rfr,"../models/proto/ml_rfr.pickle")

R^2 error (train): 0.30575
Mean Absolute Error (MAE): 0.55
Mean Squared Error (MSE): 0.93
R^2 error (test): 0.08
Mean Absolute Percentage Error (MAPE): 1.7


1050 fits failed out of a total of 4200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1050 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 378, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 -0.1628022  -0.06861438 -0.07081743 -0.0

### Gradient Boosting Regressor

In [10]:
# Fit the gradient-boosting regressor on the regression split and store the result.
# NOTE(review): the recorded output is dominated by repeated warning fragments from a
# squared-error loss computation and from ufunc.reduce — this looks like numerical overflow
# for some hyperparameter combinations; confirm and narrow the search grid in models.XGB_run.
# NOTE(review): despite the "xgb" name, those fragments look like scikit-learn's gradient
# boosting rather than the XGBoost library — confirm.
ml_xgb = models.XGB_run(X_train, X_test, y_train, y_test)
ML_routines.persist_model(ml_xgb,"../models/proto/ml_xgb.pickle")

  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtyp

R^2 error (train): 0.08743
Mean Absolute Error (MAE): 0.61
Mean Squared Error (MSE): 1.01
R^2 error (test): 0.0
Mean Absolute Percentage Error (MAPE): 1.13


  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
 -4.24267932e-001 -4.07760266e-001 -5.14085864e-001 -5.08609164e-001
 -2.60846367e-002 -1.48664748e-001 -2.95245455e-001 -3.86041950e-001
 -3.66177682e-001 -4.20888352e-001 -5.14857644e-001 -6.14298115e-001
 -2.39681674e-002 -1.98987650e-001 -2.51977303e-001 -3.71735846e-001
 -4.25058707e-001 -4.59255830e-001 -5.04138308e-001 -5.22259756e-001
 -2.72844552e-002 -1.52027425e-001 -2.34145158e-001 -2.92951994e-001
 -3.04269178e-001 -3.32243121e-001 -3.29342070e-001 -4.00795547e-001
 -2.19477981e-002 -1.15799131e-001 -2.64170718e-0

## Classification

### Support vector classification

In [8]:
# NOTE(review): this repeats the split already made in the Logistic regression section.
# If ML_routines.gen_train_test is not seeded, re-running it gives the classifiers below a
# different partition from the logistic baseline, and the metrics table at the end would
# then compare models scored on different test sets — confirm, or split once only.
Xclf_train, Xclf_test, yclf_train, yclf_test =  ML_routines.gen_train_test(data_clf,regression=False)

In [23]:
# Fit the support-vector classifier on the classification split and store the result.
# Recorded test accuracy (0.45) and ROC AUC (0.45) are both below 0.5.
ml_svc = models.SVC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
ML_routines.persist_model(ml_svc,"../models/proto/ml_svc.pickle")

Accuracy Score (train): 0.49176
Accuracy Score (test): 0.45
F1: 0.48
Precision Score: 0.45
Reciever Operating Curve (Area Under Curve): 0.45


### Decision Tree Classifier

In [14]:
# Fit the decision-tree classifier on the classification split and store the result.
ml_dtc = models.DTC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
ML_routines.persist_model(ml_dtc,"../models/proto/ml_dtc.pickle")

Accuracy Score (train): 0.68132
Accuracy Score (test): 0.63
F1: 0.65
Precision Score: 0.61
Reciever Operating Curve (Area Under Curve): 0.63


### Random Forest Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

In [15]:
# Fit the random-forest classifier on the classification split and store the result.
# Recorded train accuracy (0.89) is well above test accuracy (0.67), which points to overfitting.
ml_rfc = models.RFC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
ML_routines.persist_model(ml_rfc,"../models/proto/ml_rfc.pickle")

Accuracy Score (train): 0.89286
Accuracy Score (test): 0.67
F1: 0.7
Precision Score: 0.64
Reciever Operating Curve (Area Under Curve): 0.67


### Bagging Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier

In [16]:
# Fit the bagging classifier on the classification split and store the result.
ml_bc = models.BC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
ML_routines.persist_model(ml_bc,"../models/proto/ml_bc.pickle")

Accuracy Score (train): 0.68407
Accuracy Score (test): 0.65
F1: 0.68
Precision Score: 0.62
Reciever Operating Curve (Area Under Curve): 0.65


### AdaBoost Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier

In [9]:
# Fit the AdaBoost classifier on the classification split and store the result.
# NOTE(review): the recorded output contains many repeated warning fragments from the
# sample-weight update (`sample_weight *= np.exp(`) and reports 50 of 500 fits failing
# (traceback truncated); presumably part of the search grid in models.ABC_run is invalid
# or numerically unstable — confirm.
ml_abc = models.ABC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
ML_routines.persist_model(ml_abc,"../models/proto/ml_abc.pickle")

  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp

Accuracy Score (train): 0.61538
Accuracy Score (test): 0.65
F1: 0.6
Precision Score: 0.69
Reciever Operating Curve (Area Under Curve): 0.65


  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
  sample_weight *= np.exp(
  return fit_method(estimator, *args, **kwargs)
50 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/vscode/.local/lib/py

### Gradient Boosting Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier

In [13]:
# Fit the gradient-boosting classifier on the classification split and store the result.
# NOTE(review): the link above this cell points to scikit-learn's GradientBoostingClassifier,
# so "XGBC" presumably does not mean the XGBoost library — confirm in models.XGBC_run.
ml_xgbc = models.XGBC_run(Xclf_train, Xclf_test, yclf_train, yclf_test)
ML_routines.persist_model(ml_xgbc,"../models/proto/ml_xgbc.pickle")

Accuracy Score (train): 0.66758
Accuracy Score (test): 0.66
F1: 0.69
Precision Score: 0.63
Reciever Operating Curve (Area Under Curve): 0.66


## Functions to generate results

In [26]:
# Map a display label to element [1] of each model-run result. These dicts are passed to
# ML_routines.from_models_return_metrics in the following cells, so element [1] presumably
# holds the run's metric values — confirm in models.py.
# Only three regressors and three classifiers from the sections above are included.
rg_models_dict = {
    label: run_result[1]
    for label, run_result in (
        ('LASSO Regression', test_lasso),
        ('SVM Regression', ml_svr),
        ('Decision Tree Regression', ml_dtr),
    )
}

clf_models_dict = {
    label: run_result[1]
    for label, run_result in (
        ('Logistic Regression', test_logistic),
        ('SVM Classification', ml_svc),
        ('Decision Tree Classification', ml_dtc),
    )
}

In [28]:
# Tabulate the regression metrics, one row per model in rg_models_dict.
ML_routines.from_models_return_metrics(rg_models_dict,regression=True)

Unnamed: 0,R^2 Score Train,R^2 Score Test,MAE,MSE,MAPE
LASSO Regression,0.0,-8e-06,0.607139,1.005501,0.997761
SVM Regression,-0.007707,-0.022358,0.570794,1.027974,1.932022
Decision Tree Regression,0.13001,-0.000469,0.60023,1.005965,1.049851


In [29]:
# Tabulate the classification metrics, one row per model in clf_models_dict.
ML_routines.from_models_return_metrics(clf_models_dict,regression=False)

Unnamed: 0,Accuracy Train,Accuracy Test,F1 Score,Precision Score,ROC AUC
Logistic Regression,0.546703,0.494505,0.661765,0.494505,0.5
SVM Classification,0.491758,0.450549,0.479167,0.45098,0.451208
Decision Tree Classification,0.681319,0.626374,0.645833,0.607843,0.627053


In [30]:
# Map a display label to element [2] of each model-run result. The regression dict is passed
# to ML_routines.from_models_return_diebold_mariano together with y_test in the next cell, so
# element [2] presumably holds the run's test-set predictions — confirm in models.py.
rg_models_dm_dict = {
    label: run_result[2]
    for label, run_result in (
        ('LASSO Regression', test_lasso),
        ('SVM Regression', ml_svr),
        ('Decision Tree Regression', ml_dtr),
    )
}

clf_models_dm_dict = {
    label: run_result[2]
    for label, run_result in (
        ('Logistic Regression', test_logistic),
        ('SVM Classification', ml_svc),
        ('Decision Tree Classification', ml_dtc),
    )
}

In [32]:
# Pairwise Diebold-Mariano comparison of the regression models against y_test.
# The recorded result is a symmetric model-by-model table with zeros on the diagonal;
# the off-diagonal entries are presumably p-values — confirm in ML_routines.
ML_routines.from_models_return_diebold_mariano(rg_models_dm_dict,y_test)

Unnamed: 0,LASSO Regression,SVM Regression,Decision Tree Regression
LASSO Regression,0.0,0.443741,0.907901
SVM Regression,0.443741,0.0,0.38738
Decision Tree Regression,0.907901,0.38738,0.0
