Machine learning prototyping notebook. Data preprocessing has already been tested and implemented in data_preproc.py (samples/). 

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# IMPORT FUNCTIONS
sys.path.insert(0, '../sample')
import data_preproc
import ML_routines
import models

# LOAD FINANCIAL RATIOS AND ASSET PRICES
test_merge = pd.read_excel('../jupyter-notebooks/test_manual.xlsx')
test_merge = test_merge.loc[:, test_merge.columns != 'Unnamed: 0']
test_assets = pd.read_excel('../jupyter-notebooks/asset_prices.xlsx',index_col='Date')

# PREPROCESS FINANCIAL RATIOS DATA, REPLACE STRINGS WITH FLOATS
ML_data = test_merge.map(data_preproc.convert_placeholder_text_to_num)

# ENSURE THE TWO DATAFRAMES CONTAINING FINANCIAL RATIOS (ML_DATA) AND RETURNS (TEST_ASSETS) HAVE THE SAME ASSETS/TICKERS
ML_final = data_preproc.filter_ratios_returns(ML_data,test_assets)
# print(ML_final.head())

# RESAMPLE THE RETURNS FROM MONTHLY TO QUARTERLY, THEN BFILL AND FFILL
asset_prices = test_assets # MAKE A COPY
asset_prices.index = pd.to_datetime(asset_prices.index)
asset_prices = asset_prices.resample('Q').last()
asset_prices = asset_prices.bfill(axis=1)
asset_prices = asset_prices.ffill(axis=1)


# 
test = data_preproc.FRatioMLdata(ML_final,asset_prices,sector=None,returns_lead_by=2)#-1)
#test.transform()
#print(test.train.head())

In [2]:
# transform the data into ML compatible format

test.transform()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
2,-0.026975,-0.004681,-0.004744,-0.004536,0.480392,0.000000,0.000000,0.000000,0.050000
3,0.258930,-2.478155,4.193577,-0.806291,1.000000,-0.309524,-0.600000,-0.427141,0.041667
4,-0.475836,-0.002556,-0.002633,-0.002629,-1.864407,0.000000,0.000000,0.000000,-0.153732
5,-0.293669,-0.008364,-0.541817,0.229405,-0.662857,-0.475000,4.000000,0.163001,0.155340
6,0.825410,-0.002780,-0.002607,-0.002793,-0.064171,0.000000,0.000000,0.000000,-0.036178
...,...,...,...,...,...,...,...,...,...
7,0.049659,-1.467892,-0.094140,-0.572862,0.232558,0.146119,0.000000,-0.341260,0.100000
8,-0.026540,3.187525,0.038230,0.513707,0.653846,0.531469,0.000000,0.034440,0.030928
9,-0.029439,-0.762979,0.221575,0.009796,0.000000,-0.089172,0.000000,-0.117264,0.010417
10,-0.051483,-4.752607,0.241513,0.375513,-0.037037,0.154412,0.100000,0.075935,0.185185


In [3]:
# test the dataframe shuffling procedure. Ultimately, probably better to do this by invoking shuffle directly, rather than as a method of the object.
# test.shuffle()

In [4]:
# visualise the dataframe after shuffling

#test = test.train)
data_rg = shuffle(test.train,random_state=0)

In [5]:
data_rg

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.087670,-0.133416,-2.333333,0.175000,-0.166667,0.333333,0.575757
10,-0.209946,0.005086,0.005019,0.005256,-0.243590,0.000000,0.000000,0.000000,0.050450
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.050000,-0.666667,0.000000
11,0.000000,-0.756467,-0.024635,-0.002249,-0.220290,0.085039,0.000000,-0.098667,-0.024042
5,0.129530,-0.949155,-3.442810,0.033613,-0.025641,0.461538,0.444444,-0.124476,0.000000
...,...,...,...,...,...,...,...,...,...
5,0.018908,-0.055898,-1.656752,-0.117811,0.227273,-1.259259,2.333333,0.063901,0.333333
10,-0.163981,0.005127,0.005139,0.005117,-0.224490,0.000000,0.000000,0.000000,0.184783
9,0.011868,-1.302406,-1.609848,-0.074476,-0.097561,-0.581081,0.000000,-0.075949,-0.016667
7,-0.003843,-0.464031,-0.222056,-0.209207,0.229167,0.041397,-0.363636,0.117647,-0.001678


## Converting between returns and trend prediction

In [6]:
data_clf = ML_routines.convert_regression_to_classification(data_rg)

In [7]:
data_clf.head()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
9,0.257514,-1.889561,0.08767,-0.133416,-2.333333,0.175,-0.166667,0.333333,1
10,-0.209946,0.005086,0.005019,0.005256,-0.24359,0.0,0.0,0.0,1
5,0.364062,-6.677251,-0.008029,0.172071,0.069444,-0.048872,-0.05,-0.666667,1
11,0.0,-0.756467,-0.024635,-0.002249,-0.22029,0.085039,0.0,-0.098667,0
5,0.12953,-0.949155,-3.44281,0.033613,-0.025641,0.461538,0.444444,-0.124476,1


In [8]:
data_clf.iloc[:,-1].head()

9     1
10    1
5     1
11    0
5     1
Name: Returns, dtype: int64

# ML methods

## Load pretrained models or run them

In [2]:
rg_models_list = [
    'LASSO',
    'ml_svr',
    'ml_dtr',
    'ml_br',
    'something that doesnt exist yet'
] # list containing desired models

lag = 2

In [3]:
ML_routines.return_models_not_in_folder(rg_models_list,'../models/proto',1)

Missing models:

something that doesnt exist yet


{'LASSO': [0.0,
  -7.832587750167264e-06,
  0.6071385084750027,
  1.005501214101306,
  0.9977612129181752],
 'ml_svr': [-0.007707253711481732,
  -0.022357534377815735,
  0.5707938468251097,
  1.0279736903683765,
  1.9320220570680278],
 'ml_dtr': [0.13000956935558983,
  -0.000469106332092073,
  0.600229759414848,
  1.0059650217784604,
  1.0498505993306246],
 'ml_br': [0.14532471991691265,
  0.0977383112625887,
  0.566125415757059,
  0.907218117597047,
  1.7520858539592736]}

## Implement models

In [9]:
X_train, X_test, y_train, y_test =  ML_routines.gen_train_test(data_rg,regression=True)

In [10]:
# check data balance
number_down_days = data_clf['Returns'][data_clf['Returns'] == 0].count()
number_up_days = data_clf['Returns'][data_clf['Returns'] == 1].count()

In [11]:
number_down_days

211

In [12]:
number_up_days

244

In [13]:
Xclf_train, Xclf_test, yclf_train, yclf_test =  ML_routines.gen_train_test(data_clf,regression=False)

In [21]:
!pip install scikeras

Defaulting to user installation because normal site-packages is not writeable
Collecting scikeras
  Downloading scikeras-0.11.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.11.0
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
from scikeras.wrappers import KerasClassifier

In [20]:
import keras
from keras.models import Sequential
from keras.layers import Dense

#Create Sequential model with Dense layers, using the add method
model = Sequential()

#Dense implements the operation:
#        output = activation(dot(input, kernel) + bias)
#Units are the dimensionality of the output space for the layer,
#     which equals the number of hidden units
#Activation and loss functions may be specified by strings or classes
model.add(Dense(units=64, activation='relu', input_dim=8))
model.add(Dense(units=1, activation='softmax'))

#The compile method configures the model’s learning process
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

#The fit method does the training in batches
# x_train and y_train are Numpy arrays --just like in the Scikit-Learn API.
model.fit(X_train, y_train, epochs=5, batch_size=32)

#The evaluate method calculates the losses and metrics
#     for the trained model
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=128)

#The predict method applies the trained model to inputs
#     to generate outputs
classes = model.predict(X_test, batch_size=128)

Epoch 1/5


  return dispatch_target(*args, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [53]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

def create_sequential(input_dim, hidden_layers, neurons, activation, dropout_rate, optimizer):
    model = Sequential()
    model.add(Dense(units=neurons, activation=activation, input_dim=input_dim))

    for i in range(hidden_layers):
        model.add(Dense(units=neurons, activation=activation))
        model.add(Dropout(dropout_rate))

    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [54]:
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier, KerasRegressor

input_dim = X_train.shape[1]
hidden_layers = [1, 2, 3]
neurons = [32, 64, 128]
activation = ['relu']#, 'sigmoid']
dropout_rate = [0.1, 0.2, 0.3]
optimizer = ['adam', 'sgd']

param_grid = dict(hidden_layers=hidden_layers, neurons=neurons, activation=activation, dropout_rate=dropout_rate, optimizer=optimizer)


In [56]:
model = KerasClassifier(model=create_sequential, input_dim=input_dim)

grid = GridSearchCV(model,param_grid, cv=3,scoring='acuracy')

In [57]:
grid_result = grid.fit(X_train, y_train)

InvalidParameterError: The 'scoring' parameter of GridSearchCV must be a str among {'roc_auc_ovr_weighted', 'precision_macro', 'recall', 'adjusted_rand_score', 'neg_negative_likelihood_ratio', 'neg_mean_gamma_deviance', 'average_precision', 'f1', 'normalized_mutual_info_score', 'neg_log_loss', 'rand_score', 'completeness_score', 'f1_micro', 'f1_samples', 'roc_auc_ovo', 'precision', 'top_k_accuracy', 'v_measure_score', 'precision_samples', 'neg_root_mean_squared_error', 'jaccard', 'r2', 'neg_brier_score', 'neg_mean_squared_error', 'explained_variance', 'fowlkes_mallows_score', 'accuracy', 'neg_mean_absolute_error', 'jaccard_macro', 'precision_weighted', 'recall_micro', 'roc_auc_ovo_weighted', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_log_error', 'jaccard_weighted', 'homogeneity_score', 'mutual_info_score', 'jaccard_samples', 'neg_median_absolute_error', 'recall_samples', 'jaccard_micro', 'positive_likelihood_ratio', 'f1_macro', 'roc_auc_ovr', 'roc_auc', 'balanced_accuracy', 'adjusted_mutual_info_score', 'matthews_corrcoef', 'recall_macro', 'neg_mean_poisson_deviance', 'recall_weighted', 'f1_weighted', 'precision_micro', 'max_error'}, a callable, an instance of 'list', an instance of 'tuple', an instance of 'dict' or None. Got 'acuracy' instead.

In [None]:
def create_sequential(input_dim, hidden_layers, neurons, activation, dropout_rate, optimizer):
    model = Sequential()
    model.add(Dense(units=neurons, activation=activation, input_dim=input_dim))

    for i in range(hidden_layers):
        model.add(Dense(units=neurons, activation=activation))
        model.add(Dropout(dropout_rate))

    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
import tensorflow as tf

# fix random seed for reproducibility
tf.random.set_seed(0)

In [80]:
def get_clf(hidden_layers, neurons, dropout):
    #n_features_in_ = meta["n_features_in_"]
    #n_classes_ = meta["n_classes_"]
    model = keras.models.Sequential()

    # define an input layer with dim 8 (8 financial ratios)
    model.add(keras.layers.Input(shape=(8,)))
    
    for i in range(hidden_layers):
        model.add(keras.layers.Dense(units=neurons, activation="relu"))
        model.add(keras.layers.Dropout(dropout))
        
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
    return model


In [81]:
clf = KerasClassifier(
    model=get_clf,
    loss="binary_crossentropy",
    optimizer="adam",
    optimizer__learning_rate=0.1,
    model__hidden_layers = 1,#[1, 2, 3],
    model__neurons = 32,#[32, 64, 128],
    model__dropout=0.5,
    verbose=False,
)

In [82]:
params = {
    'optimizer__learning_rate': [0.05, 0.1],
    'model__hidden_layers': [1, 2, 3],
    'model__neurons': [32, 64, 128],
    'model__dropout': [0, 0.5],
}

gs = GridSearchCV(clf, params, scoring='accuracy',cv=5, n_jobs=-1, verbose=False)

gs.fit(Xclf_train, yclf_train)

print(gs.best_score_, gs.best_params_)

0.5631278538812785 {'model__dropout': 0.5, 'model__hidden_layers': 3, 'model__neurons': 128, 'optimizer__learning_rate': 0.05}


In [83]:
gs.score(Xclf_test, yclf_test)

0.4945054945054945

In [63]:
gs.score(Xclf_train, yclf_train)

0.532967032967033

## Functions to generate results

In [26]:
rg_models_dict = {
    'LASSO Regression': test_lasso[1],
    'SVM Regression': ml_svr[1],
    'Decision Tree Regression': ml_dtr[1]
}

clf_models_dict = {
    'Logistic Regression': test_logistic[1],
    'SVM Classification': ml_svc[1],
    'Decision Tree Classification': ml_dtc[1]
}

In [28]:
ML_routines.from_models_return_metrics(rg_models_dict,regression=True)

Unnamed: 0,R^2 Score Train,R^2 Score Test,MAE,MSE,MAPE
LASSO Regression,0.0,-8e-06,0.607139,1.005501,0.997761
SVM Regression,-0.007707,-0.022358,0.570794,1.027974,1.932022
Decision Tree Regression,0.13001,-0.000469,0.60023,1.005965,1.049851


In [29]:
ML_routines.from_models_return_metrics(clf_models_dict,regression=False)

Unnamed: 0,Accuracy Train,Accuracy Test,F1 Score,Precision Score,ROC AUC
Logistic Regression,0.546703,0.494505,0.661765,0.494505,0.5
SVM Classification,0.491758,0.450549,0.479167,0.45098,0.451208
Decision Tree Classification,0.681319,0.626374,0.645833,0.607843,0.627053


In [30]:
rg_models_dm_dict = {
    'LASSO Regression': test_lasso[2],
    'SVM Regression': ml_svr[2],
    'Decision Tree Regression': ml_dtr[2]
}

clf_models_dm_dict = {
    'Logistic Regression': test_logistic[2],
    'SVM Classification': ml_svc[2],
    'Decision Tree Classification': ml_dtc[2]
}

In [32]:
ML_routines.from_models_return_diebold_mariano(rg_models_dm_dict,y_test)

Unnamed: 0,LASSO Regression,SVM Regression,Decision Tree Regression
LASSO Regression,0.0,0.443741,0.907901
SVM Regression,0.443741,0.0,0.38738
Decision Tree Regression,0.907901,0.38738,0.0
