Machine learning prototyping notebook. Data preprocessing has already been tested and implemented in data_preproc.pu (samples/). 

In [33]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

import plotly.express as px
import matplotlib.pyplot as plt
import yfinance as yf

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import RegressorChain
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, PredictionErrorDisplay, accuracy_score

Decision tree regressorGet the data, split into exo/endo and perform train/test split. Get the data, split into exo/endo and perform train/test split.

In [21]:
import os
import sys
import pandas as pd
import numpy as np

# IMPORT FUNCTIONS
sys.path.insert(0, '../sample')
import data_preproc

# LOAD FINANCIAL RATIOS AND ASSET PRICES
test_merge = pd.read_excel('../jupyter-notebooks/test_manual.xlsx')
test_merge = test_merge.loc[:, test_merge.columns != 'Unnamed: 0']
test_assets = pd.read_excel('../jupyter-notebooks/asset_prices.xlsx',index_col='Date')

# PREPROCESS FINANCIAL RATIOS DATA, REPLACE STRINGS WITH FLOATS
ML_data = test_merge.map(data_preproc.convert_placeholder_text_to_num)

# ENSURE THE TWO DATAFRAMES CONTAINING FINANCIAL RATIOS (ML_DATA) AND RETURNS (TEST_ASSETS) HAVE THE SAME ASSETS/TICKERS
ML_final = data_preproc.filter_ratios_returns(ML_data,test_assets)
# print(ML_final.head())

# RESAMPLE THE RETURNS FROM MONTHLY TO QUARTERLY, THEN BFILL AND FFILL
asset_prices = test_assets # MAKE A COPY
asset_prices.index = pd.to_datetime(asset_prices.index)
asset_prices = asset_prices.resample('Q').last()
asset_prices = asset_prices.bfill(axis=1)
asset_prices = asset_prices.ffill(axis=1)


# 
test = data_preproc.FRatioMLdata(ML_final,asset_prices,sector=None,returns_lead_by=-1)
#test.transform()
#print(test.train.head())

In [22]:
test = test.transform()

In [3]:
test

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
0,-0.360803,3.447233,6.393835,2.463991,-0.617978,-0.194444,0.000000,0.000000,0.041667
1,-0.251257,-0.184708,-0.630011,0.362394,-0.410596,0.862069,0.000000,0.023301,-0.153732
2,-0.026975,-0.004681,-0.004744,-0.004536,0.480392,0.000000,0.000000,0.000000,0.155340
3,0.258930,-2.478155,4.193577,-0.806291,1.000000,-0.309524,-0.600000,-0.427141,-0.036178
4,-0.475836,-0.002556,-0.002633,-0.002629,-1.864407,0.000000,0.000000,0.000000,0.148515
...,...,...,...,...,...,...,...,...,...
7,0.049659,-1.467892,-0.094140,-0.572862,0.232558,0.146119,0.000000,-0.341260,0.185185
8,-0.026540,3.187525,0.038230,0.513707,0.653846,0.531469,0.000000,0.034440,0.000000
9,-0.029439,-0.762979,0.221575,0.009796,0.000000,-0.089172,0.000000,-0.117264,-0.058140
10,-0.051483,-4.752607,0.241513,0.375513,-0.037037,0.154412,0.100000,0.075935,0.011765


In [42]:
def convert_returns_to_category(element):
    if element>= 0:
        element = 1
    if element < 0:
        element = 0
    return element

def convert_regression_to_classification(dataframe):
    '''
    Given a FRatioMLdata object i.e. [ratio_1 ... ratio_n returns], convert the returns column to:
    1 - if return >= 0
    0 - if return < 0
    '''

    df = dataframe.copy()

    df['Returns'] = df['Returns'].map(convert_returns_to_category)
    return df

def gen_train_test(dataframe,regression=True):
    '''
    Need to account for different cases of regression vs classification
    dataframe - 
    regression - 
    '''

    X = dataframe.iloc[:,:-1]
    y = dataframe.iloc[:,-1]
    
    # scale the data
    data_scaler_x = StandardScaler()
    X = data_scaler_x.fit_transform(X.values)

    if regression is True:
        data_scaler_y = StandardScaler()
        y = data_scaler_y.fit_transform(y.values.reshape(-1,1))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 0)
    return X_train, X_test, y_train, y_test

In [43]:
test_classification = convert_regression_to_classification(test)

In [44]:
test.head()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
0,-0.360803,3.447233,6.393835,2.463991,-0.617978,-0.194444,0.0,0.0,0.041667
1,-0.251257,-0.184708,-0.630011,0.362394,-0.410596,0.862069,0.0,0.023301,-0.153732
2,-0.026975,-0.004681,-0.004744,-0.004536,0.480392,0.0,0.0,0.0,0.15534
3,0.25893,-2.478155,4.193577,-0.806291,1.0,-0.309524,-0.6,-0.427141,-0.036178
4,-0.475836,-0.002556,-0.002633,-0.002629,-1.864407,0.0,0.0,0.0,0.148515


In [45]:
test_classification.head()

Unnamed: 0,EV,FCF,EBITDA,Revenue,ROE,Gross-Profit-Margin,Quick-Ratio,Debt / Equity,Returns
0,-0.360803,3.447233,6.393835,2.463991,-0.617978,-0.194444,0.0,0.0,1
1,-0.251257,-0.184708,-0.630011,0.362394,-0.410596,0.862069,0.0,0.023301,0
2,-0.026975,-0.004681,-0.004744,-0.004536,0.480392,0.0,0.0,0.0,1
3,0.25893,-2.478155,4.193577,-0.806291,1.0,-0.309524,-0.6,-0.427141,0
4,-0.475836,-0.002556,-0.002633,-0.002629,-1.864407,0.0,0.0,0.0,1


In [46]:
test_classification.iloc[:,-1]

0     1
1     0
2     1
3     0
4     1
     ..
7     1
8     1
9     0
10    1
11    0
Name: Returns, Length: 546, dtype: int64

In [52]:
# gen_train_test(test,regression=True)
X_train, X_test, y_train, y_test =  gen_train_test(test_classification,regression=False)

In [53]:
# attempt SVM
grid = {
    'kernel': ['linear','poly','rbf','sigmoid'],
    'C': [0.01,0.1,1,10], 
}

SVC_cv = GridSearchCV(estimator=SVC(), param_grid=grid,cv=5)
SVC_cv.fit(X_train,np.ravel(y_train))

In [54]:
SVC_cv.best_params_

{'C': 10, 'kernel': 'rbf'}

In [55]:
svc = SVC(C=1, kernel='poly').fit(X_train,y_train)

# get predicted values (out of sample performance)
y_pred_scaled = svc.predict(X_test)
y_pred = y_pred_scaled#data_scaler_y.inverse_transform(y_pred_scaled.reshape(-1,1))

print(f'R^2 error (train): {np.round(svc.score(X_train, y_train),5)}')
print(f'R^2 error (test): {np.round(svc.score(X_test, y_test),5)}')

AS = accuracy_score(y_test, y_pred)
print(f'Accuracy score: {np.round(AS, 2)}')


R^2 error (train): 0.52982
R^2 error (test): 0.44545
Accuracy score: 0.45


In [None]:
# attempt decision tree classifier

grid = {
    'max_features': ['sqrt', 'log2',None],
    'max_depth' : [3,4,5,6,7,8, None],
    'ccp_alpha': list(np.logspace(-2, 3, 6)),
    'random_state' : [0]
}

DTR_cv = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=grid,cv=5)
DTR_cv.fit(X_train,y_train)