<left>FINM 33160 - Machine Learning In Finance</left>
<left>Winter 2023</left>
<br>
<h1><center> Homework 2 </center></h1>
<center> Due - 23:59 [CST] January 29th, 2023</center>
<br>
<h3>Ki Hyun</h3>
<h3>Student ID: 12125881</h3>

<h2> Imports </h2>

In [48]:
%matplotlib inline

import numpy as np
import pandas as pd
pd.set_option('use_inf_as_na', True)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
from matplotlib import cm
from collections import Counter

import optuna
from optuna.trial import Trial
from functools import partial

import shap
sns.set()

<h2> Data </h2>

In [34]:
raw_data = pd.read_pickle(r'./dataset.pkl')
data = raw_data[raw_data['market_cap'] > 1000.0]
df = data.copy()
df.fillna(0.0,inplace=True)

df.reset_index(inplace=True)
df.set_index('date',inplace=True)
df.sort_index(inplace = True)

<h2> Helper-functions </h2>

In [8]:
def HW2_Q1_conditions(x):
    if x > 0.05:
        return 1
    elif x < -0.1:
        return -1
    else:
        return 0

In [14]:
def objective(trial:Trial,train=None,labels=None,val=None,val_labels=None,val_rets=None):

    t_min_samples_leaf = trial.suggest_int('min_samples_leaf',100,1200,step=100)
    t_max_depth = trial.suggest_int('max_depth',5,25,step=5)
    t_n_estimators = trial.suggest_int('n_estimators',5,50,step=5)


    t_clf = DecisionTreeClassifier(min_samples_leaf = t_min_samples_leaf,max_depth=t_max_depth,random_state=123)
    bg_clf = BaggingClassifier(t_clf,n_estimators=t_n_estimators,random_state=123,n_jobs=1)
    bg_clf.fit(train,labels)

    preds = bg_clf.predict(val)
    profit = (preds * val_rets).sum()

#     score = bg_clf.score(val,val_labels)

    return profit

In [15]:
def bagging_feat_importance(m, df):
    feature_importances = []
    for est in m.estimators_:
        fi = est.feature_importances_
        feature_importances.append(fi)
    feature_importances = np.array(feature_importances)

    return pd.DataFrame({'cols':train.columns, 'feat_imp':np.mean(feature_importances,axis=0)}
                       ).sort_values('feat_imp', ascending=False)

In [16]:
def plot_fi(fi): return fi.plot('cols', 'feat_imp', 'barh', figsize=(12,7), legend=False)

In [47]:
def model(features):
    tree_features = features[features.columns[:-1].values]

    pred = bg_clf.predict(tree_features)

    ret = pred * features[features.columns[-1]]

    return ret

<h2> Q1 </h2>

<h3> Inserting a column </h3>

In [35]:
df['HW2_Q1'] = df['pred_rel_return'].apply(HW2_Q1_conditions)

<h3> Train, Validation and Test Sets </h3>

In [54]:
df_train = df.loc['2007-01-01':'2009-12-31']
df_valid = df.loc['2010-04-01':'2010-07-01']
df_test = df.loc['2010-01-01':'2018-12-31']

train = df_train.reset_index().drop(['ticker','date',
                                     'next_period_return',
                                     'spy_next_period_return',
                                     'HW2_Q1','pred_rel_return',
                                     'return', 'cum_ret', 'spy_cum_ret'],axis=1)


valid = df_valid.reset_index().drop(['ticker','date',
                                     'next_period_return',
                                     'spy_next_period_return',
                                     'HW2_Q1','pred_rel_return',
                                     'return', 'cum_ret', 'spy_cum_ret'],axis=1)

test = df_test.reset_index().drop(['ticker','date',
                                     'next_period_return',
                                     'spy_next_period_return',
                                     'HW2_Q1','pred_rel_return',
                                     'return', 'cum_ret', 'spy_cum_ret'],axis=1)

train_stock_returns = df_train['next_period_return']
valid_stock_returns = df_valid['next_period_return']
test_stock_returns = df_test['next_period_return']

y_train = df_train['HW2_Q1']
y_valid = df_valid['HW2_Q1']
y_test = df_test['HW2_Q1']

y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values

In [37]:
scaler = StandardScaler()

float_vars = [x for x in train.columns if df[x].dtype == 'float64']

train_norm = train.copy()
valid_norm = valid.copy()
test_norm = test.copy()

train_norm[float_vars] = scaler.fit_transform(train[float_vars])
valid_norm[float_vars] = scaler.transform(valid[float_vars])
test_norm[float_vars] = scaler.transform(test[float_vars])

<h3> Optimization </h3>

In [38]:
study = optuna.create_study(direction="maximize")

[32m[I 2023-01-29 23:05:33,455][0m A new study created in memory with name: no-name-e4fcf3b0-893b-421d-b965-766fa80e026b[0m


In [39]:
%%time
study.optimize(partial(objective, train = train_norm, labels = y_train, val = valid_norm, val_labels = y_valid,
                       val_rets = valid_stock_returns.values), n_trials=200,n_jobs=-1)

[32m[I 2023-01-29 23:06:00,862][0m Trial 4 finished with value: -38.704815000000025 and parameters: {'min_samples_leaf': 1100, 'max_depth': 15, 'n_estimators': 15}. Best is trial 4 with value: -38.704815000000025.[0m
[32m[I 2023-01-29 23:06:12,473][0m Trial 5 finished with value: -36.540779000000036 and parameters: {'min_samples_leaf': 800, 'max_depth': 5, 'n_estimators': 20}. Best is trial 5 with value: -36.540779000000036.[0m
[32m[I 2023-01-29 23:06:18,570][0m Trial 3 finished with value: -36.39827300000003 and parameters: {'min_samples_leaf': 500, 'max_depth': 5, 'n_estimators': 20}. Best is trial 3 with value: -36.39827300000003.[0m
[32m[I 2023-01-29 23:06:20,741][0m Trial 7 finished with value: -39.56227800000004 and parameters: {'min_samples_leaf': 1100, 'max_depth': 10, 'n_estimators': 30}. Best is trial 3 with value: -36.39827300000003.[0m
[32m[I 2023-01-29 23:06:30,776][0m Trial 11 finished with value: -37.25211600000003 and parameters: {'min_samples_leaf': 400, 

CPU times: user 1h 55min 59s, sys: 2min 5s, total: 1h 58min 5s
Wall time: 18min 18s


In [40]:
study.best_params

{'min_samples_leaf': 100, 'max_depth': 15, 'n_estimators': 5}

<h3> Decision Tree Classifier </h3>

In [43]:
t_clf = DecisionTreeClassifier(**{'min_samples_leaf': 100, 'max_depth': 15},random_state=123)

<h3> Bagging Classifier </h3>

In [44]:
bg_clf = BaggingClassifier(t_clf,n_estimators=5,random_state=123)

<h3> Train </h3>

In [45]:
bg_clf.fit(valid_norm,y_valid)

<h3> Back Test </h3>

In [46]:
pred_test = bg_clf.predict(test_norm)
(pred_test * df_test['next_period_return'].values).sum()

593.6424799999998

<h3> Shapley Values </h3>

In [57]:
fi = bagging_feat_importance(bg_clf,train_norm)

In [58]:
features = fi[(fi['feat_imp'] > 0.0)]

In [59]:
cols = features['cols'].values

In [60]:
float_vars = [x for x in train[cols].columns if data[x].dtype == 'float64']

In [61]:
train_red_norm = train.copy()
valid_red_norm = valid.copy()
test_red_norm = test.copy()

In [62]:
train_red_norm[float_vars] = scaler.fit_transform((train[cols])[float_vars])
valid_red_norm[float_vars] = scaler.transform((valid[cols])[float_vars])
test_red_norm[float_vars] = scaler.transform((test[cols])[float_vars])

In [63]:
valid_1_norm = valid_red_norm.copy()
valid_1_norm['rets'] = df_valid['next_period_return'].values

In [64]:
explainer = shap.explainers.Permutation(model,valid_1_norm)

In [None]:
shap_values = explainer(valid_1_norm,max_evals=2000)

In [None]:
shap_cols = cols[np.abs(shap_values[:,:-1].values).mean(axis=0)>0.000]

In [None]:
data = df.copy()
start_dates = [pd.to_datetime('2007-01-01') + pd.DateOffset(months = 3 * i) for i in range(57)]
end_dates = [d + pd.DateOffset(months = 36) for d in start_dates]

training_frames = [data.loc[d:d+pd.DateOffset(months = 36)] for d in start_dates]
valid_frames = [data.loc[d + pd.DateOffset(months=3):d+pd.DateOffset(months = 6)] for d in end_dates]
test_frames = [data.loc[d + pd.DateOffset(months=6):d+pd.DateOffset(months = 9)] for d in end_dates]

training_data = [d.reset_index().drop
                                 (['ticker','date',
                                   'next_period_return',
                                   'spy_next_period_return',
                                   'rel_performance','pred_rel_return',
                                  'return', 'cum_ret', 'spy_cum_ret'],axis=1) for d in training_frames]
valid_data = [d.reset_index().drop(['ticker','date',
                                   'next_period_return',
                                   'spy_next_period_return',
                                   'rel_performance','pred_rel_return',
                                  'return', 'cum_ret', 'spy_cum_ret'],axis=1) for d in valid_frames]
test_data = [d.reset_index().drop(['ticker','date',
                                   'next_period_return',
                                   'spy_next_period_return',
                                   'rel_performance','pred_rel_return',
                                  'return', 'cum_ret', 'spy_cum_ret'],axis=1) for d in test_frames]

training_labels = [d['rel_performance'].values for d in training_frames]
validation_labels = [d['rel_performance'].values for d in test_frames]

In [None]:
scalers = [StandardScaler() for _ in range(len(training_data))]

opt_training_data = [pd.DataFrame(scalers[i].fit_transform(training_frames[i][shap_cols].values),
                                  columns=shap_cols) for i in range(len(training_data))]
opt_valid_data = [pd.DataFrame(scalers[i].transform(valid_frames[i][shap_cols].values),
                               columns=shap_cols) for i in range(len(valid_data))]
opt_test_data = [pd.DataFrame(scalers[i].transform(test_frames[i][shap_cols].values),
                              columns=shap_cols) for i in range(len(test_data))]

In [None]:
x = [1]
ret = []

for i in range(len(start_dates)-1):
        bg_clf.fit(opt_training_data[i],training_labels[i])

        preds = bg_clf.predict(opt_test_data[i])
        profit_i = (preds*test_frames[i]['next_period_return']).sum()
        ret.append(profit_i)
        num_names = len(opt_test_data[i])
        x.append(x[i] + (x[i]/num_names)*profit_i)

In [None]:
SPY = pd.read_pickle(r'./SPY_cum_ret.pkl')
SPY = SPY.loc['2010-01-01':'2018-12-31']
SPY = SPY.resample('Q').ffill()
SPY['spy_cum_ret'] = (SPY['spy_cum_ret'] - SPY['spy_cum_ret'][0]+1)
SPY['strategy'] = x

In [None]:
SPY = SPY.resample('Y').ffill()

In [None]:
strategy_mean_ret = (SPY['strategy'] - 1).diff().mean()
strategy_std = (SPY['strategy'] - 1).diff().std()
strategy_sr = strategy_mean_ret/strategy_std
print('Strategy Sharpe Ratio: ',strategy_sr)

In [None]:
strategy_ret = (SPY['strategy'] - 1).diff().values[1:]
spy_ret = (SPY['spy_cum_ret'] - 1).diff().values[1:]
beta = (np.cov(spy_ret,strategy_ret)/np.var(spy_ret))[1,0]
residual_ret = strategy_ret - beta * spy_ret
IR = np.mean(residual_ret)/np.std(residual_ret)
print('Strategy Information Ratio: ', IR)

In [None]:
alpha = np.mean(residual_ret)
print('Strategy Alpha: ', alpha)

<h2> Q2 </h2>

In [None]:
x = [1]
ret = []

for i in range(len(start_dates)-1):
        bg_clf.fit(opt_training_data[i],training_labels[i])
        temp = bg_clf.predict_proba(opt_test_data[i])
        conviction = temp[:, 2] - temp[:, 0]
        weights = conviction/np.absolute(conviction).sum()
        profit_i = (weights*test_frames[i]['next_period_return']).sum()
        ret.append(profit_i)
        num_names = len(opt_test_data[i])
        x.append(x[i] + (x[i]/num_names)*profit_i)

In [None]:
SPY['strategy'] = x

In [None]:
SPY = SPY.resample('Y').ffill()

In [None]:
strategy_mean_ret = (SPY['strategy'] - 1).diff().mean()
strategy_std = (SPY['strategy'] - 1).diff().std()
strategy_sr = strategy_mean_ret/strategy_std
print('Strategy Sharpe Ratio: ',strategy_sr)

In [None]:
strategy_ret = (SPY['strategy'] - 1).diff().values[1:]
spy_ret = (SPY['spy_cum_ret'] - 1).diff().values[1:]
beta = (np.cov(spy_ret,strategy_ret)/np.var(spy_ret))[1,0]
residual_ret = strategy_ret - beta * spy_ret
IR = np.mean(residual_ret)/np.std(residual_ret)
print('Strategy Information Ratio: ', IR)

In [None]:
alpha = np.mean(residual_ret)
print('Strategy Alpha: ', alpha)