# Long-Short Strategy, Part 4: How to interpret GBM results

### Loading Libraries

In [3]:
# Numerical Computing
import numpy as np
from random import randint

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import FuncFormatter


# Path
from pathlib import Path

# Warnings
import warnings

# Itertools & Joblib
import joblib
from itertools import product

# Feature Relevancy
import shap

# LightGBM
import lightgbm as lgb

# Scikit-Learn
from sklearn.inspection import PartialDependenceDisplay

In [4]:
# Seed NumPy's global RNG so NumPy-driven randomness is repeatable. Note that
# this does not seed the stdlib `random` module that `randint` is imported from.
np.random.seed(42)

# Shorthand for building MultiIndex slices, e.g. idx[:, '2013':'2018'].
idx = pd.IndexSlice

# Apply seaborn's 'darkgrid' theme to subsequent figures.
sns.set_style('darkgrid')

# Silence every warning from here on (no category or module filter is given).
warnings.filterwarnings('ignore')

In [5]:
# HDF5 store opened by get_data(); the path is relative, so it is resolved
# against the current working directory.
DATA_STORE = Path('../data/assets.h5')

### Retrieving Best Model Parameters

In [6]:
# with pd.HDFStore('data.h5') as store:
#     best_params = store['best_params']

In [7]:
# best_params

### Getting Training Data

In [8]:
# data = pd.read_hdf('data.h5', 'model_data').sort_index()
# data = data.loc[idx[:, '2013':'2018'], :]

In [9]:
# data.info()

### Creating Binary Data

In [10]:
# dates = sorted(data.index.get_level_values('date').unique())

In [11]:
# train_dates = dates[-int(best_params.train_length):]

In [12]:
# data = data.loc[idx[:, train_dates], :]

In [13]:
# labels = sorted(data.filter(like='_fwd').columns)

# features = data.columns.difference(labels).tolist()

In [14]:
# lookahead = 1

# label = f'r{lookahead:02}_fwd'

In [15]:
# Column names meant to be treated as categorical features (they are passed as
# `categorical_feature` in the commented-out lgb.Dataset cell).
categoricals = ['year', 'month', 'sector', 'weekday']

In [16]:
# lgb_train = lgb.Dataset(data=data[features],
#                        label=data[label],
#                        categorical_feature=categoricals,
#                        free_raw_data=False)

### Training LightGBM Model

In [20]:
# Base LightGBM parameters: 'gbdt' boosting with a regression objective.
# verbose=-1 keeps LightGBM's own logging to a minimum. Tuned values are merged
# in by the (currently commented-out) `params.update(...)` cell.
params = dict(boosting='gbdt', objective='regression', verbose=-1)

In [21]:
# Names of the hyperparameters that the commented-out update cell copies from
# `best_params` into `params`.
train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']

In [23]:
# params.update(best_params.loc[train_params].to_dict())

# for p in ['min_data_in_leaf', 'num_leaves']:
#     params[p] = int(params[p])

In [24]:
params

{'boosting': 'gbdt', 'objective': 'regression', 'verbose': -1}

In [25]:
# lgb_model = lgb.train(params=params,
#                   train_set=lgb_train,
#                   num_boost_round=int(best_params.boost_rounds))

### Computing Feature Importance

In [26]:
def get_feature_importance(model, importance_type='split'):
    """Return per-feature importance as shares of the total.

    Parameters
    ----------
    model
        Object exposing ``feature_importance(importance_type=...)`` and
        ``feature_name()`` (e.g. a trained LightGBM booster).
    importance_type : str
        Forwarded unchanged to ``model.feature_importance``.

    Returns
    -------
    pd.Series
        Importance values indexed by feature name, each divided by the sum
        of all values.
    """
    raw_scores = model.feature_importance(importance_type=importance_type)
    names = model.feature_name()
    scores = pd.Series(raw_scores, index=names)
    total = scores.sum()
    return scores / total

In [27]:
# feature_importance = (get_feature_importance(lgb_model).to_frame('Split').
#                       join(get_feature_importance(lgb_model, 'gain').to_frame('Gain')))

In [28]:
# (feature_importance
#  .nlargest(20, columns='Gain')
#  .sort_values('Gain', ascending=False)
#  .plot
#  .bar(subplots=True,
#       layout=(2, 1),
#       figsize=(14, 6),
#       legend=False,
#       sharey=True,
#       rot=0))
# plt.suptitle('Normalized Importance (Top 20 Features)', fontsize=14)
# plt.tight_layout()
# plt.subplots_adjust(top=.9);
# plt.show()

### Partial Dependence Plots

In [29]:
class OneStepTimeSeriesSplit:
    """Empty placeholder class: it defines no attributes or methods.

    NOTE(review): presumably kept so that the pickled model loaded in the
    commented-out ``joblib.load`` cell can resolve this name -- confirm.
    """

In [31]:
# gb_clf = joblib.load('results/baseline/sklearn_gbm_model.joblib')

In [32]:
def get_data(start='2000', end='2018', holding_period=1, dropna=False):
    """Load engineered features and derive a binary target.

    Reads the ``engineered_features`` table from the HDF5 store at
    ``DATA_STORE``, optionally restricts it to a range on the second index
    level, and splits it into a label vector and a feature matrix.

    Parameters
    ----------
    start, end : str or None
        Bounds of the label slice applied to the second index level
        (presumably the date level -- confirm against the stored frame).
        ``None`` leaves that side open; when both are ``None`` no filter is
        applied.
    holding_period : int
        Selects the target column ``target_{holding_period}m``.
    dropna : bool
        When true, rows containing any missing value are dropped before the
        target is computed.

    Returns
    -------
    y : pd.Series
        1 where the selected target column is greater than 0, otherwise 0.
        A missing target compares as not greater than 0 and so becomes 0
        unless ``dropna`` removed the row.
    X : pd.DataFrame
        Every column whose name does not start with ``'target'``.

    Raises
    ------
    KeyError
        If the store has no ``engineered_features`` key or the frame has no
        matching target column.
    """
    target = f'target_{holding_period}m'

    # Open read-only: the default mode ('a') would create an empty file when
    # the path is missing and needs write access just to read the data.
    with pd.HDFStore(DATA_STORE, mode='r') as store:
        df = store['engineered_features']

    # Filter when at least one bound is given; a None bound is an open end.
    # (Previously a single bound was silently ignored.)
    if start is not None or end is not None:
        df = df.loc[pd.IndexSlice[:, start:end], :]
    if dropna:
        df = df.dropna()

    y = (df[target] > 0).astype(int)
    target_cols = [c for c in df.columns if c.startswith('target')]
    X = df.drop(columns=target_cols)
    return y, X

In [33]:
def factorize_cats(df, cats=('sector',)):
    """Integer-encode categorical columns and fill gaps with -1, in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``year``, ``month``, ``age``, ``msize`` and every
        column named in ``cats``. It is modified in place.
    cats : iterable of str, or str
        Columns to replace with integer codes from ``pd.factorize``. A single
        column name may be given as a plain string.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, so that existing references to it observe
        the encoded columns as well.
    """
    # Immutable default plus explicit normalisation: avoids a shared mutable
    # default and lets callers pass a tuple, any iterable, or a single name.
    cats = [cats] if isinstance(cats, str) else list(cats)
    cat_cols = ['year', 'month', 'age', 'msize'] + cats
    for cat in cats:
        # factorize returns (codes, uniques); missing values get code -1.
        df[cat] = pd.factorize(df[cat])[0]
    # Missing values in the remaining categorical columns get the same -1.
    df.loc[:, cat_cols] = df.loc[:, cat_cols].fillna(-1)
    return df

In [34]:
# y_clean, features_clean = get_data(dropna=True)

# X = factorize_cats(features_clean).drop(['year', 'month'], axis=1)

#### 2D Partial Dependence

In [35]:
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# X_ = features_clean
# plot_partial_dependence(
#     estimator=gb_clf,
#     X=X_,
#     features=['return_12m', 'return_6m', 'CMA', ('return_12m', 'return_6m')],
#     percentiles=(0.05, 0.95),
#     n_jobs=-1,
#     n_cols=2,
#     response_method='decision_function',
#     grid_resolution=250,
#     ax=axes)

# for i, j in product([0, 1], repeat=2):
#     if i!=1 or j!= 0:
#         axes[i][j].xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 

# axes[1][1].yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 

# axes[0][0].set_ylabel('Partial Dependence')
# axes[1][0].set_ylabel('Partial Dependence')
# axes[0][0].set_xlabel('12-Months Return')
# axes[0][1].set_xlabel('6-Months Return')
# axes[1][0].set_xlabel('Conservative Minus Aggressive')

# axes[1][1].set_xlabel('12-Month Return')
# axes[1][1].set_ylabel('6-Months Return')
# fig.suptitle('Partial Dependence Plots', fontsize=16)
# fig.tight_layout()
# fig.subplots_adjust(top=.95);
# plt.show()

#### 3D Partial Dependence

In [36]:
# sns.set_style('whitegrid')
# targets = ['return_12m', 'return_6m']
# pdp, axes = partial_dependence(estimator=gb_clf,
#                                features=targets,
#                                X=X_,
#                                grid_resolution=100)

# XX, YY = np.meshgrid(axes[0], axes[1])
# Z = pdp[0].reshape(list(map(np.size, axes))).T

# fig = plt.figure(figsize=(14, 8))
# ax = Axes3D(fig)
# surface = ax.plot_surface(XX, YY, Z,
#                           rstride=1,
#                           cstride=1,
#                           cmap=plt.cm.BuPu,
#                           edgecolor='k')
# ax.set_xlabel('12-Month Return')
# ax.set_ylabel('6-Month Return')
# ax.set_zlabel('Partial Dependence')
# ax.view_init(elev=22, azim=30)
# ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 
# ax.xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 

# # fig.colorbar(surface)
# fig.suptitle('Partial Dependence by 6- and 12-month Returns', fontsize=16)
# fig.tight_layout();
# plt.show()

### SHAP Values

#### Summary Plot

In [37]:
# X = data[features].sample(n=1000)

In [38]:
# # load JS visualization code to notebook
# shap.initjs()

# # explain the model's predictions using SHAP values
# explainer = shap.TreeExplainer(lgb_model)
# shap_values = explainer.shap_values(X=X)

# shap.summary_plot(shap_values, X, show=False)
# plt.tight_layout();
# plt.show()

In [40]:
# shap.summary_plot(shap_values, X, plot_type="bar",show=False)
# plt.tight_layout();
# plt.show()

### Feature Interaction

#### Force Plots

In [41]:
# i = randint(0, len(X))

# # Visualize the explanation for the randomly selected prediction `i`
# shap.force_plot(explainer.expected_value, shap_values[i,:], X.iloc[i,:])

In [42]:
# shap.force_plot(explainer.expected_value, shap_values[:1000,:], X.iloc[:1000])

#### Interaction Plot

In [43]:
# shap.dependence_plot(ind='r01',
#                      shap_values=shap_values,
#                      features=X,
#                      interaction_index='r05',
#                      title='Interaction between 1- and 5-Day Returns')