# How to Use Decision Trees to Predict Equity Returns & Price Moves

### Loading Libraries

In [8]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import pandas_datareader.data as web

# Data Visualization
import graphviz
import seaborn as sns
from matplotlib import cm
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Warnings
import warnings

# OS & Path
import os, sys
from pathlib import Path

# SciPy
from scipy.stats import spearmanr

# Scikit-Learn
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, make_scorer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, _tree


# StatsModels
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

# Technical Analysis
from talib import RSI, BBANDS, MACD, NATR, ATR

# Utils
from utils import MultipleTimeSeriesCV

In [9]:
# Shorthand for building label-based slices (pandas IndexSlice helper)
idx = pd.IndexSlice

# Plain white seaborn theme for every figure below
sns.set_style('white')

# NOTE(review): this silences *all* warnings for the whole session, including
# deprecation notices from pandas / scikit-learn / seaborn — consider narrowing.
warnings.filterwarnings('ignore')

In [10]:
%matplotlib inline

In [11]:
# Directory that receives the exported tree diagrams (.dot files)
results_path = Path('results', 'decision_trees')

# exist_ok=True makes the cell idempotent without a separate exists() check
# (and without the check-then-create race that goes with it).
results_path.mkdir(parents=True, exist_ok=True)

### Loading Model Data

In [12]:
# Open the store read-only: HDFStore defaults to mode 'a', which requires write
# access and silently creates an empty data.h5 when the file is missing.
with pd.HDFStore('data.h5', mode='r') as store:
    data = store['us/equities/monthly']

data.info()

### Simple Regression Tree with Time Series Data

#### Generating Two Lags of Monthly Returns

In [14]:
# Outcome plus the latest monthly return and its one-step lag. The lag is
# shifted within each ticker group so values never cross from one ticker
# to the next.
lagged_return = data.groupby(level='ticker')['return_1m'].shift()
X2 = (data[['target', 'return_1m']]
      .rename(columns={'target': 'y', 'return_1m': 't-1'})
      .assign(**{'t-2': lagged_return})
      .dropna())
X2.info()

In [15]:
# Separate the outcome column from the two lagged-return features
y2 = X2['y']

X2 = X2.drop(columns='y')

### Exploring Data

In [16]:
# `distplot` is deprecated and slated for removal from seaborn; `histplot` on a
# density scale with a KDE overlay draws the equivalent figure.
ax = sns.histplot(y2, kde=True, stat='density')
ax.set_title('Target Distribution')
sns.despine()
plt.show()

#### Configuring Tree

In [17]:
# Regression tree on the two lagged returns.
# Two arguments of the earlier call are intentionally omitted because current
# scikit-learn rejects them: `min_impurity_split` (removed in 1.0) and
# `criterion='mse'` (renamed to 'squared_error'; the old spelling was removed
# in 1.2). Leaving `criterion` at its default selects mean squared error in
# every release, so the fitted tree is unchanged.
reg_tree_t2 = DecisionTreeRegressor(splitter='best',
                                    max_depth=6,
                                    min_samples_split=2,
                                    min_samples_leaf=50,
                                    min_weight_fraction_leaf=0.0,
                                    max_features=None,
                                    random_state=42,
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0)

#### Training Decision Tree

In [18]:
%%timeit
reg_tree_t2.fit(X=X2, y=y2)

In [19]:
reg_tree_t2.fit(X=X2, y=y2)

#### Tree Visualization

In [20]:
out_file = results_path / 'reg_tree_t2.dot'

# With `out_file` set, export_graphviz writes the DOT source to disk and
# returns None, so the diagram text is read back from the file for rendering.
# Feature names are passed as a plain list, which every scikit-learn release accepts.
export_graphviz(reg_tree_t2,
                out_file=out_file.as_posix(),
                feature_names=list(X2.columns),
                max_depth=2,
                filled=True,
                rounded=True,
                special_characters=True)
dot_data = out_file.read_text()

graphviz.Source(dot_data)

### Comparing with Linear Regression

#### `statsmodels OLS`

In [21]:
# add_constant appends the intercept column; sm.OLS does not add one on its own
ols_model = sm.OLS(endog=y2, exog=sm.add_constant(X2))

In [22]:
%%timeit
ols_model.fit()

In [23]:
result = ols_model.fit()

print(result.summary())

#### Sklearn Linear Regression

In [24]:
lin_reg = LinearRegression()

In [25]:
%%timeit
lin_reg.fit(X=X2,y=y2)

In [26]:
lin_reg.fit(X=X2,y=y2)

In [27]:
lin_reg.intercept_

In [28]:
lin_reg.coef_

### Linear Regression vs Regression Tree Decision Surfaces

In [29]:
# Evaluation grid: 100 x 100 points spanning the 1st to 99th percentile of each lag
t1_low, t1_high = X2['t-1'].quantile(.01), X2['t-1'].quantile(.99)
t2_low, t2_high = X2['t-2'].quantile(.01), X2['t-2'].quantile(.99)
t1, t2 = np.meshgrid(np.linspace(t1_low, t1_high, num=100),
                     np.linspace(t2_low, t2_high, num=100))

# One row per grid point with columns ordered (t-1, t-2)
X_data = np.column_stack((t1.ravel(), t2.ravel()))

In [30]:
# Predictions at every grid point, reshaped back to the mesh layout
ret1 = lin_reg.predict(X_data).reshape(t1.shape)
ret2 = reg_tree_t2.predict(X_data).reshape(t1.shape)

fig, axes = plt.subplots(ncols=2, figsize=(12,5))

# One filled-contour panel per model, each with its own colour bar
panels = zip(axes, (ret1, ret2), ('Linear Regression', 'Regression Tree'))
for ax, surface_values, title in panels:
    surface = ax.contourf(t1, t2, surface_values, cmap='Blues')
    plt.colorbar(mappable=surface, ax=ax)
    ax.set_xlabel('t-1')
    ax.set_ylabel('t-2')
    ax.set_title(title)

fig.suptitle('Decision Surfaces', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)
plt.show()

### Simple Classification Tree with Time Series Data

#### Loss Functions

In [31]:
def entropy(f):
    """Binary entropy of a class share ``f``, halved so that it peaks at 0.5.

    Parameters
    ----------
    f : float or array-like
        Share of one of the two classes, expected in [0, 1].

    Returns
    -------
    numpy scalar or ndarray
        ``(-f*log2(f) - (1-f)*log2(1-f)) / 2``, with the limit value 0 at
        ``f == 0`` and ``f == 1`` (the naive formula yields NaN there because
        ``0 * log2(0)`` evaluates to ``0 * -inf``).
    """
    f = np.asarray(f, dtype=float)
    # Evaluate quietly; the NaNs produced at the two endpoints are patched below
    with np.errstate(divide='ignore', invalid='ignore'):
        h = (-f*np.log2(f) - (1-f)*np.log2(1-f))/2
    pure = (f == 0) | (f == 1)
    # [()] turns a 0-d result back into a scalar; real arrays pass through as-is
    return np.where(pure, 0.0, h)[()]

In [32]:
def gini(f):
    """Gini impurity ``2 * f * (1 - f)`` for a class share ``f``.

    Works element-wise on scalars and NumPy arrays alike.
    """
    other_share = 1 - f
    return 2 * f * other_share

In [33]:
def misclassification_rate(f):
    """Return ``f`` where ``f <= 0.5`` and ``1 - f`` elsewhere (element-wise).

    That is the share of the smaller of two classes whose shares are
    ``f`` and ``1 - f``.
    """
    at_most_half = f <= .5
    return np.where(at_most_half, f, 1 - f)

In [34]:
# Evaluate the three loss functions on a fine grid of class shares
x = np.linspace(0, 1, 10000)

loss_curves = pd.DataFrame({'Gini': gini(x),
                            'Entropy': entropy(x),
                            'Misclassification Rate': misclassification_rate(x)},
                           index=x)
loss_curves.plot(title='Classification Loss Functions',
                 lw=2,
                 style=['-', '--', ':'])

sns.despine()
plt.tight_layout()
plt.show()

#### Comparing Computation Time

In [35]:
%%timeit
misclassification_rate(x)

In [36]:
%%timeit
gini(x)

In [37]:
%%timeit
entropy(x)

### Configuring Tree

In [38]:
# Classification tree on the two lagged returns.
# `min_impurity_split` is intentionally not passed: it was removed from
# scikit-learn in 1.0 and its former default (None) meant "not used", so
# dropping it leaves the fitted tree unchanged while keeping the cell runnable.
clf_tree_t2 = DecisionTreeClassifier(criterion='gini',
                                     splitter='best',
                                     max_depth=5,
                                     min_samples_split=1000,
                                     min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0,
                                     max_features=None,
                                     random_state=42,
                                     max_leaf_nodes=None,
                                     min_impurity_decrease=0.0,
                                     class_weight=None)

### Training Tree

In [39]:
# Binary label: 1 where the outcome is strictly positive, 0 otherwise
y_binary = y2.gt(0).astype(int)

y_binary.value_counts()

In [40]:
%%timeit
clf_tree_t2.fit(X=X2, y=y_binary)

In [41]:
clf_tree_t2.fit(X=X2, y=y_binary)

### Tree Visualization

In [42]:
out_file = results_path / 'clf_tree_t2.dot'

# With `out_file` set, export_graphviz writes the DOT source to disk and
# returns None, so the diagram text is read back from the file for rendering.
# Feature names are passed as a plain list, which every scikit-learn release accepts.
export_graphviz(clf_tree_t2,
                out_file=out_file.as_posix(),
                feature_names=list(X2.columns),
                class_names=['Down', 'Up'],
                max_depth=2,
                filled=True,
                rounded=True,
                special_characters=True)
dot_data = out_file.read_text()

graphviz.Source(dot_data)

### Comparing with Logistic Regression

#### Statsmodels

In [43]:
# add_constant appends the intercept column; sm.Logit does not add one on its own
log_reg_sm = sm.Logit(endog=y_binary, exog=sm.add_constant(X2))

In [44]:
%%timeit
log_reg_sm.fit(disp=False)

In [45]:
log_result = log_reg_sm.fit()

In [46]:
print(log_result.summary())

#### Sklearn

In [47]:
log_reg_sk = LogisticRegression()

In [48]:
%%timeit
log_reg_sk.fit(X=X2, y=y_binary)

In [49]:
log_reg_sk.fit(X=X2, y=y_binary)

In [50]:
log_reg_sk.coef_

### Decision Surfaces: Classifier Tree vs. Logistic Regression

In [51]:
fig, axes = plt.subplots(ncols=2, figsize=(12,5))

# Logistic Regression: predicted probability of class 1 at every grid point
ret1 = log_reg_sk.predict_proba(X_data)[:, 1].reshape(t1.shape)
surface1 = axes[0].contourf(t1, t2, ret1, cmap='Blues')
plt.colorbar(mappable=surface1, ax=axes[0])

# Classification Tree: predicted probability of class 1 at every grid point
ret2 = clf_tree_t2.predict_proba(X_data)[:, 1].reshape(t1.shape)
surface2 = axes[1].contourf(t1, t2, ret2, cmap='Blues')
plt.colorbar(mappable=surface2, ax=axes[1])

# Format plots
titles = ['Logistic Regression', 'Classification Tree']
for ax, title in zip(axes, titles):
    ax.set_xlabel('t-1')
    ax.set_ylabel('t-2')
    ax.set_title(title)

# Same title size as the other figures in this notebook
fig.suptitle('Decision Surfaces', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)
plt.show()

### Regression Tree with All Features

#### Train-Test Split

In [52]:
# Features: every column except the outcome, passed through get_dummies
features = data.drop(columns='target')
X = pd.get_dummies(features)

# Outcome
y = data['target']

In [53]:
# NOTE(review): train_test_split shuffles rows by default, so test rows are not
# guaranteed to come after the training rows in time — presumably acceptable for
# this illustration, but confirm before reading the scores as out-of-sample.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Configuring Tree

In [54]:
# Regression tree on the full feature set.
# Two arguments of the earlier call are intentionally omitted because current
# scikit-learn rejects them: `min_impurity_split` (removed in 1.0) and
# `criterion='mse'` (renamed to 'squared_error'; the old spelling was removed
# in 1.2). Leaving `criterion` at its default selects mean squared error in
# every release, so the fitted tree is unchanged.
regression_tree = DecisionTreeRegressor(splitter='best',
                                        max_depth=5,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None,
                                        random_state=42,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0)

#### Training Model

In [55]:
regression_tree.fit(X=X_train, y=y_train)

#### Visualizing Tree

In [56]:
out_file = results_path / 'reg_tree.dot'

# With `out_file` set, export_graphviz writes the DOT source to disk and
# returns None, so the diagram text is read back from the file for rendering.
# Feature names are passed as a plain list, which every scikit-learn release accepts.
export_graphviz(regression_tree,
                out_file=out_file.as_posix(),
                feature_names=list(X_train.columns),
                max_depth=3,
                filled=True,
                rounded=True,
                special_characters=True)
dot_data = out_file.read_text()

graphviz.Source(dot_data)

#### Evaluating Test Set

In [None]:
y_pred = regression_tree.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test))

In [None]:
r, p = spearmanr(y_pred, y_test)
print(f'{r*100:.2f} (p-value={p:.2%})')

### Classification Tree with All Features

#### Train-Test Split

In [57]:
# Binary label on the full data set: 1 where the outcome is strictly positive
y_binary = y.gt(0).astype(int)
y_binary.value_counts()

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

In [59]:
# Depth-limited Gini classifier; every other setting keeps the library default
clf_settings = dict(criterion='gini', max_depth=5, random_state=42)
clf = DecisionTreeClassifier(**clf_settings)

In [60]:
clf.fit(X=X_train, y=y_train)

#### Plotting Tree

In [61]:
out_file = results_path / 'clf_tree.dot'

# With `out_file` set, export_graphviz writes the DOT source to disk and
# returns None, so the diagram text is read back from the file for rendering.
# Feature names are passed as a plain list, which every scikit-learn release accepts.
export_graphviz(clf,
                out_file=out_file.as_posix(),
                feature_names=list(X.columns),
                class_names=['Down', 'Up'],
                max_depth=3,
                filled=True,
                rounded=True,
                special_characters=True)
dot_data = out_file.read_text()

graphviz.Source(dot_data)

#### Evaluating Test Set

In [62]:
y_score = clf.predict_proba(X=X_test)[:, 1]

In [63]:
roc_auc_score(y_score=y_score, y_true=y_test)

### Printing Decision Path

In [64]:
# Show the docstring of scikit-learn's low-level Tree class. It is imported from
# the private `sklearn.tree._tree` module purely for this inspection.
from sklearn.tree._tree import Tree
help(Tree)

Help on class Tree in module sklearn.tree._tree:

class Tree(builtins.object)
 |  Array-based representation of a binary decision tree.
 |
 |  The binary tree is represented as a number of parallel arrays. The i-th
 |  element of each array holds information about the node `i`. Node 0 is the
 |  tree's root. You can find a detailed description of all arrays in
 |  `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split
 |  nodes, resp. In this case the values of nodes of the other type are
 |  arbitrary!
 |
 |  Attributes
 |  ----------
 |  node_count : intp_t
 |      The number of nodes (internal nodes + leaves) in the tree.
 |
 |  capacity : intp_t
 |      The current capacity (i.e., size) of the arrays, which is at least as
 |      great as `node_count`.
 |
 |  max_depth : intp_t
 |      The depth of the tree, i.e. the maximum depth of its leaves.
 |
 |  children_left : array of intp_t, shape [node_count]
 |      children_left[i] holds the node id of the left chil

In [65]:
def tree_to_code(tree, feature_names):
    """Print a fitted decision tree as Python-like pseudo-code.

    Parameters
    ----------
    tree : DecisionTreeClassifier or DecisionTreeRegressor
        Fitted estimator whose ``tree_`` attribute is traversed.
    feature_names : sequence
        Names of the features, indexable by the feature indices stored in
        the tree.

    Returns
    -------
    None
        The pseudo-code is written to stdout. Thresholds and leaf values are
        formatted as percentages; for a classifier the leaf value is the share
        of the second class (index 1) in that leaf.

    Raises
    ------
    ValueError
        If ``tree`` is neither a classification nor a regression tree.
    """
    if isinstance(tree, DecisionTreeClassifier):
        model = 'clf'
    elif isinstance(tree, DecisionTreeRegressor):
        model = 'reg'
    else:
        raise ValueError('Need Regression or Classification Tree')

    tree_ = tree.tree_
    # Nodes without a split carry the TREE_UNDEFINED sentinel as feature index
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    # str() keeps the header printable when the names are not plain strings
    print("def tree({}):".format(", ".join(map(str, feature_names))))

    def recurse(node, depth):
        # Depth-first walk: left child is the `<=` branch, right child the `>` branch
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            # Trailing colon added so the `if` line matches the `else:` line below
            print(indent, f'if {name} <= {threshold:.2%}:')
            recurse(tree_.children_left[node], depth + 1)
            print(indent, f'else:  # if {name} > {threshold:.2%}')
            recurse(tree_.children_right[node], depth + 1)
        else:
            pred = tree_.value[node][0]
            # clf: normalise so the value is a share whether the node stores
            # counts or fractions; reg: the single stored node value
            val = pred[1]/sum(pred) if model == 'clf' else pred[0]
            print(indent, f'return {val:.2%}')
    recurse(0, 1)

In [66]:
tree_to_code(clf_tree_t2, X2.columns)

### Overfitting, Regularization & Parameter Tuning

#### Cross-validation parameters

In [67]:
# Settings handed to the project's MultipleTimeSeriesCV splitter
# (presumably period counts — confirm against utils.MultipleTimeSeriesCV)
n_splits, lookahead = 10, 1
train_period_length, test_period_length = 60, 6

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          lookahead=lookahead,
                          train_period_length=train_period_length,
                          test_period_length=test_period_length)

In [68]:
max_depths = range(1, 16)

### Finding The Best Trees using GridSearchCV

#### Define Parameter Grid

In [69]:
# `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a regression tree it meant "use all features", which None expresses in
# every release. For a classification tree 'auto' was an alias of 'sqrt', so
# the old grid evaluated 'sqrt' twice there; with None the classifier search
# now also tries all features.
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 10, 12, 15],
              'min_samples_leaf': [5, 25, 50, 100],
              'max_features': ['sqrt', None]}

#### Classification Tree

In [70]:
clf = DecisionTreeClassifier(random_state=42)

In [71]:
# Exhaustive search over `param_grid`, scored by ROC AUC on the `cv` splits;
# refit=True retrains the best configuration once the search has finished.
gridsearch_clf = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    refit=True,
    return_train_score=True,
    n_jobs=-1,
)

In [72]:
gridsearch_clf.fit(X=X, y=y_binary)

In [73]:
gridsearch_clf.best_params_

In [74]:
gridsearch_clf.best_score_

#### Defining Custom IC Score

In [75]:
def rank_correl(y, y_pred):
    """Spearman rank correlation between outcomes ``y`` and predictions ``y_pred``.

    Only the correlation coefficient is returned; the p-value is discarded.
    """
    rho, _p_value = spearmanr(y, y_pred)
    return rho

In [76]:
ic = make_scorer(rank_correl)

#### Regression Tree

In [77]:
reg_tree = DecisionTreeRegressor(random_state=42)

In [78]:
# Same search as for the classifier, scored with the custom rank-correlation
# scorer `ic`; refit=True retrains the best configuration afterwards.
gridsearch_reg = GridSearchCV(
    estimator=reg_tree,
    param_grid=param_grid,
    cv=cv,
    scoring=ic,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
)

In [79]:
gridsearch_reg.fit(X=X, y=y)

In [80]:
gridsearch_reg.best_params_

In [81]:
gridsearch_reg.best_score_

In [82]:
# Best hyperparameters of both searches, one column per model
best_params = {'Regression': gridsearch_reg.best_params_,
               'Classification': gridsearch_clf.best_params_}
pd.DataFrame({model: pd.Series(params) for model, params in best_params.items()})

### Classifier Cross-Validation

In [83]:
def get_leaves_count(tree):
    """Return the number of leaf nodes of a fitted tree estimator.

    Parameters
    ----------
    tree : estimator
        Object exposing ``tree_.children_left``, one entry per node.

    Returns
    -------
    int
        Count of nodes whose left-child id is -1, the marker this notebook
        treats as "no child", i.e. a leaf.
    """
    children_left = np.asarray(tree.tree_.children_left)
    # Vectorised count instead of a Python-level loop over node ids
    return int(np.count_nonzero(children_left == -1))

In [85]:
# Per-depth lists of fold results: train ROC AUC, validation ROC AUC, leaf count
train_scores, val_scores, leaves = {}, {}, {}

for max_depth in max_depths:
    print(max_depth, end=' ', flush=True)
    clf = DecisionTreeClassifier(criterion='gini',
                                 max_depth=max_depth,
                                 min_samples_leaf=5,
                                 max_features='sqrt',
                                 random_state=42)
    fold_train, fold_valid, fold_leaves = [], [], []
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_binary.iloc[train_idx], y_binary.iloc[test_idx]
        clf.fit(X=X_train, y=y_train)

        # Score with the predicted probability of class 1
        train_pred = clf.predict_proba(X=X_train)[:, 1]
        fold_train.append(roc_auc_score(y_true=y_train, y_score=train_pred))

        test_pred = clf.predict_proba(X=X_test)[:, 1]
        fold_valid.append(roc_auc_score(y_true=y_test, y_score=test_pred))

        fold_leaves.append(get_leaves_count(clf))

    train_scores[max_depth] = fold_train
    val_scores[max_depth] = fold_valid
    leaves[max_depth] = fold_leaves

# One column per depth, one row per CV fold
clf_train_scores = pd.DataFrame(train_scores)
clf_valid_scores = pd.DataFrame(val_scores)
clf_leaves = pd.DataFrame(leaves)

In [86]:
# Long format for plotting: one row per (depth, fold) score, tagged Train/Valid
clf_cv_data = pd.concat([
    scores.melt(var_name='Max. Depth', value_name='ROC AUC').assign(Data=label)
    for label, scores in (('Train', clf_train_scores),
                          ('Valid', clf_valid_scores))
])

### Regression Tree Cross-Validation

#### Running Cross-Validation

In [87]:
# Per-depth lists of fold results: train IC, validation IC, leaf count
train_scores, val_scores, leaves = {}, {}, {}

for max_depth in max_depths:
    print(max_depth, end=' ', flush=True)
    reg_tree = DecisionTreeRegressor(max_depth=max_depth,
                                     min_samples_leaf=50,
                                     max_features='sqrt',
                                     random_state=42)
    fold_train, fold_valid, fold_leaves = [], [], []
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        reg_tree.fit(X=X_train, y=y_train)

        # IC = Spearman rank correlation between predictions and outcomes
        train_pred = reg_tree.predict(X=X_train)
        fold_train.append(spearmanr(train_pred, y_train)[0])

        test_pred = reg_tree.predict(X=X_test)
        fold_valid.append(spearmanr(test_pred, y_test)[0])

        fold_leaves.append(get_leaves_count(reg_tree))

    train_scores[max_depth] = fold_train
    val_scores[max_depth] = fold_valid
    leaves[max_depth] = fold_leaves

# One column per depth, one row per CV fold
reg_train_scores = pd.DataFrame(train_scores)
reg_valid_scores = pd.DataFrame(val_scores)
reg_leaves = pd.DataFrame(leaves)

In [88]:
# Long format for plotting: one row per (depth, fold) score, tagged Train/Valid.
# Built with pd.concat because DataFrame.append was removed in pandas 2.0;
# this also mirrors how the classifier frame is assembled.
reg_cv_data = pd.concat([pd.melt(reg_train_scores,
                                 var_name='Max. Depth',
                                 value_name='IC').assign(Data='Train'),
                         pd.melt(reg_valid_scores,
                                 var_name='Max. Depth',
                                 value_name='IC').assign(Data='Valid')])

### Comparing CV Results

In [89]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 5))

# Left panel: regression tree IC by depth, with a 95% confidence band
sns.lineplot(data=reg_cv_data,
             x='Max. Depth', y='IC',
             hue='Data', ci=95,
             ax=axes[0], lw=2)

axes[0].set_title('Regression Tree')
# Vertical marker at the depth with the best mean validation score
axes[0].axvline(x=reg_valid_scores.mean().idxmax(), ls='--', c='k', lw=1)
axes[0].axhline(y=0, ls='--', c='k', lw=1)

# Right panel: classification tree ROC AUC by depth
sns.lineplot(data=clf_cv_data,
             x='Max. Depth', y='ROC AUC',
             hue='Data', ci=95,
             ax=axes[1], lw=2)

axes[1].set_title('Classification Tree')
axes[1].axvline(x=clf_valid_scores.mean().idxmax(), ls='--', c='k', lw=1)
axes[1].axhline(y=.5, ls='--', c='k', lw=1)

# The curves were computed for every depth in `max_depths`, not for the
# grid-search candidates in `param_grid`, so the x-axis is bounded by that
# range; using the grid's bounds clipped the shallowest depth.
for ax in axes:
    ax.set_xlim(min(max_depths), max(max_depths))

fig.suptitle('Train-Validation Scores', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.91)
plt.show()

### Learning Curves for Best Models

#### Classifier

In [90]:
sizes = np.arange(.1, 1.01, .1)

In [93]:
# Score the tuned classifier on growing fractions (`sizes`) of each training fold
lc_settings = dict(train_sizes=sizes,
                   cv=cv,
                   scoring='roc_auc',
                   n_jobs=-1,
                   shuffle=True,
                   random_state=42)
train_sizes, train_scores, valid_scores = learning_curve(
    gridsearch_clf.best_estimator_, X, y_binary, **lc_settings)

In [94]:
# Long format: one row per (train size, fold) score, tagged Train/Valid
clf_lc_data = pd.concat([
    pd.DataFrame(scores.T, columns=train_sizes)
      .melt(var_name='Train Size', value_name='ROC AUC')
      .assign(Data=label)
    for label, scores in (('Train', train_scores), ('Valid', valid_scores))
])

clf_lc_data.info()

#### Regression Tree

In [95]:
# Score the tuned regression tree on growing fractions (`sizes`) of each training fold
lc_settings = dict(train_sizes=sizes,
                   cv=cv,
                   scoring=ic,
                   n_jobs=-1,
                   shuffle=True,
                   random_state=42)
train_sizes, train_scores, valid_scores = learning_curve(
    gridsearch_reg.best_estimator_, X, y, **lc_settings)

In [96]:
# Long format: one row per (train size, fold) score, tagged Train/Valid
reg_lc_data = pd.concat([
    pd.DataFrame(scores.T, columns=train_sizes)
      .melt(var_name='Train Size', value_name='IC')
      .assign(Data=label)
    for label, scores in (('Train', train_scores), ('Valid', valid_scores))
])

reg_lc_data.info()

#### Comparing Learning Curves

In [97]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 5))
# Both panels share the x-range of the regression learning-curve data
xmin, xmax = reg_lc_data['Train Size'].min(), reg_lc_data['Train Size'].max()

# (data, metric column, panel title) for the left and right panel
panels = [(reg_lc_data, 'IC', 'Best Regression Tree'),
          (clf_lc_data, 'ROC AUC', 'Best Classification Tree')]

for ax, (lc_data, metric, title) in zip(axes, panels):
    sns.lineplot(data=lc_data,
                 x='Train Size', y=metric,
                 hue='Data', ci=95, ax=ax, lw=2)
    ax.set_title(title)
    ax.set_ylabel(metric)
    # Thousands separators, no decimals, on the train-size axis
    ax.xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.tick_params(axis='both', which='minor', labelsize=8)
    ax.set_xlim(xmin, xmax)

fig.suptitle('Learning Curves', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)
plt.show()

### Feature Importance

In [98]:
top_n = 15
# Display labels: underscores become spaces, everything upper-cased
labels = X.columns.str.replace('_', ' ').str.upper()

# Importances of the refit best estimators, largest first, truncated to top_n
clf_importances = pd.Series(gridsearch_clf.best_estimator_.feature_importances_,
                            index=labels)
fi_clf = clf_importances.sort_values(ascending=False).head(top_n)

reg_importances = pd.Series(gridsearch_reg.best_estimator_.feature_importances_,
                            index=labels)
fi_reg = reg_importances.sort_values(ascending=False).head(top_n)

In [99]:
fig, axes = plt.subplots(ncols=2, figsize=(12,4), sharex=True)
# One shade of blue per bar
color = cm.Blues(np.linspace(.4,.9, top_n))

# Classifier goes on the right-hand axis, regressor on the left
panels = ((axes[1], fi_clf, 'Classification Tree'),
          (axes[0], fi_reg, 'Regression Tree'))
for ax, importances, title in panels:
    # Ascending sort puts the most important feature at the top of the bar chart
    importances.sort_values().plot.barh(ax=ax, title=title, color=color)

for ax in axes:
    ax.set_xlabel('Feature Importance')

fig.suptitle(f'Top {top_n} Features', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)
plt.show()