# Adaptive and Gradient Boosting

### Loading Libraries

In [4]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import FuncFormatter

# OS & Time
import sys, os
from time import time

# IterTools
from itertools import product

# Warnings
import warnings

# Joblib
import joblib

# Path
from pathlib import Path

# Extreme Gradient Boosting (XGBoost)
from xgboost import XGBClassifier

# Light Gradient Boosting Machine (LightGBM)
from lightgbm import LGBMClassifier

# Categorical Boosting (CatBoost)
from catboost import CatBoostClassifier

# Scikit-Learn
from sklearn.metrics import roc_auc_score
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
# needed for HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.inspection import partial_dependence, plot_partial_dependence
from sklearn.inspection import PartialDependenceDisplay, partial_dependence
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier

In [6]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from utils import format_time

In [7]:
# Directory that holds the cached cross-validation results and fitted models
# written by the cells below.
results_path = Path('results', 'baseline')

# `exist_ok=True` already makes this a no-op when the directory is present, so
# no separate existence check is needed; it also fails loudly if something that
# is not a directory occupies the path.
results_path.mkdir(parents=True, exist_ok=True)

In [8]:
np.random.seed(42)

idx = pd.IndexSlice

sns.set_style("whitegrid")

warnings.filterwarnings('ignore')

### Preparing Data

#### Getting Source

In [9]:
DATA_STORE = '../data/assets.h5'

In [10]:
def get_data(start='2000', end='2018', task='classification', holding_period=1, dropna=False):
    """Load the engineered features and derive a binary label from a target column.

    Parameters
    ----------
    start, end : str or None
        Label bounds applied to the second index level through ``.loc``
        slicing. Either one may be ``None`` to leave that side open; when both
        are ``None`` no filtering is done.
    task : str
        Not used by this function; kept so existing calls remain valid.
    holding_period : int
        Selects the label source column ``target_{holding_period}m``.
    dropna : bool
        When true, rows containing any missing value are dropped before the
        label is built.

    Returns
    -------
    tuple
        ``(y, X)`` where ``y`` is 1 if the selected target column is greater
        than zero and 0 otherwise (missing targets therefore become 0 unless
        ``dropna`` removed them), and ``X`` is the frame without any column
        whose name starts with ``'target'``.
    """
    target = f'target_{holding_period}m'

    # Read-only open: the default append mode needs write access and would
    # create an empty store when the path is wrong.
    with pd.HDFStore(DATA_STORE, mode='r') as store:
        df = store['engineered_features']

    # Honour a single bound as well; previously the filter was skipped
    # entirely unless both bounds were given.
    if start is not None or end is not None:
        df = df.loc[pd.IndexSlice[:, start:end], :]
    if dropna:
        df = df.dropna()

    y = (df[target] > 0).astype(int)
    X = df.drop(columns=[c for c in df.columns if c.startswith('target')])
    return y, X

### Factorize Categories

In [11]:
cat_cols = ['year', 'month', 'age', 'msize', 'sector']

In [12]:
def factorize_cats(df, cats=['sector']):
    """Return a copy of ``df`` with its categorical columns integer-encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'year', 'month', 'age', 'msize' and every column named in
        ``cats``. It is not modified.
    cats : sequence of str
        Columns replaced by their ``pd.factorize`` codes (missing values get
        code -1). The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'year', 'month', 'age', 'msize' and the ``cats``
        columns are integers, with missing values encoded as -1.
    """
    cats = list(cats)
    cat_cols = ['year', 'month', 'age', 'msize'] + cats

    # Work on a copy so the caller's frame keeps its original values.
    df = df.copy()
    for cat in cats:
        df[cat] = pd.factorize(df[cat])[0]

    # Assign column by column: a `.loc[:, cols] = ...` assignment may write
    # into the existing float columns and drop the integer cast.
    for col in cat_cols:
        df[col] = df[col].fillna(-1).astype(int)
    return df

### One-Hot Encoding

In [13]:
def get_one_hot_data(df, cols=cat_cols[:-1]):
    """One-hot encode ``cols`` and the 'sector' column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cols`` and a 'sector' column.
    cols : sequence of str
        Columns to encode with a ``<col>_`` prefix. The default is the
        notebook-level ``cat_cols`` without its last entry, captured when the
        function is defined.

    Returns
    -------
    pandas.DataFrame
        Encoded frame. Dummies built from ``cols`` are named ``<col>_<value>``;
        'sector' dummies carry no prefix, so they are named after the value
        alone. A trailing '.0' (left by float-valued categories) is removed
        from column names.
    """
    cols = list(cols)
    df = pd.get_dummies(df,
                        columns=cols + ['sector'],
                        prefix=cols + [''],
                        prefix_sep=['_'] * len(cols) + [''])
    # Strip only a trailing '.0'; replacing every occurrence would also
    # rewrite names that merely contain '.0' somewhere in the middle.
    return df.rename(columns=lambda c: c[:-2] if c.endswith('.0') else c)

### Getting Holdout Set

In [14]:
def get_holdout_set(target, features, period=6):
    """Split ``target``/``features`` by date into a training and a holdout part.

    Parameters
    ----------
    target : pandas.Series
        Named label series whose index has a level called 'date' as its
        second level.
    features : pandas.DataFrame
        Feature frame sharing the index of ``target``.
    period : int
        Controls the holdout size: the holdout spans the last ``period + 1``
        unique dates and training covers everything before them.

    Returns
    -------
    tuple
        ``(y_train, X_train, y_test, X_test)``.
    """
    idx = pd.IndexSlice
    label = target.name

    # Take the dates from the `target` argument; this used to read the
    # notebook-level `y`, which ignored whatever series was passed in.
    dates = target.index.get_level_values('date').unique().sort_values()
    cv_start, cv_end = dates[0], dates[-period - 2]
    holdout_start, holdout_end = dates[-period - 1], dates[-1]

    df = features.join(target.to_frame())

    train = df.loc[idx[:, cv_start: cv_end], :]
    y_train, X_train = train[label], train.drop(label, axis=1)

    test = df.loc[idx[:, holdout_start: holdout_end], :]
    y_test, X_test = test[label], test.drop(label, axis=1)
    return y_train, X_train, y_test, X_test

### Loading Data

In [16]:
y, features = get_data()

X_dummies = get_one_hot_data(features)
X_factors = factorize_cats(features)

In [17]:
X_factors.info()

In [18]:
y_clean, features_clean = get_data(dropna=True)

X_dummies_clean = get_one_hot_data(features_clean)
X_factors_clean = factorize_cats(features_clean)

### Cross-Validation Setup

#### Custom Time Series KFold Generator

In [19]:
class OneStepTimeSeriesSplit:
    """Walk-forward splitter yielding ``(train_idx, test_idx)`` pairs.

    Assumes the index of ``X`` contains a level labeled 'date'. The most
    recent ``n_splits * test_period_length`` unique dates are cut into test
    windows of ``test_period_length`` dates each, newest window first; every
    window is paired with all rows dated strictly before it.
    """

    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
        # n_splits: number of test windows to generate
        # test_period_length: number of unique dates per test window
        # shuffle: randomly permute the order of the training positions
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.shuffle = shuffle

    @staticmethod
    def chunks(l, n):
        """Yield consecutive slices of ``l`` holding at most ``n`` items."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def split(self, X, y=None, groups=None):
        """Yield positional train/test indices for each test window.

        ``y`` and ``groups`` are accepted for interface compatibility and are
        not used.
        """
        unique_dates = (X.index
                        .get_level_values('date')
                        .unique()
                        .sort_values(ascending=False)
                        [:self.n_splits * self.test_period_length])

        # After reset_index the row labels are positions 0..n-1, so the labels
        # selected below are positional indices into X.
        dates = X.reset_index()[['date']]
        for test_date in self.chunks(unique_dates, self.test_period_length):
            train_idx = dates[dates['date'] < min(test_date)].index
            test_idx = dates[dates['date'].isin(test_date)].index
            if self.shuffle:
                # Reorder the index itself; shuffling `list(train_idx)` only
                # permuted a throwaway copy and left train_idx untouched.
                train_idx = train_idx[np.random.permutation(len(train_idx))]
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits; all arguments are ignored."""
        return self.n_splits

In [20]:
cv = OneStepTimeSeriesSplit(n_splits=12, 
                            test_period_length=1, 
                            shuffle=False)

In [21]:
run_time = {}

### CV Metrics

In [22]:
# Scorer name -> display label. The keys are what run_cv passes to
# cross_validate as `scoring`; the values are the panel titles and row labels
# used by stack_results / plot_result.
# Note that the 'balanced_accuracy' scorer is displayed simply as 'Accuracy'.
metrics = {'balanced_accuracy': 'Accuracy' ,
           'roc_auc': 'AUC',
           'neg_log_loss': 'Log Loss',
           'f1_weighted': 'F1',
           'precision_weighted': 'Precision',
           'recall_weighted': 'Recall'
}

In [24]:
def run_cv(clf, X=X_dummies, y=y, metrics=metrics, cv=cv, fit_params=None, n_jobs=-1):
    """Cross-validate ``clf`` and time the whole run.

    The defaults for ``X``, ``y``, ``metrics`` and ``cv`` are the notebook
    objects of the same name as they exist when this cell is executed.

    Returns
    -------
    tuple
        ``(scores, duration)``: the dictionary returned by ``cross_validate``
        (train scores included) and the elapsed wall-clock time in seconds.
    """
    t0 = time()
    cv_scores = cross_validate(estimator=clf,
                               X=X,
                               y=y,
                               # every key of `metrics` is used as a scorer name
                               scoring=list(metrics),
                               cv=cv,
                               return_train_score=True,
                               n_jobs=n_jobs,
                               verbose=1,
                               fit_params=fit_params)
    elapsed = time() - t0
    return cv_scores, elapsed

### CV Result Handler Functions

In [25]:
def stack_results(scores):
    """Reshape a ``cross_validate`` score dictionary into long format.

    Each key is split at its first underscore into a (Dataset, Metric) pair,
    e.g. 'test_roc_auc' -> ('test', 'roc_auc'). The first two columns are then
    dropped by position (presumably the fit/score timing entries - this
    relies on the key order of ``scores``).

    Returns
    -------
    pandas.DataFrame
        Columns 'Dataset' (capitalised), 'Metric' (display label looked up in
        the notebook-level ``metrics`` mapping) and 'Value'.
    """
    header = pd.MultiIndex.from_tuples(
        [tuple(key.split('_', 1)) for key in scores],
        names=['Dataset', 'Metric'])
    per_fold = np.array(list(scores.values())).T

    wide = pd.DataFrame(data=per_fold, columns=header)
    score_columns = wide.iloc[:, 2:]

    results = pd.melt(score_columns, value_name='Value')
    results['Metric'] = results['Metric'].apply(lambda name: metrics.get(name))
    results['Dataset'] = results['Dataset'].str.capitalize()
    return results

In [26]:
def plot_result(df, model=None, fname=None):
    """Draw train/test box plots per metric and annotate each panel with means.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format scores with columns 'Dataset', 'Metric' and 'Value', as
        produced by ``stack_results``.
    model : str, optional
        Figure title.
    fname : str or path-like, optional
        When given, the figure is saved there at 300 dpi. Missing parent
        directories are created first.
    """
    m = list(metrics.values())
    g = sns.catplot(x='Dataset',
                    y='Value',
                    hue='Dataset',
                    col='Metric',
                    data=df,
                    col_order=m,
                    order=['Train', 'Test'],
                    kind="box",
                    col_wrap=3,
                    sharey=False,
                    height=4, aspect=1.2)

    # Mean per metric and dataset, ordered like the panels (col_order=m), so
    # panel i can be annotated with row m[i].
    means = df.groupby(['Metric', 'Dataset']).Value.mean().unstack().loc[m]
    for i, ax in enumerate(g.axes.flat):
        s = f"Train: {means.loc[m[i], 'Train']:>7.4f}\nTest:  {means.loc[m[i], 'Test'] :>7.4f}"
        ax.text(0.05, 0.85, s, fontsize=10, transform=ax.transAxes,
                bbox=dict(facecolor='white', edgecolor='grey', boxstyle='round,pad=0.5'))
    g.fig.suptitle(model, fontsize=16)
    g.fig.subplots_adjust(top=.9)
    if fname:
        # savefig does not create directories; without this a call such as
        # fname='figures/<algo>_cv_result' fails when 'figures' is missing.
        Path(fname).parent.mkdir(parents=True, exist_ok=True)
        g.savefig(fname, dpi=300)

### Baseline Classifier

In [27]:
dummy_clf = DummyClassifier(strategy='stratified',
                            random_state=42)

In [28]:
algo = 'dummy_clf'

In [30]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    dummy_cv_result, run_time[algo] = run_cv(dummy_clf)
    joblib.dump(dummy_cv_result, fname)
else:
    dummy_cv_result = joblib.load(fname)

In [31]:
dummy_result = stack_results(dummy_cv_result)
dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [32]:
plot_result(dummy_result, model='Dummy Classifier')
plt.show()

### RandomForest

#### Configuring

In [33]:
# Random forest benchmark with library-default tree settings, spelled out
# explicitly.
# Two arguments were adjusted so the call is accepted by both older and current
# scikit-learn releases without changing the model:
#   * max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is no
#     longer accepted by current releases.
#   * min_impurity_split=None (its old default) is no longer passed because
#     the parameter has been removed from the estimator.
rf_clf = RandomForestClassifier(n_estimators=100,
                                criterion='gini',
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features='sqrt',
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                bootstrap=True,
                                oob_score=True,
                                n_jobs=-1,
                                random_state=42,
                                verbose=1)

#### Cross-Validation

In [34]:
algo = 'random_forest'

In [36]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    rf_cv_result, run_time[algo] = run_cv(rf_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(rf_cv_result, fname)
else:
    rf_cv_result = joblib.load(fname)

#### Plotting Results

In [37]:
rf_result = stack_results(rf_cv_result)

rf_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [39]:
plot_result(rf_result, model='Random Forest')
plt.show()

## Scikit-Learn: `AdaBoost`

### Base Estimator

In [41]:
# Weak learner for AdaBoost: a depth-1 tree (decision stump) whose leaves must
# hold at least 20 samples.
# min_impurity_split=None (its old default) is no longer passed: the parameter
# has been removed from current scikit-learn and omitting it does not change
# the tree on older releases.
base_estimator = DecisionTreeClassifier(criterion='gini',
                                        splitter='best',
                                        max_depth=1,
                                        min_samples_split=2,
                                        min_samples_leaf=20,
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None,
                                        random_state=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        class_weight=None)

### AdaBoost Configuration

In [43]:
# AdaBoost over the decision stump defined above.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` between scikit-learn releases; the first
# positional slot is the weak learner under either name.
# `algorithm` is left at the library default: that is 'SAMME.R' on the releases
# that offer it, while releases that dropped 'SAMME.R' reject the explicit
# value.
ada_clf = AdaBoostClassifier(base_estimator,
                             n_estimators=100,
                             learning_rate=1.0,
                             random_state=42)

#### Cross-Validating

In [44]:
algo = 'adaboost'

In [45]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    ada_cv_result, run_time[algo] = run_cv(ada_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(ada_cv_result, fname)
else:
    ada_cv_result = joblib.load(fname)

#### Plotting Results

In [46]:
ada_result = stack_results(ada_cv_result)

ada_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [47]:
plot_result(ada_result, model='AdaBoost')
plt.show()

## Scikit-Learn: `HistGradientBoostingClassifier`

### Configuring

In [48]:
# Histogram-based gradient boosting with the main knobs written out.
# `loss` is left at the library default, which selects the binary
# cross-entropy / log loss for a two-class target; the explicit name
# 'binary_crossentropy' is rejected by current scikit-learn releases.
gb_clf = HistGradientBoostingClassifier(learning_rate=0.1,          # regulates the contribution of each tree
                                        max_iter=100,               # number of boosting stages
                                        min_samples_leaf=20,
                                        max_depth=None,
                                        random_state=None,
                                        max_leaf_nodes=31,           # opt value depends on feature interaction
                                        warm_start=False,
#                                         early_stopping=True,
#                                         scoring='loss',
#                                         validation_fraction=0.1,
#                                         n_iter_no_change=None,
                                        verbose=0,
                                        tol=0.0001)

#### Cross-Validating

In [50]:
algo = 'sklearn_gbm'

In [51]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    gb_cv_result, run_time[algo] = run_cv(gb_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(gb_cv_result, fname)
else:
    gb_cv_result = joblib.load(fname)

#### Plotting Results

In [52]:
gb_result = stack_results(gb_cv_result)

gb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [53]:
plot_result(gb_result, model='Gradient Boosting Classifier')
plt.show()

### Partial Dependence Plots

In [54]:
X_ = X_factors_clean.drop(['year', 'month'], axis=1)

In [55]:
fname = results_path / f'{algo}_model.joblib'

if not Path(fname).exists():
    gb_clf.fit(y=y_clean, X=X_)
    joblib.dump(gb_clf, fname)
else:
    gb_clf = joblib.load(fname)

In [56]:
# Mean Accuracy
gb_clf.score(X=X_, y=y_clean)

In [57]:
y_score = gb_clf.predict_proba(X_)[:, 1]

roc_auc_score(y_score=y_score, y_true=y_clean)

#### One-Way & Two-Way Partial Dependence Plots

In [58]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# Three one-way plots plus one two-way plot, drawn into the 2x2 grid above.
# PartialDependenceDisplay is the plotting entry point this notebook imports;
# the previously used plot_partial_dependence is not in scope (its import is
# commented out), so the call failed with a NameError.
PartialDependenceDisplay.from_estimator(
    estimator=gb_clf,
    X=X_,
    features=['return_12m', 'return_6m', 'CMA', ('return_12m', 'return_6m')],
    percentiles=(0.05, 0.95),
    n_jobs=-1,
    n_cols=2,
    response_method='decision_function',
    grid_resolution=250,
    ax=axes)

# Percent formatting on the x-axis of every panel except the bottom-left one.
for i, j in product([0, 1], repeat=2):
    if i != 1 or j != 0:
        axes[i][j].xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# In the two-way panel the y-axis is formatted as percentages as well.
axes[1][1].yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

axes[0][0].set_ylabel('Partial Dependence')
axes[1][0].set_ylabel('Partial Dependence')
axes[0][0].set_xlabel('12-Months Return')
axes[0][1].set_xlabel('6-Months Return')
axes[1][0].set_xlabel('Conservative Minus Aggressive')

axes[1][1].set_xlabel('12-Month Return')
axes[1][1].set_ylabel('6-Months Return')
fig.suptitle('Partial Dependence Plots', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=.95)
plt.show()

### Two-way partial dependence as 3D Plot

In [59]:
targets = ['return_12m', 'return_6m']
pd_result = partial_dependence(estimator=gb_clf,
                               features=targets,
                               X=X_,
                               grid_resolution=100)

# The return type depends on the scikit-learn release: a plain
# (averages, grid) tuple on old releases, otherwise a dict-like Bunch whose
# grid is stored under 'grid_values' (newer) or 'values' (older). Unpacking
# the Bunch like a tuple would only yield its key names.
if isinstance(pd_result, tuple):
    pdp, axes = pd_result
else:
    pdp = pd_result['average']
    axes = pd_result.get('grid_values', pd_result.get('values'))

XX, YY = np.meshgrid(axes[0], axes[1])
# pdp[0] is reshaped to (len(grid_0), len(grid_1)) and transposed to line up
# with the meshgrid layout.
Z = pdp[0].reshape(list(map(np.size, axes))).T

fig = plt.figure(figsize=(14, 8))
# add_subplot attaches the 3D axes to the figure; a bare Axes3D(fig) is not
# added to the figure automatically by recent matplotlib releases.
ax = fig.add_subplot(111, projection='3d')
surface = ax.plot_surface(XX, YY, Z,
                          rstride=1,
                          cstride=1,
                          cmap=plt.cm.BuPu,
                          edgecolor='k')
ax.set_xlabel('12-Month Return')
ax.set_ylabel('6-Month Return')
ax.set_zlabel('Partial Dependence')
ax.view_init(elev=22, azim=30)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
ax.xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# fig.colorbar(surface)
fig.suptitle('Partial Dependence by 6- and 12-month Returns', fontsize=16)
fig.tight_layout()
plt.show()

## XGBoost

### Configuring

In [60]:
# XGBoost baseline: shallow trees (max_depth=3), no row/column subsampling,
# L2 regularisation only.
# `verbosity=0` replaces the obsolete `silent=True` flag, which current XGBoost
# releases no longer recognise.
xgb_clf = XGBClassifier(max_depth=3,
                        learning_rate=0.1,
                        n_estimators=100,
                        verbosity=0,
                        objective='binary:logistic',
                        booster='gbtree',
                        # tree_method='gpu_hist',
                        n_jobs=-1,
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=1,
                        colsample_bytree=1,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        scale_pos_weight=1,
                        base_score=0.5,
                        random_state=42)

#### Cross-Validating

In [61]:
algo = 'xgboost'

In [62]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    xgb_cv_result, run_time[algo] = run_cv(xgb_clf)
    joblib.dump(xgb_cv_result, fname)
else:
    xgb_cv_result = joblib.load(fname)

#### Plotting Results

In [63]:
xbg_result = stack_results(xgb_cv_result)

xbg_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [64]:
plot_result(xbg_result, model='XG Boost', fname=f'figures/{algo}_cv_result')
plt.show()

### Feature Importance

In [65]:
xgb_clf.fit(X=X_dummies, y=y)

In [66]:
fi = pd.Series(xgb_clf.feature_importances_, 
               index=X_dummies.columns)

In [67]:
fi.nlargest(25).sort_values().plot.barh(figsize=(10, 5), 
                                        title='Feature Importance')
sns.despine()
plt.tight_layout();
plt.show()

## LightGBM

### Configuring

In [68]:
# LightGBM classifier shared by the two cross-validation runs below (integer-
# coded categoricals and one-hot dummies). Gradient-boosted trees with 31
# leaves, no depth limit, no row/column subsampling and no regularisation;
# feature importances are reported as 'gain'.
# NOTE(review): `silent` appears to have been dropped from the scikit-learn
# wrapper in newer LightGBM releases - confirm against the installed version.
lgb_clf = LGBMClassifier(boosting_type='gbdt',
                         # device='gpu',
                         objective='binary',          
                         metric='auc',
                         num_leaves=31,               
                         max_depth=-1,                
                         learning_rate=0.1,          
                         n_estimators=100,            
                         subsample_for_bin=200000,    
                         class_weight=None,           
                         min_split_gain=0.0,          
                         min_child_weight=0.001,      
                         min_child_samples=20,        
                         subsample=1.0,               
                         subsample_freq=0,            
                         colsample_bytree=1.0,        
                         reg_alpha=0.0,               
                         reg_lambda=0.0,              
                         random_state=42,             
                         n_jobs=-1,                   
                         silent=False,
                         importance_type='gain',      
                        )

### Cross-Validating

#### Using Categorical Features

In [69]:
algo = 'lgb_factors'

In [70]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    lgb_factor_cv_result, run_time[algo] = run_cv(lgb_clf, X=X_factors, fit_params={'categorical_feature': cat_cols})
    joblib.dump(lgb_factor_cv_result, fname)
else:
    lgb_factor_cv_result = joblib.load(fname)

#### Plotting Results

In [71]:
lgb_factor_result = stack_results(lgb_factor_cv_result)

lgb_factor_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [72]:
plot_result(lgb_factor_result, model='Light GBM | Factors', fname=f'figures/{algo}_cv_result')
plt.show()

#### Using Dummy Variables

In [73]:
algo = 'lgb_dummies'

In [74]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    lgb_dummy_cv_result, run_time[algo] = run_cv(lgb_clf)
    joblib.dump(lgb_dummy_cv_result, fname)
else:
    lgb_dummy_cv_result = joblib.load(fname)

#### Plotting Results

In [75]:
lgb_dummy_result = stack_results(lgb_dummy_cv_result)

lgb_dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [76]:
# This figure shows the dummy-variable run, so it must not reuse the
# 'Factors' title of the categorical-feature run.
plot_result(lgb_dummy_result, model='Light GBM | Dummies', fname=f'figures/{algo}_cv_result')
plt.show()

## CatBoost

### CPU

#### Configuring

In [77]:
cat_clf = CatBoostClassifier()

#### Cross-Validating

In [79]:
s = pd.Series(X_factors.columns.tolist())

cat_cols_idx = s[s.isin(cat_cols)].index.tolist()

In [80]:
algo = 'catboost'

In [81]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    fit_params = {'cat_features': cat_cols_idx}
    cat_cv_result, run_time[algo] = run_cv(cat_clf,
                                           X=X_factors,
                                           fit_params=fit_params,
                                           n_jobs=-1)
    joblib.dump(cat_cv_result, fname)
else:
    cat_cv_result = joblib.load(fname)

#### Plotting Results

In [82]:
cat_result = stack_results(cat_cv_result)

cat_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [83]:
plot_result(cat_result, model='CatBoost', fname=f'figures/{algo}_cv_result')
plt.show()

### GPU

#### Configuring

In [84]:
cat_clf_gpu = CatBoostClassifier(task_type='GPU')

#### Cross-Validating

In [85]:
s = pd.Series(X_factors.columns.tolist())

cat_cols_idx = s[s.isin(cat_cols)].index.tolist()

In [86]:
algo = 'catboost_gpu'

In [87]:
fname = results_path / f'{algo}.joblib'

if not Path(fname).exists():
    fit_params = {'cat_features': cat_cols_idx}
    cat_gpu_cv_result, run_time[algo] = run_cv(cat_clf_gpu,
                                               y=y,
                                               X=X_factors,
                                               fit_params=fit_params, 
                                               n_jobs=1)
    joblib.dump(cat_gpu_cv_result, fname)
else:
    cat_gpu_cv_result = joblib.load(fname)

#### Plotting Results

In [88]:
cat_gpu_result = stack_results(cat_gpu_cv_result)

cat_gpu_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

In [89]:
# Title the figure as the GPU run so it can be told apart from the CPU
# CatBoost figure.
plot_result(cat_gpu_result, model='CatBoost GPU', fname=f'figures/{algo}_cv_result')
plt.show()

### Comparing Results

In [90]:
# Long-format CV results of every model, keyed by display name.
results = {'Baseline': dummy_result,
           'Random Forest': rf_result,
           'AdaBoost': ada_result,
           'Gradient Booster': gb_result,
           'XGBoost': xbg_result,
           'LightGBM Dummies': lgb_dummy_result,
           'LightGBM Factors': lgb_factor_result,
           'CatBoost': cat_result,
           'CatBoost GPU': cat_gpu_result}

# One column per model holding the mean test-set value of each metric; the
# columns are assembled in a single concat.
test_means = [
    scores.groupby(['Metric', 'Dataset']).Value.mean().unstack()['Test'].to_frame(name)
    for name, scores in results.items()
]
df = pd.concat(test_means, axis=1)

# Models as rows, best test AUC first.
df.T.sort_values('AUC', ascending=False)

In [91]:
# Cache/runtime key -> display name used in the results table.
# Written as a literal so each key sits next to its label: the former pair of
# parallel lists was misaligned and mapped 'lgb_factors' to 'LightGBM Dummies'
# and 'lgb_dummies' to 'LightGBM Factors'.
algo_dict = {'dummy_clf': 'Baseline',
             'random_forest': 'Random Forest',
             'adaboost': 'AdaBoost',
             'sklearn_gbm': 'Gradient Booster',
             'xgboost': 'XGBoost',
             'lgb_factors': 'LightGBM Factors',
             'lgb_dummies': 'LightGBM Dummies',
             'catboost': 'CatBoost',
             'catboost_gpu': 'CatBoost GPU'}

In [93]:
print(run_time)

In [94]:
runtime_file = results_path / 'runtime.csv'

# Timings measured in this session, indexed by display name.
r = pd.Series(run_time, dtype=float).to_frame('t')
r.index = r.index.to_series().map(algo_dict)

# run_time is only filled for models that were actually cross-validated in
# this session; models loaded from the joblib cache are missing. Keep the
# timings already on disk for those instead of overwriting the file with an
# empty or partial table.
if runtime_file.exists():
    previous = pd.read_csv(runtime_file, index_col=0)
    if r.empty:
        r = previous
    else:
        r = pd.concat([previous.loc[~previous.index.isin(r.index)], r])

r.to_csv(runtime_file)

In [97]:
r = pd.read_csv(results_path / 'runtime.csv', index_col=0)

In [98]:
auc = pd.concat([v.loc[(v.Dataset=='Test') & (v.Metric=='AUC'), 'Value'].to_frame('AUC').assign(Model=k) 
                 for k, v in results.items()])

# auc = auc[auc.Model != 'Baseline']

In [99]:
# Side-by-side summary: test AUC per model (left) and runtime (right).
fig, axes = plt.subplots(figsize=(15, 5), ncols=2)

# Model names ordered by mean test AUC, best first, with the baseline left out.
# Note: this rebinds the notebook-level name `idx`.
idx = df.T.drop('Baseline')['AUC'].sort_values(ascending=False).index
sns.barplot(x='Model', y='AUC',
            data=auc,
            order=idx, ax=axes[0])
# Break multi-word model names over several lines so the tick labels fit.
axes[0].set_xticklabels([c.replace(' ', '\n') for c in idx])
axes[0].set_ylim(.49, .58)
axes[0].set_title('Predictive Accuracy')

# Runtime per model on a log-scaled x-axis, again without the baseline.
(r.drop('Baseline').sort_values('t').rename(index=lambda x: x.replace(' ', '\n'))
 .plot.barh(title='Runtime', ax=axes[1], logx=True, legend=False))
axes[1].set_xlabel('Seconds (log scale)')
sns.despine()
fig.tight_layout()
plt.show()