# Realized Volatility Prediction
## 4 (pt.1) Modeling

### Table of Contents
4. KNN
5. Gradient Boosting
6. Hist Gradient Boosting
7. Tuning Hist Gradient Boosting

In [1]:
# Standard imports and libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/interim/features1_train2.csv')
train.drop(columns=['Unnamed: 0'], inplace=True)
X_train= train.drop(['stock_id','time_id', 'target_value'], axis=1)
y_train = train[['target_value']].values.ravel()

In [3]:
test = pd.read_csv('../data/interim/features1_train2.csv')
test.drop(columns=['Unnamed: 0'], inplace=True)
X_test = test.drop(['stock_id','time_id', 'target_value'], axis=1)
y_test = test[['target_value']].values.ravel()

## 4.4 KNN

In [4]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=100)
neigh.fit(X_train, y_train)

In [5]:
y_pred = neigh.predict(X_test)

In [6]:
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

In [7]:
rmspe(y_test, y_pred)

0.8725204650743684

## 4.5 Gradient Boosting

In [8]:
from sklearn.ensemble import GradientBoostingRegressor
gbreg = GradientBoostingRegressor(random_state=0)
gbreg.fit(X_train, y_train)

In [9]:
y_pred = gbreg.predict(X_test)
rmspe(y_test, y_pred)

0.30003018540074855

## 4.6 Hist Gradient Boosting

In [10]:
from sklearn.ensemble import HistGradientBoostingRegressor

common_params = {
    "max_iter": 1_000,
    "learning_rate": 0.3,
    "validation_fraction": 0.2,
    "random_state": 42,
    "categorical_features": None,
    "scoring": "neg_root_mean_squared_error",
}

hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
hgbt.fit(X_train, y_train)

In [11]:
y_pred = hgbt.predict(X_test)
rmspe(y_test, y_pred)

0.2919963519445322

## 4.7 Tuning Hist Gradient Boosting

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#

In [12]:
from sklearn.metrics import make_scorer
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
rmspe_score = make_scorer(rmspe, greater_is_better=False)

In [13]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(hgbt, X_train, y_train, cv=5, scoring=rmspe_score)
print(scores)

[-0.28029399 -0.42583363 -0.27318852 -0.26660448 -0.27738921]


Perform Halving Random Search

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingRandomSearchCV.html#sklearn.model_selection.HalvingRandomSearchCV

In [14]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV

In [15]:
hgbt = HistGradientBoostingRegressor(random_state=57)

In [16]:
param_distributions = {'learning_rate' : [0.1, 0.3, 0.5, 1], 
                       'max_iter' : [10, 100, 1000],
                       'max_leaf_nodes': [10, 25, 50]}

In [17]:
search = HalvingRandomSearchCV(hgbt, param_distributions,
                               scoring=rmspe_score, random_state=57).fit(X_train, y_train)
search.best_params_



{'max_leaf_nodes': 50, 'max_iter': 100, 'learning_rate': 0.3}

In [18]:
df = pd.DataFrame(search.cv_results_)
df.sort_values(by=['rank_test_score'], inplace=True)
select_view = df[['rank_test_score', 'param_max_leaf_nodes', 'param_max_iter', 'param_learning_rate', 'mean_test_score', 'std_test_score']]
select_view.head(10)

Unnamed: 0,rank_test_score,param_max_leaf_nodes,param_max_iter,param_learning_rate,mean_test_score,std_test_score
48,1,50,100,0.3,-0.340481,0.073263
53,2,50,100,0.3,-0.392517,0.03044
49,3,10,1000,0.3,-0.41689,0.084812
52,4,10,1000,0.3,-0.456296,0.069946
51,5,50,1000,1.0,-0.528262,0.118693
50,5,25,1000,1.0,-0.528262,0.118693
23,7,50,100,0.5,-0.778659,1.310867
24,7,10,1000,0.5,-0.778659,1.310867
25,7,25,1000,0.5,-0.778659,1.310867
27,7,10,10,1.0,-0.778659,1.310867


In [19]:
hgbt1 = HistGradientBoostingRegressor(random_state=57, learning_rate=0.3, max_iter=100, max_leaf_nodes=50)
scores = cross_val_score(hgbt, X_train, y_train, cv=5, scoring=rmspe_score)
print(np.mean(scores))

-0.32588546984393696


In [22]:
hgbt1.fit(X_train, y_train)
y_pred = hgbt1.predict(X_test)
final_score = rmspe(y_test, y_pred)
print(final_score)

0.297360198348431


In [23]:
y_naive= X_test['real_vol_1']
naive_score = rmspe(y_test, y_naive)
print(naive_score)

0.34061479341887974


In [24]:
improvement = final_score / naive_score - 1
print(improvement * 100)

-12.698977233574016
