In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
data = pd.read_csv('spy dal 2022.csv', index_col=0, parse_dates=True, sep=',')
data["returns"] = data["close"].pct_change(1)

### Features engineering

##### Realized Volatility

In [3]:
data["RV_1_sqrt"] = np.sqrt(data["returns"]**2)
data["RV_5_sqrt"] = np.sqrt(data["returns"].rolling(5).sum()**2)
data["RV_30_sqrt"] = np.sqrt(data["returns"].rolling(22).sum()**2)
data["RV_60_sqrt"] = np.sqrt(data["returns"].rolling(66).sum()**2)
data["RV_120_sqrt"] = np.sqrt(data["returns"].rolling(132).sum()**2)

##### Modello HAR

In [4]:
data["modello HAR"] = 0.0 + 0.2 * data["RV_1_sqrt"] + 0.2 * data["RV_5_sqrt"] + 0.2 * data["RV_30_sqrt"] + 0.2 * data["RV_60_sqrt"] + 0.2 * data["RV_120_sqrt"]

##### Target

In [5]:
data["HAR_tomorrow"] = data["modello HAR"].shift(-1)
data["target"] = (data["HAR_tomorrow"] > data["modello HAR"] ).astype(int)

##### X

In [13]:
# Average True Range
data["TR"] = np.maximum(data["high"] - data["low"], data["high"] - data["close"].shift(1), data["low"] - data["close"].shift(1)) 
data["ATR"] = data["TR"]

# Close Relative To Daily Range
data["CRTDR"] = (data["close"] - data["low"]) / (data["high"] - data["low"])

# Exponential Moving Average of realised volatility
data["EMA_RV_1_sqrt"] = data["RV_1_sqrt"].ewm(span=22, adjust=False).mean()
data["EMA_RV_5_sqrt"] = data["RV_5_sqrt"].ewm(span=22, adjust=False).mean()
data["EMA_RV_30_sqrt"] = data["RV_30_sqrt"].ewm(span=22, adjust=False).mean()
data["EMA_RV_60_sqrt"] = data["RV_60_sqrt"].ewm(span=22, adjust=False).mean()
data["EMA_RV_120_sqrt"] = data["RV_120_sqrt"].ewm(span=22, adjust=False).mean()

# Moving Average convergence divergence
data["MACDRV"] = data["EMA_RV_1_sqrt"] - data["EMA_RV_5_sqrt"]

# Relative Strength Index for realised volatility
data["RSI_RV_1_sqrt"] = 100 - 100 / (1 + data["RV_1_sqrt"].rolling(14).mean())

data.dropna(inplace=True)



### Split the data

In [14]:
X = data[["RV_1_sqrt", "RV_5_sqrt", "RV_30_sqrt", "RV_60_sqrt", "RV_120_sqrt", "TR", "ATR", "CRTDR", "EMA_RV_1_sqrt", "EMA_RV_5_sqrt", "EMA_RV_30_sqrt", "EMA_RV_60_sqrt", "EMA_RV_120_sqrt", "MACDRV", "RSI_RV_1_sqrt"]]



y = data["target"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0, max_leaf_nodes=50, min_impurity_decrease=0, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [38]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5959208044401623

### Gradient Boosting Classifier

In [40]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=5, learning_rate=0.5, random_state=42, max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0, max_leaf_nodes=50, min_impurity_decrease=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [41]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5959208044401623

### Ada Boost Classifier 

In [20]:
from sklearn.ensemble import AdaBoostClassifier 

clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.2, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)



In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5933017600521094

### Grid Search

In [42]:
from sklearn.model_selection import GridSearchCV

clf

param_grid = {'max_depth': [3, 5, 7, 9], 
              'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
              'min_samples_split': [2, 4, 6, 8],
              'min_samples_leaf': [1, 2, 3, 4, 5],
              'min_weight_fraction_leaf': [0, 0.1, 0.2, 0.3],
              'max_leaf_nodes': [10, 20, 30, 40, 50],
              'min_impurity_decrease': [0, 0.1, 0.2, 0.3]}

grid_search = GridSearchCV(clf, param_grid, cv=5)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)



