In [21]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [22]:
# Download AAPL price history via yfinance for the 2019-01-01 .. 2020-01-01 window.
data = yf.download(
    tickers='AAPL',
    start='2019-01-01',
    end='2020-01-01',
)

[*********************100%%**********************]  1 of 1 completed


### Model

In [23]:
from sklearn.model_selection import train_test_split

# Feature engineering.
# The derived columns are always recomputed from the full `Close` series, and
# rows with missing values are filtered into a separate frame instead of being
# dropped from `data` in place. Trimming `data` itself made this cell
# non-idempotent: every re-run discarded another rolling-window warm-up period
# and silently changed X / y.
close = data['Close']
data['S_3'] = close.rolling(window=3).mean()      # 3-row moving average of Close
data['S_9'] = close.rolling(window=9).mean()      # 9-row moving average of Close
data['return'] = np.log(close / close.shift(1))   # log return versus the previous row
data['Y'] = (data['return'] > 0).astype(int)      # 1 when the log return is positive, else 0

# Keep only rows where every column is populated (i.e. a full 9-row window exists).
model_data = data.dropna()

X = model_data[['S_3', 'S_9']]
y = model_data['Y']

# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows, and S_3 / S_9 include the same row's
# Close that determines Y. Confirm whether a chronological split and lagged
# features are intended before trusting the scores below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [24]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters, fitted on the training split.
# random_state is pinned (same seed as the train/test split) because the tree
# builder permutes features at every split and breaks ties randomly; without a
# seed the fitted tree -- and every search that clones this estimator -- can
# differ from run to run.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)


In [25]:
from sklearn.metrics import mean_squared_error

# Score the baseline tree on the held-out split.
# Y is built as 0/1 integers, so each squared error is either 0 or 1 and this
# mean is the share of misclassified test rows.
pred = tree.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=pred)
print(test_mse)

0.42857142857142855


# Grid Search

## Defining grid parameters

In [30]:
# Hyper-parameter search space shared by all searches below:
# 2 criteria x 5 depths x 3 split sizes x 3 leaf sizes = 90 combinations.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=list(range(3, 12, 2)),   # 3, 5, 7, 9, 11
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

## Exhaustive grid search

In [31]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search: every combination in param_grid is scored by recall with
# 5-fold cross-validation on the training split, using all available cores.
classifier_gs = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
classifier_gs.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


In [35]:
from sklearn import metrics

# Report the winning configuration of the exhaustive search.
# best_score_ is the mean cross-validated recall of the best candidate over the
# CV folds of the training split -- it is not recall measured on the training
# set, so it is labelled as a CV score.
gs_test_recall = metrics.recall_score(y_test, classifier_gs.predict(X_test))

print(f"Best parameters: {classifier_gs.best_params_}")
print(f"Recall (mean CV score): {classifier_gs.best_score_:.2f}")
print(f"Recall (Test set): {gs_test_recall:.4f}")

Best parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Recall (Training set): 0.97
Recall (Test set): 0.9630


In [36]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the grid-search winner on the held-out split
# (scikit-learn puts true classes on rows and predicted classes on columns).
gs_test_pred = classifier_gs.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=gs_test_pred)

array([[ 1, 21],
       [ 1, 26]])

## Randomized search

In [37]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: scores a random sample of combinations from param_grid
# (n_iter is left at its default) by recall with 5-fold cross-validation.
# random_state is pinned so the same candidates are sampled on every run;
# without it the reported best parameters change between executions.
classifier_rs = RandomizedSearchCV(
    tree,
    param_grid,
    cv=5,
    scoring='recall',
    n_jobs=-1,
    verbose=1,
    random_state=0,
)
classifier_rs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [41]:
from sklearn import metrics

# Report the winning configuration of the randomized search.
# best_score_ is the mean cross-validated recall of the best candidate over the
# CV folds of the training split -- it is not recall measured on the training
# set, so it is labelled as a CV score. The "parmeters" typo and the stray
# colon after "Recall" in the printed labels are fixed as well.
rs_test_recall = metrics.recall_score(y_test, classifier_rs.predict(X_test))

print(f"Best parameters: {classifier_rs.best_params_}")
print(f"Recall (mean CV score): {classifier_rs.best_score_:.2f}")
print(f"Recall (Test set): {rs_test_recall:.4f}")

Best parmeters: {'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 3, 'criterion': 'entropy'}
Recall: (Training set): 0.97
Recall: (Test set): 0.9630


In [42]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the randomized-search winner on the held-out split
# (scikit-learn puts true classes on rows and predicted classes on columns).
rs_test_pred = classifier_rs.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=rs_test_pred)

array([[ 1, 21],
       [ 1, 26]])

## Halving grid search

In [45]:
# The experimental flag must be imported before HalvingGridSearchCV can be
# imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV

# Successive halving: all candidates in param_grid start on a small number of
# samples and only the best-scoring ones advance to rounds with more samples.
# random_state is pinned so that any random subsampling done by the search is
# repeatable from run to run.
classifier_hgs = HalvingGridSearchCV(
    tree,
    param_grid,
    cv=5,
    scoring='recall',
    n_jobs=-1,
    verbose=1,
    random_state=0,
)
classifier_hgs.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 5
n_possible_iterations: 3
min_resources_: 20
max_resources_: 195
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 90
n_resources: 20
Fitting 5 folds for each of 90 candidates, totalling 450 fits
----------
iter: 1
n_candidates: 30
n_resources: 60
Fitting 5 folds for each of 30 candidates, totalling 150 fits
----------
iter: 2
n_candidates: 10
n_resources: 180
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [46]:
from sklearn import metrics

# Report the winning configuration of the halving search.
# best_score_ is the mean cross-validated recall of the best candidate -- it is
# not recall measured on the training set, so it is labelled as a CV score.
# The "parmeters" typo and the stray colon after "Recall" in the printed
# labels are fixed as well.
hgs_test_recall = metrics.recall_score(y_test, classifier_hgs.predict(X_test))

print(f"Best parameters: {classifier_hgs.best_params_}")
print(f"Recall (mean CV score): {classifier_hgs.best_score_:.2f}")
print(f"Recall (Test set): {hgs_test_recall:.4f}")

Best parmeters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5}
Recall: (Training set): 0.89
Recall: (Test set): 0.8889


In [47]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the halving-search winner on the held-out split
# (scikit-learn puts true classes on rows and predicted classes on columns).
hgs_test_pred = classifier_hgs.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=hgs_test_pred)

array([[ 3, 19],
       [ 3, 24]])