In [4]:
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split

# Replace 'TICKER' with the stock symbol you're interested in
ticker = 'SPY'
n_days = 10
n_hist_wk = 520

end_date = datetime.today()
start_date = end_date - timedelta(weeks=n_hist_wk)

data = yf.download(ticker, start=start_date, end=end_date, interval='1d')

data['price_change'] = data['Close'].pct_change()
data['volume_change'] = data['Volume'].pct_change()
data['relative_high'] = (data['High'] - data['Close']) / data['Close']
data['relative_low'] = (data['Low'] - data['Close']) / data['Close']

feature_df = pd.DataFrame()

# Add recent n_days data
for i in range(n_days, 0, -1):
    feature_df[f'price_change_{i}'] = data['price_change'].shift(i)
    feature_df[f'volume_change_{i}'] = data['volume_change'].shift(i)
    feature_df[f'relative_high_{i}'] = data['relative_high'].shift(i)
    feature_df[f'relative_low_{i}'] = data['relative_low'].shift(i)
    
# Add other factor
feature_df['day_in_week'] = data.index.dayofweek

feature_df['direction'] = (data['Close'].pct_change() > 0).astype(int)
feature_df = feature_df.dropna()
#feature_df.head(5)

X = feature_df.drop(columns=['direction'])
y = feature_df['direction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print('Data prepared! Data size:', feature_df.shape)

[*********************100%***********************]  1 of 1 completed
Data prepared! Data size: (2489, 82)


In [5]:
# SVM with CV
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from tqdm.auto import tqdm

# Custom cross-validation iterator with a progress bar
class TqdmKFold(KFold):
    def __iter__(self, *args, **kwargs):
        iterator = super().__iter__(*args, **kwargs)
        for train_index, test_index in iterator:
            yield train_index, test_index
            tqdm.write('.', end='', flush=True)

# Create the SVM classifier
svm = SVC()

# Define the hyperparameter grid for the grid search
param_grid = {
    'C': np.logspace(-3, 3, 7),
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'] + list(np.logspace(-3, 2, 6))
}

# Define the scoring metrics for the grid search
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score)
}

# Create the progress bar
n_splits = 5
progress_bar = tqdm(total=n_splits, desc='GridSearchCV', unit='fold')

# Create the GridSearchCV object with the custom cross-validation iterator
grid_search_svm = GridSearchCV(
    svm, param_grid, scoring=scoring_metrics, refit='accuracy', cv=TqdmKFold(n_splits=n_splits), n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_svm.fit(X_train, y_train)

# Close the progress bar
progress_bar.close()

# Print the best hyperparameters and the corresponding scores
print(f'Best hyperparameters: {grid_search_svm.best_params_}')
print(f'Best accuracy: {grid_search_svm.best_score_:.4f}')

# Evaluate the best model on the test set
best_svm = grid_search_svm.best_estimator_
y_pred = best_svm.predict(X_test)

print(f'Test accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'Test precision: {precision_score(y_test, y_pred):.4f}')
print(f'Test recall: {recall_score(y_test, y_pred):.4f}')
print(f'Test F1 score: {f1_score(y_test, y_pred):.4f}')

GridSearchCV:   0%|          | 0/5 [00:00<?, ?fold/s]

Best hyperparameters: {'C': 0.001, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy: 0.5419
Test accuracy: 0.5422
Test precision: 0.5422
Test recall: 1.0000
Test F1 score: 0.7031


In [6]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from tqdm.auto import tqdm

# Custom cross-validation iterator with a progress bar
class TqdmKFold(KFold):
    def __iter__(self, *args, **kwargs):
        iterator = super().__iter__(*args, **kwargs)
        for train_index, test_index in iterator:
            yield train_index, test_index
            tqdm.write('.', end='', flush=True)

# Create the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid for the grid search
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
    'bootstrap': [True, False]
}

# Define the scoring metrics for the grid search
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
}

# Create the progress bar
n_splits = 5
progress_bar = tqdm(total=n_splits, desc='GridSearchCV', unit='fold')

# Create the GridSearchCV object with the custom cross-validation iterator
grid_search_rf = GridSearchCV(
    rf, param_grid, scoring=scoring_metrics, refit='accuracy', cv=TqdmKFold(n_splits=n_splits), n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_rf.fit(X_train, y_train)

# Close the progress bar
progress_bar.close()

# Print the best hyperparameters and the corresponding scores
print(f'Best hyperparameters: {grid_search_rf.best_params_}')
print(f'Best accuracy: {grid_search_rf.best_score_:.4f}')

# Evaluate the best model on the test set
best_rf = grid_search_rf.best_estimator_
y_pred = best_rf.predict(X_test)

print(f'Test accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'Test precision: {precision_score(y_test, y_pred):.4f}')
print(f'Test recall: {recall_score(y_test, y_pred):.4f}')
print(f'Test F1 score: {f1_score(y_test, y_pred):.4f}')

GridSearchCV:   0%|          | 0/5 [00:00<?, ?fold/s]

  warn(


Best hyperparameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy: 0.5289
Test accuracy: 0.5281
Test precision: 0.5432
Test recall: 0.8148
Test F1 score: 0.6519


In [7]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from tqdm.auto import tqdm

# Custom cross-validation iterator with a progress bar
class TqdmKFold(KFold):
    def __iter__(self, *args, **kwargs):
        iterator = super().__iter__(*args, **kwargs)
        for train_index, test_index in iterator:
            yield train_index, test_index
            tqdm.write('.', end='', flush=True)

# Create the GBM classifier
gbm = GradientBoostingClassifier(random_state=42)

# Define the hyperparameter grid for the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

# Define the scoring metrics for the grid search
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
}

# Create the progress bar
n_splits = 5
progress_bar = tqdm(total=n_splits, desc='GridSearchCV', unit='fold')

# Create the GridSearchCV object with the custom cross-validation iterator
grid_search_gbm = GridSearchCV(
    gbm, param_grid, scoring=scoring_metrics, refit='accuracy', cv=TqdmKFold(n_splits=n_splits), n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_gbm.fit(X_train, y_train)

# Close the progress bar
progress_bar.close()

# Print the best hyperparameters and the corresponding scores
print(f'Best hyperparameters: {grid_search_gbm.best_params_}')
print(f'Best accuracy: {grid_search_gbm.best_score_:.4f}')

# Evaluate the best model on the test set
best_gbm = grid_search_gbm.best_estimator_
y_pred = best_gbm.predict(X_test)

print(f'Test accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'Test precision: {precision_score(y_test, y_pred):.4f}')
print(f'Test recall: {recall_score(y_test, y_pred):.4f}')
print(f'Test F1 score: {f1_score(y_test, y_pred):.4f}')

GridSearchCV:   0%|          | 0/5 [00:00<?, ?fold/s]





Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best accuracy: 0.5419
Test accuracy: 0.5261
Test precision: 0.5351
Test recall: 0.9593
Test F1 score: 0.6870




In [12]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

# Create the MLP classifier
mlp = MLPClassifier(random_state=42)

# Define the hyperparameter grid for the grid search
param_grid = {
    'hidden_layer_sizes': [(50, 50), (100, 100), (50, 100), (100, 50), (50, 50, 50), (100, 50, 100), (50, 100, 50), (50, 100, 100), (50, 50, 50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive'],
}

# Define the scoring metrics for the grid search
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
}

# Create the GridSearchCV object
grid_search_mlp = GridSearchCV(
    mlp, param_grid, scoring=scoring_metrics, refit='accuracy', cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_mlp.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding scores
print(f'Best hyperparameters: {grid_search_mlp.best_params_}')
print(f'Best accuracy: {grid_search_mlp.best_score_:.4f}')

# Evaluate the best model on the test set
best_mlp = grid_search_mlp.best_estimator_
y_pred = best_mlp.predict(X_test)

print(f'Test accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'Test precision: {precision_score(y_test, y_pred):.4f}')
print(f'Test recall: {recall_score(y_test, y_pred):.4f}')
print(f'Test F1 score: {f1_score(y_test, y_pred):.4f}')

Best hyperparameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 100), 'learning_rate': 'constant', 'solver': 'adam'}
Best accuracy: 0.5093
Test accuracy: 0.4839
Test precision: 0.5217
Test recall: 0.5778
Test F1 score: 0.5483




In [13]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Retrieve the best models from the grid searches
best_svm = grid_search_svm.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_gbm = grid_search_gbm.best_estimator_
best_mlp = grid_search_mlp.best_estimator_

# Create the stacking classifier
stacked_clf = StackingClassifier(
    estimators=[
        ('svm', best_svm),
        ('random_forest', best_rf),
        ('gbm', best_gbm),
        ('mlp', best_mlp),
    ],
    final_estimator=LogisticRegression(random_state=42),
    cv=5,
    n_jobs=-1
)

# Fit the stacking classifier to the training data
stacked_clf.fit(X_train, y_train)

# Evaluate the stacking classifier on the test set
y_pred = stacked_clf.predict(X_test)

print(f'Stacking Test accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'Stacking Test precision: {precision_score(y_test, y_pred):.4f}')
print(f'Stacking Test recall: {recall_score(y_test, y_pred):.4f}')
print(f'Stacking Test F1 score: {f1_score(y_test, y_pred):.4f}')

Stacking Test accuracy: 0.5422
Stacking Test precision: 0.5422
Stacking Test recall: 1.0000
Stacking Test F1 score: 0.7031


In [17]:
# Prepare the input data for prediction
last_data = X.iloc[-1:].values

print(last_data)

# Make a prediction using the stacking classifier
tomorrow_direction = stacked_clf.predict(last_data)

# Print the predicted direction for tomorrow
if tomorrow_direction[0] == 1:
    print("The predicted direction for tomorrow is UP.")
else:
    print("The predicted direction for tomorrow is DOWN.")

[[-6.25433880e-03  1.55219549e-01  5.39435511e-04 -1.43084858e-02
   1.75451772e-02 -1.71926202e-01  9.08878550e-04 -2.47910357e-02
  -1.54502420e-02 -1.88531994e-02  1.13079919e-02 -3.69240872e-03
   9.61563141e-03 -3.37932771e-01  1.09215016e-03 -9.32082847e-03
   1.31305266e-02 -1.64589418e-02  1.25341555e-03 -8.34779038e-03
  -1.70464973e-02  2.20951399e-01  2.64721769e-02 -1.01956157e-04
   2.70339475e-03  6.80530772e-02  1.55657728e-02 -7.17248831e-03
   6.56201258e-03 -9.77693582e-02  2.27407045e-04 -1.60454987e-02
   1.86984266e-03 -3.12697340e-01  6.12883869e-03 -2.34556407e-03
  -2.24465725e-03 -1.50501821e-01  2.24970707e-03 -4.82811838e-03
   1.45348835e-02  2.32635669e-01  6.22897711e-04 -6.65258101e-03
   5.85525369e-03 -9.88142905e-02  1.61009135e-03 -4.80555458e-03
   1.40946303e-02  6.03301833e-01  7.57218172e-04 -1.18225327e-02
   3.81054130e-03 -3.98157261e-01  1.02198053e-03 -6.10782258e-03
  -5.54811707e-03 -1.17166807e-02  7.95262655e-03 -3.49921244e-03
  -2.61826

