In [1]:
# Standard library imports
from datetime import datetime, timedelta

# Third-party library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import yfinance as yf

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from scripts.get_top_stocks import *
top_stocks_long, top_stocks_long_short = generate_top_stocks_df()
common_top_stocks = get_common_top_stocks(top_stocks_long, top_stocks_long_short)

# specify variables for dataset generation
start_date = '2023-01-01'
end_date = '2024-08-01'
interval = "1h"
trading_days_per_year = 252
hours_per_day = 6.5


Results saved to 'top_30_stocks_long_only.csv' and 'top_30_stocks_long_short.csv'

Detailed metrics for top stock (Long-Only Strategy) SMCI:
Final Close Price: 284.26
60-day Return: 250.61%
Average Hourly Volatility: 0.6678
Average Sharpe Ratio: 1.6242
Average Hourly Volume: 355714

num common_top_stocks:  35


In [3]:
from scripts.generate_dataset_features import *


df = get_all_stock_features_df(
    stocks_list=common_top_stocks,
    start_date=start_date,
    end_date=end_date,
    interval=interval,
    trading_days_per_year=trading_days_per_year, 
    hours_per_day=hours_per_day)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [4]:
df.head()

Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume,Log_Return,EMAVolumeDiff2,SMAVolumeDiff2,...,MACD16,MACD32,MACD64,MACD128,MACD256,FamaFrenchMktReturns,Log_Return_shift,stock_name,Stock_Position,Target
0,2023-02-24 13:30:00-05:00,34.439999,34.537498,34.380001,34.455002,34.455002,430096,0.000363,0.237227,0.377802,...,2.95974,-0.088659,-0.726666,-3.268206,-37.998322,-1.09,0.004199,CPRT,buy,2
1,2023-02-24 14:30:00-05:00,34.450001,34.634998,34.41,34.599998,34.599998,694483,0.004199,0.199732,0.235099,...,-0.570454,0.152135,21.164271,35.81704,10.515641,-1.09,0.000434,CPRT,hold,1
2,2023-02-24 15:30:00-05:00,34.610001,34.634998,34.455002,34.615002,34.615002,456536,0.000434,-0.081993,-0.206727,...,-0.579874,0.162966,5.581055,16.741564,9.517186,-1.09,0.009417,CPRT,buy,2
3,2023-02-27 09:30:00-05:00,34.895,35.02,34.740002,34.942501,34.942501,181650,0.009417,-0.366788,-0.43073,...,0.050891,0.093693,0.34685,1.293227,2.683604,0.31,-0.006244,CPRT,sell,0
4,2023-02-27 10:30:00-05:00,34.93,34.93,34.695,34.724998,34.724998,108775,-0.006244,-0.35307,-0.250925,...,2.594881,0.522353,1.372016,3.948259,5.474724,0.31,0.003019,CPRT,buy,2


In [38]:
df.Target.value_counts()

Target
1.0    30558
0.0    29656
Name: count, dtype: int64

In [28]:
# additional logic to ensure buy remains +ve and sell remainvs -ve

print(len(df[(df.Log_Return_shift < 0) & (df.Stock_Position == 'buy')]))
print(len(df[(df.Log_Return_shift < 0) & (df.Stock_Position == 'strong buy')]))

print(len(df[(df.Log_Return_shift > 0) & (df.Stock_Position == 'sell')]))
print(len(df[(df.Log_Return_shift > 0) & (df.Stock_Position == 'strong sell')]))

# all good

0
0
0
0


In [30]:
df.drop(['Log_Return_shift'], axis=1, inplace=True)
df['Target'] = df['Target'].map({1:0, 2:1, 3:2})
df=df.dropna()

In [32]:
target = 'Target'
features = df.columns[8:-1]
features = features.drop('stock_name')
features = features.drop('Stock_Position')

In [34]:
# sort df according to date
# df1 = df.sort_values("Datetime").reset_index(drop=True)
df1 = df.sort_values("Datetime").reset_index(drop=True)
# train test split on index
train_size, val_size, test_size = 0.8, 0.1, 0.1
train_size, val_size, test_size = int(len(df1) * train_size), int(len(df1) * val_size), int(len(df1) * test_size)
train_df = df1.iloc[:train_size]
val_df = df1.iloc[train_size:train_size + val_size]
test_df = df1.iloc[train_size + val_size:]

train_df.shape,val_df.shape,test_df.shape

((48171, 83), (6021, 83), (6022, 83))

In [36]:
test_df.Target.value_counts()

Target
1.0    3047
0.0    2975
Name: count, dtype: int64

In [None]:
features

In [None]:
features_numpy = df[features].to_numpy()
target_numpy = df[target].to_numpy()
# features = features.drop('Target')
X_train = train_df[features].to_numpy()
y_train = train_df[target].to_numpy()
print(X_train.shape,y_train.shape)

X_val = val_df[features].to_numpy()
y_val = val_df[target].to_numpy()
print(X_val.shape,y_val.shape)

X_test = test_df[features].to_numpy()
y_test = test_df[target].to_numpy()
print(X_test.shape,y_test.shape)

In [None]:
from scripts.train_predict import *

# standardise the values

scaler = StandardScaler()

# Fit the scaler on the training data and transform the training data
X_train = scaler.fit_transform(X_train)

# Transform the validation data
X_val = scaler.transform(X_val)

# Transform the test data
X_test = scaler.transform(X_test)

X_train.shape,y_train.shape,X_val.shape,y_val.shape,X_test.shape,y_test.shape

In [None]:
X_train

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
# from tqdm import tqdm
from alive_progress import alive_it

def train():
    # initialise training params
    best_score = float('inf')
    best_params = None
    best_model = None
    

    # Define the parameter grid
    # param_grid={"learning_rate": (0.05, 0.10, 0.15),
    #             "max_depth": [ 3, 4, 6, 8],
    #             "min_samples_leaf": [0.001, 0.05, 0.1],
    #             "n_estimators": [100, 200, 500]
    #             },
    param_grid = {
        'eta': [0.1, 0.3, 0.5],
        'gamma': [0, ],
        'max_depth': [6, 8],
    }
    # best_model = xgb.XGBClassifier()
    # best_model.fit(X_train, y_train)
    # y_pred = best_model.predict(X_val)
    # best_score = accuracy_score(y_val, y_pred)
    # best_params = 0
    for params in alive_it(ParameterGrid(param_grid)):
        model = xgb.XGBClassifier(**params) # initialise new model before each run
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        score = accuracy_score(y_val, y_pred)
        
        if score < best_score:
            best_score = score
            best_params = params
            best_model = model
            print(f"score: {round(best_score,7)}")
            # print(f"score: {round(best_score,7)}, params: {best_params}")

    print(f"Best parameters found: {best_params}")
    print(f"Best validation score: {round(best_score,5)}")

    return best_model, best_params, best_score, X_test, y_test

def predict(best_model,X_test, y_test):
    y_pred = best_model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy score: {round(score,5)}")
    
    return y_pred

def train_predict():
    best_model, best_params, best_score, X_test, y_test = train()
    y_pred = predict(best_model, X_test, y_test)
    acc = accuracy_score(y_test, y_pred)
 
    
    return best_model, best_params, best_score, acc, y_pred
    

In [None]:
best_model, best_params, best_score, acc, y_pred = train_predict()

In [None]:
unique, counts = np.unique(y_pred, return_counts=True)

dict(zip(unique, counts))

In [None]:
unique, counts = np.unique(y_test, return_counts=True)

dict(zip(unique, counts))

In [None]:
best_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

print(f"accuracy: {round(acc*100,2)}")
labels = ['sell', 'hold', 'buy']
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=labels)
disp.plot()
plt.show()

In [None]:
stock_df = test_df[test_df['stock_name']=='LRCX']
stock_X_test = stock_df[features].to_numpy()
stock_y_test = stock_df[target].to_numpy()
stock_preds = best_model.predict(stock_X_test)
acc = accuracy_score(stock_y_test, stock_preds)

In [None]:
features

In [None]:
stock_df[features]

In [None]:
stock_y_test

In [None]:
for ticker in test_df.stock_name.unique():
    stock_df = test_df[test_df['stock_name']==ticker]
    stock_X_test = stock_df[features].to_numpy()
    stock_y_test = stock_df[target].to_numpy()
    stock_preds = best_model.predict(stock_X_test)
    acc = accuracy_score(stock_y_test, stock_preds)

    print(f"{ticker}: {round(acc*100, 2)}")

In [None]:
test_df.Target.value_counts()