In [None]:
import os 
import pandas as pd
import numpy as np

# Remember the folder the notebook was started from; later cells build their
# output paths from `directory`.
directory = os.getcwd()

# Swap the 'Strat_7' path component for '\module_1' and move there, presumably
# so the project-local modules imported right after this can be resolved.
# A raw string is used because '\m' is not a valid escape sequence: the plain
# literal only worked by accident and triggers a SyntaxWarning on recent
# Python versions. The resulting value is unchanged.
cwd = directory.replace('Strat_7', r'\module_1')
os.chdir(cwd)

import Preprocessing_functions as pf 
import calculateMaxDD 
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
# Display the current working directory as a sanity check after the chdir above.
os.getcwd()

In [None]:
ticker = 'QQQ'

# Download the price history for the ticker via the project helper.
df = pf.downlaod_symbol_data(ticker, period= '120mo')

# Remove the corporate-action columns that are not used as features.
# errors='ignore' drops whichever of them exist: a plain drop() raises KeyError
# as soon as ONE is missing and then removes none of them, which would silently
# leave the remaining ones in the feature set.
unused_cols = ['Stock Splits', 'Dividends', 'Capital Gains']
missing_cols = [col for col in unused_cols if col not in df.columns]
if missing_cols:
    print(f"Columns not available: {missing_cols}")
df = df.drop(columns=unused_cols, errors='ignore')

# TODO: inspect in more detail how the momentum features are built and how
# they are shifted relative to the target.
df = pf.create_momentum_feat(df, symbol=ticker)
df = pf.technical_indicators(df,MA_DIVERGENCE=True)
df = pf.format_idx_date(df)

# Target: overnight move from this row's close to the NEXT row's open.
# NOTE: the alternative (this row's open vs. the previous row's close) was
# flagged by the original author as WRONG - OVERFITTING; do not switch to it:
#   (df['Open'] - df['Close'].shift(1)) / df['Close'].shift(1)
df['overnight_pct'] = (df['Open'].shift(-1) - df['Close']) / df['Close']

# Keep only rows up to the cut-off date, then drop rows with any missing value
# (this also removes a final row whose next open is unknown).
df = df[df.index <= '2024-02-01']

df = df.dropna()

df.tail()

In [None]:
# Report the first and last index values of the prepared frame.
for label, value in (
    ('Start date of dataframe: ', df.index.min()),
    ('End date of dataframe: ', df.index.max()),
):
    print(label, value)

In [None]:
# Target column. It is selected rather than popped so that `df` is left intact
# and this cell can be re-run without raising KeyError.
y = df['overnight_pct']

# Features: every remaining column except the target and the raw OHLC prices.
X = df.drop(columns=['overnight_pct', 'Open', 'High', 'Low', 'Close'])

# Split data into train (60%), validation (20%), and test (20%) sets.
# The rows are dated observations, so the split keeps the row order
# (shuffle=False) instead of sampling at random: a shuffled split puts later
# rows in the training set and earlier ones in validation/test, which leaks
# future information into the fit.
# Assumes the index is already in ascending date order - TODO confirm.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)



In [None]:
# Display the names of the feature columns that will be fed to the model.
X.columns

In [None]:
# One-column table ('features') holding the feature names in column order.
model_columns = pd.DataFrame(X.columns, columns=['features'])

# The file name records the ticker and the number of features; the file is
# written under the notebook's starting folder.
features_csv_path = directory + f'/model_features/{ticker}_features_{len(X.columns)}.csv'
model_columns.to_csv(features_csv_path, index = False)


In [None]:
# Base estimator whose hyperparameters are tuned below.
rf = RandomForestRegressor(random_state=42)

# Candidate values for the exhaustive search (3 x 3 x 3 x 3 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 10-fold search scored on negative MSE, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the refitted winner and report its settings.
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Re-score the selected estimator with 10-fold cross-validation on the
# training set; the scorer returns negative MSE, so flip the sign before
# taking the square root.
cv_scores = cross_val_score(
    best_rf,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)
rmse_cv = np.sqrt(np.negative(cv_scores))
print(f"10-Fold CV RMSE: {rmse_cv.mean():.4f} ± {rmse_cv.std():.4f}")

In [None]:
# Validation-set check: predict, then report the root of the mean squared error.
y_val_pred = best_rf.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = np.sqrt(mse_val)
print(f"Validation RMSE: {rmse_val:.4f}")

In [None]:
# Peek at the first rows of the validation features.
X_val.head()

In [None]:
# Held-out test-set check: predict, then report the root of the mean squared error.
y_test_pred = best_rf.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print(f"Test RMSE: {rmse_test:.4f}")

In [None]:
# Move into the Strat_7\models folder, derived from the module folder path.
# Raw string: '\m' is not a valid escape sequence, so the plain literal only
# worked by accident and warns on recent Python versions. Same resulting path.
os.chdir(cwd.replace('module_1', r'Strat_7\models'))

In [None]:
# Display the current working directory to confirm where files will be written.
os.getcwd()

In [None]:
# Save the best model
# Switch into the Strat_7\models folder first so the pickles below land there.
# Raw string: '\m' is not a valid escape sequence, so the plain literal only
# worked by accident and warns on recent Python versions. Same resulting path.
os.chdir(cwd.replace('module_1', r'Strat_7\models'))
joblib.dump(best_rf, f"{ticker}_overnight_regression_random_forest.pkl")
print(f"Model saved as {ticker}_overnight_regression_random_forest.pkl")

# Also store the same estimator under a ticker-independent file name.
joblib.dump(best_rf, "best_random_forest.pkl")
print("Model saved as best_random_forest.pkl")

# Round-trip check: load the model back and run it on the test features.
loaded_model = joblib.load("best_random_forest.pkl")
y_loaded_pred = loaded_model.predict(X_test)

#### BACKTESTING

In [None]:
# To backtest another symbol, override the ticker here, e.g. ticker = 'SPY'.

# Download a longer history for the backtest.
df1 = pf.downlaod_symbol_data(ticker, period= '360mo')
try:
    df1 = df1.drop(columns=['Stock Splits', 'Dividends', 'Capital Gains'])
except KeyError:
    print("Columns not available (see above line of code)")

# Same feature pipeline as for the training frame, then discard incomplete rows.
# TODO: inspect in more detail how the momentum features are built and shifted.
df1 = (
    df1
    .pipe(pf.create_momentum_feat, symbol=ticker)
    .pipe(pf.technical_indicators, MA_DIVERGENCE=True)
    .pipe(pf.format_idx_date)
    .dropna()
)

# Backtest only on rows strictly before the first row of the modelling frame.
model_start_date = df.index.min()
df1 = df1.loc[df1.index < model_start_date]

df1.head()

In [None]:
# Display the feature names again; the next cell selects these columns from the backtest frame.
X.columns

In [None]:
# Backtest model input: the same columns, in the same order, as the training features.
df2 = df1.loc[:, X.columns]

df2.head()

In [None]:
# Move into the Strat_7\models folder before loading the saved model, then
# display the resulting working directory.
# Raw string: '\m' is not a valid escape sequence, so the plain literal only
# worked by accident and warns on recent Python versions. Same resulting path.
os.chdir(cwd.replace('module_1', r'Strat_7\models'))
os.getcwd()

In [None]:
# Reload the ticker-specific model from the current working directory; this
# rebinds `best_rf` to the unpickled estimator.
# NOTE(review): joblib.load unpickles the file - only load files produced by a trusted run.
best_rf = joblib.load(f"{ticker}_overnight_regression_random_forest.pkl")

In [None]:
# Model forecast for every backtest row.
df1['predictions'] = best_rf.predict(df2)

# Realised overnight move: this row's close to the next row's open
# (undefined for the final row, which has no next open).
next_open = df1['Open'].shift(-1)
df1['overnight_pct'] = next_open.sub(df1['Close']).div(df1['Close'])

# Trade direction: BUY on a positive forecast, SELL otherwise.
df1['action'] = np.where(df1['predictions'].gt(0), 'BUY', 'SELL')
df1.head()

In [None]:
# Discard rows with missing values (e.g. the last row, whose overnight move is undefined).
df1 = df1.dropna()

# Reference prices on either side of each row.
df1['prev_close'] = df1['Close'].shift(1)
df1['tom_open'] = df1['Open'].shift(-1)

# Keep only the columns that are not model features.
cols = [name for name in df1.columns if name not in X.columns]
df1 = df1[cols]

df1.head()

In [None]:
import matplotlib.pyplot as plt

# Signed forecast error: predicted minus realised overnight move.
df1['error'] = df1['predictions'] - df1['overnight_pct']

# Line chart of the error over the backtest dates.
fig, ax = plt.subplots(figsize=[10,7])
ax.plot(df1.index, df1['error'], color='b')
ax.set_title('Overning Error in PCT points')
ax.set_xlabel('Date')
ax.set_ylabel('Pct Difference')

In [None]:
# Peek at the backtest frame, which now includes the 'error' column.
df1.head()

In [None]:
# Per-row strategy return: the overnight move as-is for BUY rows, sign-flipped for SELL rows.
is_sell = df1['action'].eq('SELL')
df1['return'] = np.where(is_sell, -df1['overnight_pct'], df1['overnight_pct'])
df1.head()

In [None]:
# Running total of the per-row returns (a plain sum, not compounded).
df1['cum_ret'] = df1['return'].cumsum()

# Line chart of the running total over the backtest dates.
fig, ax = plt.subplots(figsize=[10,7])
ax.plot(df1.index, df1['cum_ret'], color='b')
ax.set_title('Backtest Cummulative Return PCT')
ax.set_xlabel('Date')
ax.set_ylabel('Cummulative Return PCT')

In [None]:
##### LOSSES
# Rows on which the strategy lost money (negative return).
df1.loc[df1['return'].lt(0)]

In [None]:

# Shared inputs for the metrics below; 252 is presumably the number of
# trading days per year used for annualising.
strat_ret = df1['return']
annualisation = np.sqrt(252)

#####   MAX DRAWDOWN
# Compounded return curve (unlike the summed 'cum_ret' column), handed to the
# project's drawdown helper as a plain array.
cum_ret = np.cumprod(1 + strat_ret) - 1
(
    maxDrawdown,
    maxDrawdownDuration,
    startDrawdownDay,
) = calculateMaxDD.calculateMaxDD(cum_ret.values)

#####   SHARPE RATIO
# Annualised mean-over-std of the per-row returns, rounded to two decimals.
sharpe_ratio = round(annualisation * np.mean(strat_ret) / np.std(strat_ret), 2)

#####   AVG YEARLY RETURN AND STD
mean_ret = strat_ret.mean() * 252
std = strat_ret.std() * annualisation

print(f'Sharpe Ratio: {sharpe_ratio}')
print(f'Maximum Drawdown: {round(maxDrawdown,4)}')
print(f'Max Drawdown Duration: {maxDrawdownDuration} days' )
print(f'Start day Drawdown: {startDrawdownDay}')
print(f"Average Yearly Return: {round(mean_ret*100, 2)} %")

In [None]:
# Single-column frame with the per-row strategy returns (index preserved).
rets = df1[['return']]

# Write <directory>/strat_returns/<ticker>.csv. os.path.join replaces the
# hand-built '\strat_returns' + '\{ticker}.csv' pieces, which relied on the
# invalid escape sequences '\s' and '\{' and on a Windows-only separator.
rets.to_csv(os.path.join(directory, 'strat_returns', f'{ticker}.csv'))

In [None]:
# Display the notebook's starting folder, which the exported files are written under.
directory

In [None]:
# Summary statistics of the model's predictions over the backtest rows.
df1.predictions.describe()