In [911]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

In [912]:
# Read the CSV file into a DataFrame
apple_df = pd.read_csv("AAPL.csv", index_col = "Date")

In [913]:
fear_index_df = pd.read_csv("^VIX.csv", index_col = "Date")

# Rename the "High" column to "Fear_index"
fear_index_df = fear_index_df.rename(columns={"High": "Fear_index"})


In [914]:
spy_index_df = pd.read_csv("SPY.csv", index_col = "Date")

# Rename the "High" column to "Fear_index"
spy_index_df = spy_index_df.rename(columns={"High": "Spy_index"})

In [915]:
fedfunds_df = pd.read_csv("FEDFUNDS (1).csv", index_col = "DATE")

fedfunds_df.head()

Unnamed: 0_level_0,FEDFUNDS
DATE,Unnamed: 1_level_1
2014-04-01,0.09
2014-05-01,0.09
2014-06-01,0.1
2014-07-01,0.09
2014-08-01,0.09


In [916]:
# Convert the Date index to datetime format
fedfunds_df.index = pd.to_datetime(fedfunds_df.index)

fedfunds_df.head()

Unnamed: 0_level_0,FEDFUNDS
DATE,Unnamed: 1_level_1
2014-04-01,0.09
2014-05-01,0.09
2014-06-01,0.1
2014-07-01,0.09
2014-08-01,0.09


In [917]:
# Resample the fed_funds_df to have daily frequency and forward fill the values
fedfunds_df_monthly = fedfunds_df.resample('D').ffill()

fedfunds_df_monthly.tail()

Unnamed: 0_level_0,FEDFUNDS
DATE,Unnamed: 1_level_1
2024-03-28,5.33
2024-03-29,5.33
2024-03-30,5.33
2024-03-31,5.33
2024-04-01,5.33


In [918]:
spy_index_df.head()

Unnamed: 0_level_0,Open,Spy_index,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,187.440002,188.5,187.179993,188.309998,157.372498,101508000
2014-05-01,188.220001,188.839996,187.729996,188.330002,157.389206,93019000
2014-05-02,188.309998,189.139999,187.779999,188.059998,157.163528,98122000
2014-05-05,187.139999,188.550003,186.619995,188.419998,157.464355,75883000
2014-05-06,188.0,188.130005,186.740005,186.779999,156.093781,85454000


In [919]:
fear_index_df.head()

Unnamed: 0_level_0,Open,Fear_index,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,14.05,14.18,13.34,13.41,13.41,0.0
2014-05-01,13.64,13.75,13.1,13.25,13.25,0.0
2014-05-02,13.15,13.5,12.83,12.91,12.91,0.0
2014-05-05,13.95,14.2,13.08,13.29,13.29,0.0
2014-05-06,13.65,13.9,13.28,13.8,13.8,0.0


In [920]:
# Display the DataFrame
apple_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400


In [921]:
# Concatinate two dataframes and add "High" column from ^VIX.csv as fear_index
concatenate_df = pd.concat([apple_df, fear_index_df['Fear_index']], axis=1)


In [922]:
concatenate_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Fear_index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9


In [923]:
concatenate_df_2 = pd.concat([concatenate_df, spy_index_df['Spy_index']], axis = 1)

concatenate_df_2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Fear_index,Spy_index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005


In [924]:
# Convert index of concatenate_df_2 to datetime
concatenate_df_2.index = pd.to_datetime(concatenate_df_2.index)

# Ensure both DataFrames have matching date ranges up to April 1, 2024
concatenate_df_2 = concatenate_df_2[concatenate_df_2.index <= '2024-04-01']

# Attempt the merge operation again
concatenated_df = pd.merge(concatenate_df_2, fedfunds_df_monthly, left_index=True, right_index=True)

# Display the merged DataFrame
concatenated_df.head()



Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,Spy_index,FEDFUNDS
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5,0.09
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996,0.09
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999,0.09
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003,0.09
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005,0.09


In [925]:
concatenated_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,Spy_index,FEDFUNDS
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5,0.09
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996,0.09
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999,0.09
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003,0.09
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005,0.09


In [926]:
# Drop all rows containing NaN values
concatenated_df = concatenated_df.dropna()

In [927]:
concatenated_df.tail()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,Spy_index,FEDFUNDS
2024-03-25,170.570007,171.940002,169.449997,170.850006,170.850006,54288300.0,13.67,520.950012,5.33
2024-03-26,170.0,171.419998,169.580002,169.710007,169.710007,57388400.0,13.43,521.580017,5.33
2024-03-27,170.410004,173.600006,170.110001,173.309998,173.309998,60273300.0,13.34,523.210022,5.33
2024-03-28,171.75,172.229996,170.509995,171.479996,171.479996,65672700.0,13.1,524.609985,5.33
2024-04-01,171.190002,171.25,169.479996,170.029999,170.029999,46240500.0,14.15,524.380005,5.33


In [928]:
columns_to_drop = ['Volume', 'Fear_index', 'Spy_index', 'FEDFUNDS']
concatenated_df = concatenated_df.drop(columns=columns_to_drop)
concatenated_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767


In [929]:
# Define features set
X = concatenated_df.copy()
X.drop("High",axis=1, inplace=True)
X.tail()



Unnamed: 0,Open,Low,Close,Adj Close
2024-03-25,170.570007,169.449997,170.850006,170.850006
2024-03-26,170.0,169.580002,169.710007,169.710007
2024-03-27,170.410004,170.110001,173.309998,173.309998
2024-03-28,171.75,170.509995,171.479996,171.479996
2024-04-01,171.190002,169.479996,170.029999,170.029999


In [930]:
# Define target vector
y = concatenated_df["High"]
y[:5]

2014-04-30    21.408215
2014-05-01    21.242857
2014-05-02    21.221430
2014-05-05    21.464287
2014-05-06    21.586071
Name: High, dtype: float64

In [931]:
date_cutoff = "2022-04-30"
X_train_df = X[X.index <= date_cutoff]
X_test_df = X[X.index > date_cutoff]

print("Train:", X_train_df.index.min(), X_train_df.index.max())
print("Test:", X_test_df.index.min(), X_test_df.index.max())

Train: 2014-04-30 00:00:00 2021-08-30 00:00:00
Test: 2021-08-31 00:00:00 2024-04-01 00:00:00


In [932]:
y_train_df = y[y.index <= date_cutoff]
y_test_df = y[y.index > date_cutoff]

print("Train:", y_train_df.index.min(), y_train_df.index.max())
print("Test:", y_test_df.index.min(), y_test_df.index.max())

Train: 2014-04-30 00:00:00 2021-08-30 00:00:00
Test: 2021-08-31 00:00:00 2024-04-01 00:00:00


In [933]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [934]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train_df)

In [935]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train_df)
X_test_scaled = X_scaler.transform(X_test_df)

In [936]:
# Create a Gradient Boosting classifier
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

In [937]:
# Fitting the model
gb_model.fit(X_train_scaled, y_train_df)

In [938]:
# Making predictions using the testing data
y_pred_gb_test = gb_model.predict(X_test_scaled)

In [939]:
# Making predictions using the training data
y_pred_gb_train = gb_model.predict(X_train_scaled)

In [940]:
mse_gb = mean_squared_error(y_test_df, y_pred_gb_test)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test_df, y_pred_gb_test)

print("Mean Squared Error (MSE) - Gradient Boosting:", mse_gb)
print("Root Mean Squared Error (RMSE) - Gradient Boosting:", rmse_gb)
print("R-squared (R2) Score - Gradient Boosting:", r2_gb)


Mean Squared Error (MSE) - Gradient Boosting: 430.7141464880051
Root Mean Squared Error (RMSE) - Gradient Boosting: 20.75365381054635
R-squared (R2) Score - Gradient Boosting: -0.4454178030193616


In [910]:
mse_gb = mean_squared_error(y_train_df, y_pred_gb_train)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_train_df, y_pred_gb_train)

print("Mean Squared Error (MSE) - Gradient Boosting:", mse_gb)
print("Root Mean Squared Error (RMSE) - Gradient Boosting:", rmse_gb)
print("R-squared (R2) Score - Gradient Boosting:", r2_gb)

Mean Squared Error (MSE) - Gradient Boosting: 0.1082738264542952
Root Mean Squared Error (RMSE) - Gradient Boosting: 0.3290498844465611
R-squared (R2) Score - Gradient Boosting: 0.999911542590199


In [864]:
model = GradientBoostingRegressor() 

In [865]:
# Assuming X_train and y_train are your training data
scores = cross_val_score(model, X_train_df, y_train_df, cv=5, scoring='neg_mean_squared_error')
average_mse = -scores.mean()
print("Average Cross-Validation MSE:", average_mse)

Average Cross-Validation MSE: 389.3182840493495


In [866]:
# Assuming you have X_train, y_train, X_val, y_val, X_test, y_test
# Train the model on the training set
model.fit(X_train_df, y_train_df)

# Tune hyperparameters using the validation set (not shown here)

# Evaluate final performance on the test set
test_predictions = model.predict(X_test_df)
test_mse = mean_squared_error(y_test_df, test_predictions)
print("Test MSE:", test_mse)


Test MSE: 674.8790397764058


In [867]:
# Assuming model is your trained Gradient Boosting model
feature_importances = model.feature_importances_
feature_names = X_train_df.columns
# Pair feature names with their importance scores
feature_importance_dict = dict(zip(feature_names, feature_importances))
# Sort features by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance:
Adj Close: 0.40977720375840065
Open: 0.4095931103623743
Close: 0.10132422092487231
Low: 0.07930546495435274


In [754]:
# Define the model with different hyperparameters
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3)

# Fit the model to your data
model.fit(X_train_df, y_train_df)

# Get the feature importances
feature_importances = model.feature_importances_

# Pair feature names with their importance scores
feature_names = X_train_df.columns
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Sort features by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print feature importance
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance:
Open: 0.4039024124800167
Adj Close: 0.38356527169777055
Close: 0.1324181114704385
Low: 0.07994294772940685
Spy_index: 0.00013720500998363872
Volume: 1.916147745936206e-05
Fear_index: 1.4411510762847608e-05
FEDFUNDS: 4.786241615289605e-07


In [757]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees in the ensemble
    'learning_rate': [0.01, 0.001, 0.1, 0.15, 0.2],  # Learning rate (shrinkage) of each tree
    #'max_depth': [3, 4, 5],  # Maximum depth of each tree
    #'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    #'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be a leaf node
    #'max_features': ['sqrt', 'log2', None]  # Number of features to consider when looking for the best split
}

# Create a GradientBoostingRegressor model
model = GradientBoostingRegressor()

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=1)

# Fit the grid search to your data
grid_search.fit(X_train_df, y_train_df)

# Get the best model found by grid search
best_model = grid_search.best_estimator_

# Evaluate R-squared score on the training set
train_predictions = best_model.predict(X_train_df)
train_r2 = r2_score(y_train_df, train_predictions)

# Evaluate R-squared score on the test set
test_predictions = best_model.predict(X_test_df)
test_r2 = r2_score(y_test_df, test_predictions)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)
print("R-squared score on the training set:", train_r2)
print("R-squared score on the test set:", test_r2)

Best Hyperparameters: {'learning_rate': 0.15, 'n_estimators': 600}
R-squared score on the training set: 0.9999898574777165
R-squared score on the test set: 0.936558578043949


In [758]:
# Given numbers
x = train_r2
y = test_r2

# Calculate the difference
difference = max(x, y) - min(x, y)

# Calculate the percentage difference
percentage_difference = (difference / min(x, y)) * 100

# Round the percentage difference to two decimal places
percentage_difference = round(percentage_difference, 3)

print("Percentage difference:", str(percentage_difference) + "%")


Percentage difference: 6.773%
