In [392]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, mean_squared_error
%matplotlib inline

In [393]:
AAPL_path = Path("../files/AAPL.csv")
VIX_path = Path("../files/^VIX.csv")
FEDFUNDS_path = Path("../files/FEDFUNDS (1).csv")
SPY_index = Path("../files/SPY.csv")

In [394]:
from finta import TA

Load sample data or replace it with your own data
data = pd.read_csv(AAPL_path)  # Replace 'sample_data.csv' with your data file

Calculate moving average (MA)
data['MA'] = TA.SMA(data, 20)  # 20-period Simple Moving Average

Calculate Relative Strength Index (RSI)
data['RSI'] = TA.RSI(data, 14)  # 14-period RSI

Calculate Bollinger Bands
data['BB_UPPER'], data['BB_MIDDLE'], data['BB_LOWER'] = TA.BBANDS(data, 20, 2)  # 20-period Bollinger Bands with 2 standard deviations

data.tail()

In [395]:
# Read the CSV file into a DataFrame
apple_df = pd.read_csv(AAPL_path, index_col = "Date")
apple_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400


In [396]:
fear_index_df = pd.read_csv(VIX_path, index_col = "Date")

# Rename the "High" column to "Fear_index"
fear_index_df = fear_index_df.rename(columns={"High": "Fear_index"})
fear_index_df.head()

Unnamed: 0_level_0,Open,Fear_index,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,14.05,14.18,13.34,13.41,13.41,0.0
2014-05-01,13.64,13.75,13.1,13.25,13.25,0.0
2014-05-02,13.15,13.5,12.83,12.91,12.91,0.0
2014-05-05,13.95,14.2,13.08,13.29,13.29,0.0
2014-05-06,13.65,13.9,13.28,13.8,13.8,0.0


In [397]:
spy_index_df = pd.read_csv(SPY_index)


spy_index_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2014-04-30,187.440002,188.5,187.179993,188.309998,157.372498,101508000
1,2014-05-01,188.220001,188.839996,187.729996,188.330002,157.389206,93019000
2,2014-05-02,188.309998,189.139999,187.779999,188.059998,157.163528,98122000
3,2014-05-05,187.139999,188.550003,186.619995,188.419998,157.464355,75883000
4,2014-05-06,188.0,188.130005,186.740005,186.779999,156.093781,85454000


In [398]:
spy_index_df = pd.read_csv(SPY_index, index_col = "Date")

spy_index_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,187.440002,188.5,187.179993,188.309998,157.372498,101508000
2014-05-01,188.220001,188.839996,187.729996,188.330002,157.389206,93019000
2014-05-02,188.309998,189.139999,187.779999,188.059998,157.163528,98122000
2014-05-05,187.139999,188.550003,186.619995,188.419998,157.464355,75883000
2014-05-06,188.0,188.130005,186.740005,186.779999,156.093781,85454000


In [399]:
# Convert the Date index to datetime format
spy_index_df.index = pd.to_datetime(spy_index_df.index)

# Display the DataFrame
spy_index_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,187.440002,188.5,187.179993,188.309998,157.372498,101508000
2014-05-01,188.220001,188.839996,187.729996,188.330002,157.389206,93019000
2014-05-02,188.309998,189.139999,187.779999,188.059998,157.163528,98122000
2014-05-05,187.139999,188.550003,186.619995,188.419998,157.464355,75883000
2014-05-06,188.0,188.130005,186.740005,186.779999,156.093781,85454000


In [400]:
fedfunds_df = pd.read_csv(FEDFUNDS_path, index_col = "DATE")

fedfunds_df.head()

Unnamed: 0_level_0,FEDFUNDS
DATE,Unnamed: 1_level_1
2014-04-01,0.09
2014-05-01,0.09
2014-06-01,0.1
2014-07-01,0.09
2014-08-01,0.09


In [401]:
# Convert the Date index to datetime format
fedfunds_df.index = pd.to_datetime(fedfunds_df.index)

fedfunds_df.head()

Unnamed: 0_level_0,FEDFUNDS
DATE,Unnamed: 1_level_1
2014-04-01,0.09
2014-05-01,0.09
2014-06-01,0.1
2014-07-01,0.09
2014-08-01,0.09


In [402]:
# Resample the fed_funds_df to have daily frequency and forward fill the values
fedfunds_df_monthly = fedfunds_df.resample('D').ffill()

fedfunds_df_monthly.tail()

Unnamed: 0_level_0,FEDFUNDS
DATE,Unnamed: 1_level_1
2024-03-28,5.33
2024-03-29,5.33
2024-03-30,5.33
2024-03-31,5.33
2024-04-01,5.33


In [403]:
# Rename the "high" column to "SPY_index"
spy_index_df.rename(columns={"High": "SPY_index"}, inplace=True)

# Display the DataFrame
spy_index_df.head()

Unnamed: 0_level_0,Open,SPY_index,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,187.440002,188.5,187.179993,188.309998,157.372498,101508000
2014-05-01,188.220001,188.839996,187.729996,188.330002,157.389206,93019000
2014-05-02,188.309998,189.139999,187.779999,188.059998,157.163528,98122000
2014-05-05,187.139999,188.550003,186.619995,188.419998,157.464355,75883000
2014-05-06,188.0,188.130005,186.740005,186.779999,156.093781,85454000


In [404]:
fear_index_df.head()

Unnamed: 0_level_0,Open,Fear_index,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,14.05,14.18,13.34,13.41,13.41,0.0
2014-05-01,13.64,13.75,13.1,13.25,13.25,0.0
2014-05-02,13.15,13.5,12.83,12.91,12.91,0.0
2014-05-05,13.95,14.2,13.08,13.29,13.29,0.0
2014-05-06,13.65,13.9,13.28,13.8,13.8,0.0


In [405]:
# Display the DataFrame
apple_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400


In [406]:
# Concatinate two dataframes and add "High" column from ^VIX.csv as fear_index
concatenate_df = pd.concat([apple_df, fear_index_df['Fear_index']], axis=1)


In [407]:
concatenate_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Fear_index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9


In [408]:
# Convert index of concatenate_df to DatetimeIndex
concatenate_df.index = pd.to_datetime(concatenate_df.index)

# Now, concatenate the DataFrames
concatenate_df_2 = pd.concat([concatenate_df, spy_index_df['SPY_index']], axis=1)

# Print the resulting DataFrame
concatenate_df_2.head()


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005


In [409]:
# Convert index of concatenate_df_2 to datetime
concatenate_df_2.index = pd.to_datetime(concatenate_df_2.index)

# Ensure both DataFrames have matching date ranges up to April 1, 2024
concatenate_df_2 = concatenate_df_2[concatenate_df_2.index <= '2024-04-01']

# Attempt the merge operation again
concatenated_df = pd.merge(concatenate_df_2, fedfunds_df_monthly, left_index=True, right_index=True)

# Display the merged DataFrame
concatenated_df.head()



Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5,0.09
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996,0.09
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999,0.09
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003,0.09
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005,0.09


In [410]:
concatenated_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5,0.09
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996,0.09
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999,0.09
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003,0.09
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005,0.09


In [411]:
# Drop all rows containing NaN values
concatenated_df = concatenated_df.dropna()

In [418]:
concatenated_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5,0.09,21.242857
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996,0.09,21.22143
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999,0.09,21.464287
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003,0.09,21.586071
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005,0.09,21.331785


In [419]:
#columns_to_drop = ['Volume', 'Fear_index', 'SPY_index', 'FEDFUNDS']
#concatenated_df = concatenated_df.drop(columns=columns_to_drop)
#concatenated_df.head()

In [420]:
# Shift the target variable (e.g., Close price) backward by one time step
concatenated_df['Target'] = concatenated_df['High'].shift(-1)

# Drop the last row to remove NaN values created by shifting
concatenated_df = concatenated_df.dropna()
concatenated_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,14.18,188.5,0.09,21.242857
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996,0.09,21.22143
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999,0.09,21.464287
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003,0.09,21.586071
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005,0.09,21.331785


In [421]:
# Define features set
X = concatenated_df.copy()
X.drop("High",axis=1, inplace=True)
X.head()



Unnamed: 0,Open,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target
2014-04-30,21.165714,21.064285,21.074642,18.562872,456640800.0,14.18,188.5,0.09,21.242857
2014-05-01,21.142857,20.941429,21.124287,18.60659,244048000.0,13.75,188.839996,0.09,21.22143
2014-05-02,21.155001,21.061071,21.16357,18.641197,191514400.0,13.5,189.139999,0.09,21.464287
2014-05-05,21.076429,21.071428,21.462856,18.90481,287067200.0,14.2,188.550003,0.09,21.586071
2014-05-06,21.492857,21.22893,21.22893,18.698767,374564400.0,13.9,188.130005,0.09,21.331785


In [422]:
# Define target vector
y = concatenated_df["High"]
y[:5]

2014-04-30    21.408215
2014-05-01    21.242857
2014-05-02    21.221430
2014-05-05    21.464287
2014-05-06    21.586071
Name: High, dtype: float64

In [423]:
date_cutoff = "2022-04-30"
X_train_df = X[X.index <= date_cutoff]
X_test_df = X[X.index > date_cutoff]

print("Train:", X_train_df.index.min(), X_train_df.index.max())
print("Test:", X_test_df.index.min(), X_test_df.index.max())

Train: 2014-04-30 00:00:00 2022-04-29 00:00:00
Test: 2022-05-02 00:00:00 2024-03-26 00:00:00


In [424]:
y_train_df = y[y.index <= date_cutoff]
y_test_df = y[y.index > date_cutoff]

print("Train:", y_train_df.index.min(), y_train_df.index.max())
print("Test:", y_test_df.index.min(), y_test_df.index.max())

Train: 2014-04-30 00:00:00 2022-04-29 00:00:00
Test: 2022-05-02 00:00:00 2024-03-26 00:00:00


In [425]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [426]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train_df)

In [427]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train_df)
X_test_scaled = X_scaler.transform(X_test_df)

In [428]:
# Create a Gradient Boosting classifier
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

In [429]:
# Fitting the model
gb_model.fit(X_train_scaled, y_train_df)

In [430]:
# Making predictions using the testing data
y_pred_gb_test = gb_model.predict(X_test_scaled)

In [431]:
# Making predictions using the training data
y_pred_gb_train = gb_model.predict(X_train_scaled)

In [432]:
mse_gb = mean_squared_error(y_test_df, y_pred_gb_test)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test_df, y_pred_gb_test)

print("Mean Squared Error (MSE) - Gradient Boosting:", mse_gb)
print("Root Mean Squared Error (RMSE) - Gradient Boosting:", rmse_gb)
print("R-squared (R2) Score - Gradient Boosting:", r2_gb)


Mean Squared Error (MSE) - Gradient Boosting: 25.59614825593343
Root Mean Squared Error (RMSE) - Gradient Boosting: 5.059263608069204
R-squared (R2) Score - Gradient Boosting: 0.9274469431136902


In [433]:
mse_gb = mean_squared_error(y_train_df, y_pred_gb_train)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_train_df, y_pred_gb_train)

print("Mean Squared Error (MSE) - Gradient Boosting:", mse_gb)
print("Root Mean Squared Error (RMSE) - Gradient Boosting:", rmse_gb)
print("R-squared (R2) Score - Gradient Boosting:", r2_gb)

Mean Squared Error (MSE) - Gradient Boosting: 0.11741018053793571
Root Mean Squared Error (RMSE) - Gradient Boosting: 0.3426516898220928
R-squared (R2) Score - Gradient Boosting: 0.9999425845373463


In [434]:
model = GradientBoostingRegressor() 

In [435]:
# Assuming X_train and y_train are your training data
scores = cross_val_score(model, X_train_df, y_train_df, cv=5, scoring='neg_mean_squared_error')
average_mse = -scores.mean()
print("Average Cross-Validation MSE:", average_mse)

Average Cross-Validation MSE: 84.56013485745837


In [436]:
# Assuming you have X_train, y_train, X_val, y_val, X_test, y_test
# Train the model on the training set
model.fit(X_train_df, y_train_df)

# Tune hyperparameters using the validation set (not shown here)

# Evaluate final performance on the test set
test_predictions = model.predict(X_test_df)
test_mse = mean_squared_error(y_test_df, test_predictions)
print("Test MSE:", test_mse)


Test MSE: 25.8016748113033


In [437]:
# Assuming model is your trained Gradient Boosting model
feature_importances = model.feature_importances_
feature_names = X_train_df.columns
# Pair feature names with their importance scores
feature_importance_dict = dict(zip(feature_names, feature_importances))
# Sort features by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance:
Open: 0.4023459156527525
Adj Close: 0.33772007182240044
Target: 0.16277434758679243
Low: 0.06000565206645414
Close: 0.0370316500956707
SPY_index: 9.93473568379525e-05
Volume: 1.3200422489753949e-05
Fear_index: 9.595814897682057e-06
FEDFUNDS: 2.1918170433756618e-07


In [438]:
# Define the model with different hyperparameters
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3)

# Fit the model to your data
model.fit(X_train_df, y_train_df)

# Get the feature importances
feature_importances = model.feature_importances_

# Pair feature names with their importance scores
feature_names = X_train_df.columns
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Sort features by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print feature importance
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance:
Open: 0.40196878506304007
Adj Close: 0.3366158417133607
Target: 0.16176668231809307
Low: 0.05976972826933317
Close: 0.03974823420478444
SPY_index: 0.00010223266353077294
Volume: 1.5291202051961815e-05
Fear_index: 1.255828211011939e-05
FEDFUNDS: 6.46283695708792e-07


In [439]:
# Define the date cutoff for splitting the data
date_cutoff = "2022-04-30"

# Split the data into training and test sets based on the date cutoff
X_train_df = X[X.index <= date_cutoff]
X_test_df = X[X.index > date_cutoff]
y_train_df = y[y.index <= date_cutoff]
y_test_df = y[y.index > date_cutoff]

In [440]:
# Define a custom scorer using mean squared error
def custom_scorer(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return -mse  # Negative because GridSearchCV maximizes the scorer

In [441]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [250, 300, 350],  # Reduce the number of estimators
    'learning_rate': [0.05, 0.1, 0.15],  # Adjust the learning rate
    #'max_depth': [3, 5, 7],  # Limit the depth of the trees
    #'min_samples_split': [2, 5, 10],  # Increase the minimum samples required to split a node
    #'min_samples_leaf': [1, 2, 4]  # Increase the minimum samples required to be a leaf node
}

# Create a GradientBoostingRegressor model
model = GradientBoostingRegressor()

# Define GridSearchCV with the custom scorer
grid_search = GridSearchCV(model, param_grid, scoring = make_scorer(custom_scorer, greater_is_better=True), cv=10, verbose=1, n_jobs=-1)

# Fit the grid search to your training data
grid_search.fit(X_train_df, y_train_df)

# Get the best model found by grid search
best_model = grid_search.best_estimator_

# Evaluate R-squared score on the training set
train_predictions = best_model.predict(X_train_df)
train_r2 = r2_score(y_train_df, train_predictions)

# Evaluate R-squared score on the test set
test_predictions = best_model.predict(X_test_df)
test_r2 = r2_score(y_test_df, test_predictions)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)
print("R-squared score on the training set:", train_r2)
print("R-squared score on the test set:", test_r2)

# If training R-squared is below 95%, try another approach or modify hyperparameters
if train_r2 < 0.95:
    print("Warning: Training R-squared is below 95%. Consider adjusting hyperparameters or using a different approach.")

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best Hyperparameters: {'learning_rate': 0.05, 'n_estimators': 350}
R-squared score on the training set: 0.9999742737744736
R-squared score on the test set: 0.9359526338357356


In [381]:
# Given numbers
x = train_r2
y = test_r2

# Calculate the difference
difference = max(x, y) - min(x, y)

# Calculate the percentage difference
percentage_difference = (difference / min(x, y)) * 100

# Round the percentage difference to two decimal places
percentage_difference = round(percentage_difference, 3)

print("Percentage difference:", str(percentage_difference) + "%")


Percentage difference: 4.912%
