In [2]:
# importing useful libraries
import pandas as pd
import yfinance as yf

In [3]:
#importing SNP dataframe
data = yf.download('^GSPC', start = '2023-11-14', end='2024-11-15')
data.to_csv('snp500_one_year.csv')

[*********************100%***********************]  1 of 1 completed


In [4]:
# display dataframe information
print(data.shape)

(253, 6)


In [5]:
# display tail of df
print(data.tail())

Price                        Adj Close        Close         High          Low  \
Ticker                           ^GSPC        ^GSPC        ^GSPC        ^GSPC   
Date                                                                            
2024-11-08 00:00:00+00:00  5995.540039  5995.540039  6012.450195  5976.759766   
2024-11-11 00:00:00+00:00  6001.350098  6001.350098  6017.310059  5986.689941   
2024-11-12 00:00:00+00:00  5983.990234  5983.990234  6009.919922  5960.080078   
2024-11-13 00:00:00+00:00  5985.379883  5985.379883  6008.189941  5965.910156   
2024-11-14 00:00:00+00:00  5949.169922  5949.169922  5993.879883  5942.279785   

Price                             Open      Volume  
Ticker                           ^GSPC       ^GSPC  
Date                                                
2024-11-08 00:00:00+00:00  5976.759766  4666740000  
2024-11-11 00:00:00+00:00  6008.859863  4333000000  
2024-11-12 00:00:00+00:00  6003.600098  4243400000  
2024-11-13 00:00:00+00:00  5985.7

In [6]:
# Get rid of non useful columns
data = data.drop(columns = ["Open","High","Low","Close"])

In [7]:
# display the columns
data.columns

MultiIndex([('Adj Close', '^GSPC'),
            (   'Volume', '^GSPC')],
           names=['Price', 'Ticker'])

In [8]:
# modify the indexes
data = data.reset_index()
data = data[['Date', 'Adj Close', 'Volume']]

In [9]:
# change the date format
data['Date'] = pd.to_datetime(data['Date']).dt.date

In [10]:
# display the dataset columns
data.columns

MultiIndex([(     'Date',      ''),
            ('Adj Close', '^GSPC'),
            (   'Volume', '^GSPC')],
           names=['Price', 'Ticker'])

In [11]:
# reset the columns of the dataset
data.columns = ['Date', 'Adj Close', 'Volume']

In [12]:
# reset the index of the dataset
data.set_index('Date', inplace=True)

In [13]:
# adding the returns to be able to calculate other indicators (vol...)
data['Returns'] = data['Adj Close'].pct_change()

In [14]:
# import the VIX data from yfinance
vix_data = yf.download('^VIX', start='2023-11-14', end='2024-11-15')

[*********************100%***********************]  1 of 1 completed


In [15]:
# display the vix dataset
print(vix_data)

Price                     Adj Close  Close   High    Low   Open Volume
Ticker                         ^VIX   ^VIX   ^VIX   ^VIX   ^VIX   ^VIX
Date                                                                  
2023-11-14 00:00:00+00:00     14.16  14.16  14.86  13.91  14.83      0
2023-11-15 00:00:00+00:00     14.18  14.18  14.35  13.97  14.21      0
2023-11-16 00:00:00+00:00     14.32  14.32  14.42  13.68  14.12      0
2023-11-17 00:00:00+00:00     13.80  13.80  14.19  13.67  14.18      0
2023-11-20 00:00:00+00:00     13.41  13.41  14.31  13.39  14.26      0
...                             ...    ...    ...    ...    ...    ...
2024-11-08 00:00:00+00:00     14.94  14.94  15.33  14.66  15.13      0
2024-11-11 00:00:00+00:00     14.97  14.97  15.56  14.89  15.33      0
2024-11-12 00:00:00+00:00     14.71  14.71  15.37  14.69  15.09      0
2024-11-13 00:00:00+00:00     14.02  14.02  15.26  13.77  15.09      0
2024-11-14 00:00:00+00:00     14.31  14.31  14.32  13.59  14.17      0

[253 

In [16]:
# get rid of the useless columns
vix_data.drop(columns=["Close","High","Low","Open","Volume"])

Price,Adj Close
Ticker,^VIX
Date,Unnamed: 1_level_2
2023-11-14 00:00:00+00:00,14.16
2023-11-15 00:00:00+00:00,14.18
2023-11-16 00:00:00+00:00,14.32
2023-11-17 00:00:00+00:00,13.80
2023-11-20 00:00:00+00:00,13.41
...,...
2024-11-08 00:00:00+00:00,14.94
2024-11-11 00:00:00+00:00,14.97
2024-11-12 00:00:00+00:00,14.71
2024-11-13 00:00:00+00:00,14.02


In [17]:
# have the same format as our main dataset to merge it correctly
vix_data = vix_data.reset_index()
vix_data = vix_data[['Date', 'Adj Close']]

In [18]:
# change the format date
vix_data['Date'] = pd.to_datetime(vix_data['Date']).dt.date

In [19]:
# put date as index
vix_data.set_index('Date', inplace=True)

In [20]:
# only want one column
vix_data.columns = ['Adj Close']

In [21]:
#merging main dataset and vix dataset
data = pd.merge(data,vix_data,on="Date",how="left")

In [22]:
# rename the columns of the main dataset
data = data.rename(columns={
    'Adj Close_x': 'S&P 500 Adj Close Price',
    'Adj Close_y': 'VIX Adj Close Price',
    'Volume': 'Volume S&P 500'
})

In [23]:
# import the data from yf
tnx_data = yf.download('^TNX', start='2023-11-14', end='2024-11-15')
fed_data = yf.download('^IRX', start='2023-11-14', end='2024-11-15')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [24]:
# get rid of the useless columns
fed_data.drop(columns=["Close","High","Low","Open","Volume"])
tnx_data.drop(columns=["Close","High","Low","Open","Volume"])

Price,Adj Close
Ticker,^TNX
Date,Unnamed: 1_level_2
2023-11-14 00:00:00+00:00,4.441
2023-11-15 00:00:00+00:00,4.535
2023-11-16 00:00:00+00:00,4.445
2023-11-17 00:00:00+00:00,4.441
2023-11-20 00:00:00+00:00,4.422
...,...
2024-11-08 00:00:00+00:00,4.306
2024-11-11 00:00:00+00:00,4.308
2024-11-12 00:00:00+00:00,4.432
2024-11-13 00:00:00+00:00,4.451


In [25]:
# get rid of the weird index format
tnx_data = tnx_data.reset_index()
fed_data = fed_data.reset_index()

In [26]:
# keep only useful columns
tnx_data = tnx_data[['Date', 'Close']]
fed_data = fed_data[['Date', 'Close']]

In [27]:
# change the date format
tnx_data['Date'] = pd.to_datetime(tnx_data['Date']).dt.date
fed_data['Date'] = pd.to_datetime(fed_data['Date']).dt.date

In [28]:
# 
tnx_data.set_index('Date', inplace=True)
fed_data.set_index('Date', inplace=True)

In [29]:
# only want one column
tnx_data.columns = ['Adj Close']
fed_data.columns = ['Adj Close']

In [30]:
fed_data.tail()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2024-11-08,4.423
2024-11-11,4.415
2024-11-12,4.423
2024-11-13,4.39
2024-11-14,4.383


In [31]:
# merge main dataset and fed_data
data=pd.merge(data,fed_data,on="Date",how="left")

In [32]:
# rename the column in the main dataset
data = data.rename(columns={
    'Adj Close': 'FED Rates',
})

In [33]:
# merge main dataset and tnx_data
data = pd.merge(data,tnx_data,on="Date",how="left")

In [34]:
# rename the column in the main dataset
data = data.rename(columns={
    'Adj Close': '10 Y Treasury Rates',
})

In [35]:
#adding Volume Weighted Average Price
data["VWAP"] = (data['S&P 500 Adj Close Price'] * data['Volume S&P 500']).cumsum() / data['Volume S&P 500'].cumsum()

In [36]:
#adding spread between 10 Y Treasury Bunds Rates and FED Rates
data['Spread_10Y_Fed'] = data['10 Y Treasury Rates'] - data['FED Rates']

In [37]:
#adding SMA (Simple Moving Average) and EMA (Exponential Moving Average)
data['SMA_10'] = data['S&P 500 Adj Close Price'].rolling(window=10).mean()
data['EMA_10'] = data['S&P 500 Adj Close Price'].ewm(span=10).mean()

In [38]:
# adding Bollinger bounds on 10 last days (volatility measure)
rolling_mean = data['S&P 500 Adj Close Price'].rolling(window=10).mean()
rolling_std = data['S&P 500 Adj Close Price'].rolling(window=10).std()
data['Bollinger_Upper'] = rolling_mean + 2 * rolling_std
data['Bollinger_Lower'] = rolling_mean - 2 * rolling_std

In [39]:
# adding sharpe ration on 10 last days
rolling_returns = data['Returns'].rolling(window=10)
data['Sharpe_10'] = rolling_returns.mean() / rolling_returns.std()

In [40]:
# get rid of the 10 first rows
data = data.iloc[10:]

In [41]:
# display head of data
data.tail()

Unnamed: 0_level_0,S&P 500 Adj Close Price,Volume S&P 500,Returns,VIX Adj Close Price,FED Rates,10 Y Treasury Rates,VWAP,Spread_10Y_Fed,SMA_10,EMA_10,Bollinger_Upper,Bollinger_Lower,Sharpe_10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2024-11-08,5995.540039,4666740000,0.003757,14.94,4.423,4.306,5238.392205,-0.117,5829.748975,5863.04224,6040.308074,5619.189875,0.286892
2024-11-11,6001.350098,4333000000,0.000969,14.97,4.415,4.308,5241.778179,-0.107,5847.531982,5888.189124,6084.175027,5610.888938,0.271452
2024-11-12,5983.990234,4243400000,-0.002893,14.71,4.423,4.432,5244.990013,0.009,5862.639014,5905.607507,6113.968777,5611.30925,0.228504
2024-11-13,5985.379883,4220180000,0.000232,14.02,4.39,4.451,5248.162774,0.061,5879.81001,5920.111576,6139.591012,5620.029008,0.262744
2024-11-14,5949.169922,4184570000,-0.00605,14.31,4.383,4.418,5251.128819,0.035,5904.181982,5925.394911,6135.423465,5672.9405,0.462806


In [42]:
# Check for the NaN values in my dataset
print(data.isna().sum())

S&P 500 Adj Close Price    0
Volume S&P 500             0
Returns                    0
VIX Adj Close Price        0
FED Rates                  0
10 Y Treasury Rates        0
VWAP                       0
Spread_10Y_Fed             0
SMA_10                     0
EMA_10                     0
Bollinger_Upper            0
Bollinger_Lower            0
Sharpe_10                  0
dtype: int64


In [43]:
# understand the nature of the data
data.dtypes 

S&P 500 Adj Close Price    float64
Volume S&P 500               int64
Returns                    float64
VIX Adj Close Price        float64
FED Rates                  float64
10 Y Treasury Rates        float64
VWAP                       float64
Spread_10Y_Fed             float64
SMA_10                     float64
EMA_10                     float64
Bollinger_Upper            float64
Bollinger_Lower            float64
Sharpe_10                  float64
dtype: object

In [44]:
# we want to erase the time dependence and set the index as just a counter of rows
data.reset_index(inplace =True)

In [45]:
data.head()

Unnamed: 0,Date,S&P 500 Adj Close Price,Volume S&P 500,Returns,VIX Adj Close Price,FED Rates,10 Y Treasury Rates,VWAP,Spread_10Y_Fed,SMA_10,EMA_10,Bollinger_Upper,Bollinger_Lower,Sharpe_10
0,2023-11-29,4550.580078,4418760000,-0.000946,12.98,5.24,4.271,4531.129087,-0.969,4538.257031,4545.01058,4581.367254,4495.146809,0.430481
1,2023-11-30,4567.799805,5399300000,0.003784,12.92,5.238,4.352,4535.486806,-0.886,4544.749023,4549.563825,4583.515439,4505.982608,0.487993
2,2023-12-01,4594.629883,4397120000,0.005874,12.63,5.215,4.226,4540.705434,-0.989,4553.387988,4558.408906,4594.431434,4512.344543,0.584993
3,2023-12-04,4569.779785,4369910000,-0.005409,13.08,5.223,4.288,4543.049452,-0.935,4558.963965,4560.608868,4590.220671,4527.707258,0.308879
4,2023-12-05,4567.180176,3909950000,-0.000569,12.85,5.243,4.171,4544.673019,-1.072,4560.943994,4561.865594,4591.438575,4530.449413,0.129937


In [46]:
# import libraries needed to implement our new categorical variables that relate to the dates

import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar

In [47]:
# adding categorical variables based on dates (month, weekday, if it's a bank holiday) to break free from time dependency

data['Month'] = pd.to_datetime(data['Date']).dt.month
data['Weekday'] = pd.to_datetime(data['Date']).dt.dayofweek  # Monday=0, Sunday=6
data['Is_Bank_Holiday'] = pd.to_datetime(data['Date']).isin(USFederalHolidayCalendar().holidays())

In [48]:
#adding categorical variable if it is a vacation

data['Is_Vacation'] = pd.to_datetime(data['Date']).apply(
    lambda x: (x.month == 12 and x.day >= 24) or (x.month == 1 and x.day <= 2)
)

In [49]:
# converting boolean values into numerical values

data['Is_Bank_Holiday'] = data['Is_Bank_Holiday'].astype(int)
data['Is_Vacation'] = data['Is_Vacation'].astype(int)

In [50]:
# get rid of the date column to get rid of time dependency

data.drop(columns=['Date'], inplace=True)

In [51]:
data.head(20)

Unnamed: 0,S&P 500 Adj Close Price,Volume S&P 500,Returns,VIX Adj Close Price,FED Rates,10 Y Treasury Rates,VWAP,Spread_10Y_Fed,SMA_10,EMA_10,Bollinger_Upper,Bollinger_Lower,Sharpe_10,Month,Weekday,Is_Bank_Holiday,Is_Vacation
0,4550.580078,4418760000,-0.000946,12.98,5.24,4.271,4531.129087,-0.969,4538.257031,4545.01058,4581.367254,4495.146809,0.430481,11,2,0,0
1,4567.799805,5399300000,0.003784,12.92,5.238,4.352,4535.486806,-0.886,4544.749023,4549.563825,4583.515439,4505.982608,0.487993,11,3,0,0
2,4594.629883,4397120000,0.005874,12.63,5.215,4.226,4540.705434,-0.989,4553.387988,4558.408906,4594.431434,4512.344543,0.584993,12,4,0,0
3,4569.779785,4369910000,-0.005409,13.08,5.223,4.288,4543.049452,-0.935,4558.963965,4560.608868,4590.220671,4527.707258,0.308879,12,0,0,0
4,4567.180176,3909950000,-0.000569,12.85,5.243,4.171,4544.673019,-1.072,4560.943994,4561.865594,4591.438575,4530.449413,0.129937,12,1,0,0
5,4549.339844,4245680000,-0.003906,12.97,5.243,4.121,4544.99076,-1.122,4562.058984,4559.492484,4589.520442,4534.597527,0.070106,12,2,0,0
6,4585.589844,3818880000,0.007968,13.06,5.233,4.129,4547.333604,-1.104,4564.955957,4564.399361,4595.774363,4534.137552,0.152691,12,3,0,0
7,4604.370117,3707010000,0.004095,12.35,5.233,4.245,4550.359103,-0.988,4569.458984,4571.868404,4608.651696,4530.266272,0.228357,12,4,0,0
8,4622.439941,3823210000,0.003924,12.63,5.233,4.239,4554.097938,-0.994,4576.659961,4581.270907,4625.57031,4527.749612,0.36756,12,0,0,0
9,4643.700195,3808380000,0.004599,12.07,5.248,4.206,4558.500125,-1.042,4585.540967,4592.830588,4647.416272,4523.665661,0.44198,12,1,0,0


In [52]:
# check for NaN values

print(data.isnull().sum())

S&P 500 Adj Close Price    0
Volume S&P 500             0
Returns                    0
VIX Adj Close Price        0
FED Rates                  0
10 Y Treasury Rates        0
VWAP                       0
Spread_10Y_Fed             0
SMA_10                     0
EMA_10                     0
Bollinger_Upper            0
Bollinger_Lower            0
Sharpe_10                  0
Month                      0
Weekday                    0
Is_Bank_Holiday            0
Is_Vacation                0
dtype: int64


In [53]:
# import library to standardize variables

from sklearn.preprocessing import StandardScaler

In [54]:
# list of columns to standardize

cols_to_standardize = [
    "S&P 500 Adj Close Price",
    "Volume S&P 500",
    "VIX Adj Close Price",
    "FED Rates",
    "10 Y Treasury Rates",
    "VWAP",
    "Spread_10Y_Fed",
    "SMA_10",
    "EMA_10",
    "Bollinger_Upper",
    "Bollinger_Lower"
]

In [55]:
# scaler initialization

scaler = StandardScaler()

In [56]:
# applying standardization to my dataset

data[cols_to_standardize] = scaler.fit_transform(data[cols_to_standardize])

In [57]:
data.head()

Unnamed: 0,S&P 500 Adj Close Price,Volume S&P 500,Returns,VIX Adj Close Price,FED Rates,10 Y Treasury Rates,VWAP,Spread_10Y_Fed,SMA_10,EMA_10,Bollinger_Upper,Bollinger_Lower,Sharpe_10,Month,Weekday,Is_Bank_Holiday,Is_Vacation
0,-2.036408,0.65542,-0.000946,-0.70486,0.632686,0.396402,-2.029308,-0.254951,-2.012436,-1.999142,-2.07387,-1.905749,0.430481,11,2,0,0
1,-1.988846,1.95273,0.003784,-0.722814,0.625569,0.718112,-2.007473,0.016387,-1.994376,-1.986453,-2.068166,-1.874731,0.487993,11,3,0,0
2,-1.914739,0.626789,0.005874,-0.809591,0.543724,0.217675,-1.981324,-0.320335,-1.970343,-1.961802,-2.039182,-1.85652,0.584993,12,4,0,0
3,-1.983377,0.590789,-0.005409,-0.674936,0.572192,0.463922,-1.969579,-0.143801,-1.954831,-1.955671,-2.050362,-1.812544,0.308879,12,0,0,0
4,-1.990557,-0.017764,-0.000569,-0.74376,0.643362,-0.000768,-1.961444,-0.591672,-1.949323,-1.952169,-2.047128,-1.804694,0.129937,12,1,0,0


In [58]:
# importing useful libraries to perform Random Forest Model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [59]:
# Preparing variables for model

X = data.drop(columns=["S&P 500 Adj Close Price"])  
y = data["S&P 500 Adj Close Price"]

In [60]:
#preparing model

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
#initializing random forest model

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [62]:
# training model

rf_model.fit(X_train, y_train)

In [63]:
# prediciting on test variable

y_pred = rf_model.predict(X_test)

In [64]:
# evaluating performances

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")

RMSE: 0.11764995808509143
R²: 0.9881859515015347


In [65]:
# example of prediction

print("Some predictions :")
for i in range(5):
    print(f"True : {y_test.iloc[i]}, Predicted : {y_pred[i]}")

Some predictions :
True : -1.6549656923567797, Predicted : -1.4761641101391216
True : -1.9397082860489097, Predicted : -1.9167609695835999
True : 0.9558245822603009, Predicted : 0.7994877366876771
True : 1.1534230948862723, Predicted : 1.441258099771467
True : 1.9546712127803993, Predicted : 1.831972090392627


In [66]:
# import libraries to compute good regression metrics

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import scipy.stats as stats

In [67]:
# computing right metrics to evaluate the predictions and thus the model

residuals = y_test - y_pred
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
z_scores = stats.zscore(residuals)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")
print(f"Average of the residual Z-scores: {np.mean(z_scores)}")
print(f"Standard error the the residual Z-scores: {np.std(z_scores)}")

RMSE: 0.11764995808509143
MAE: 0.08218960659977396
R²: 0.9881859515015347
Average of the residual Z-scores: 0.0
Standard error the the residual Z-scores: 0.9999999999999998


Understanding what we're doing, criticizing our job and then trying to define what we're going to do next.

Here our RandomForestRegression Model provides a very high R² which leads me to worry about overfitting so I could think of testing it on another dataset or even think about some methods like Ridge or Lasso.

Also, my model assumes that the relationships between features are constant in time which is a strong assumption and could be solved by using other variables or models like neural network.

Moreover, extrapolation could be difficult because the model focuses on past data but it is norma since it is the only data we can base ourselves on. One solution could be to test the model on realistic future periods or even use more dynamic models that ajust the model parameters with newer data.

Talking about optimizing the parameters, we're now going to try and use GridCV to think about and try to choose the right parameters for our model.

In [68]:
# trying to find out what the best parameters are for our random forest regression model

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor()

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f"Best parameters : {grid_search.best_params_}")
print(f"Best score : {-grid_search.best_score_}")

Best parameters : {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best score : 0.011880130561495731


Now let's think about the core strategy of my project. 

Here I am trying to predict the future price of the S&P 500 index. Here Thus, I've used a Random Forest which is a model that we've studied in class but here I used the regression way instead of the Random Forest Classification because what I have here is a regression problem becuse of what I'm trying to predict. 

An idea would be to try to think about implementing other models we've seen in class by changing the aim of the project and turning it into a classification problem, at leat for the sake of the stage 2, in order to use models we've studied in class. Here, what I would be trying to predict would be if the S&P 500 stock woul rather go up or down. 

In further stages I'll then be able to comeback to more difficult and deep regression models such as SVMs, XGBoost... by coming back to my initial regression problem.

Alright, as mentioned above I'm going to change my regression into a classification problem and try to implement first a simple model such as Logistic Regression to see the outputs and results I get to see what I'll try to do next to enhance my results.

In [69]:
# creating a categorical variable to tell if the price of the stock will go up or down
data['Target'] = (data['Returns'] > 0).astype(int)

In [70]:
# checking the class distribution
print(data['Target'].value_counts())

Target
1    141
0    102
Name: count, dtype: int64


In [71]:
# setting the variables for the model implementation
features = [
    'S&P 500 Adj Close Price', 'Volume S&P 500', 'VIX Adj Close Price',
    'FED Rates', '10 Y Treasury Rates', 'VWAP', 'Spread_10Y_Fed',
    'SMA_10', 'EMA_10', 'Bollinger_Upper', 'Bollinger_Lower',
    'Sharpe_10', 'Month', 'Weekday', 'Is_Bank_Holiday', 'Is_Vacation'
]
X = data[features]
y = data['Target']

In [72]:
# separating the model and train sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [73]:
# implementing the model and printing the results
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression Precision:", precision_score(y_test, y_pred))
print("Logistic Regression Recall:", recall_score(y_test, y_pred))
print("Logistic Regression F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.673469387755102
Logistic Regression Precision: 0.8125
Logistic Regression Recall: 0.7222222222222222
Logistic Regression F1 Score: 0.7647058823529411
              precision    recall  f1-score   support

           0       0.41      0.54      0.47        13
           1       0.81      0.72      0.76        36

    accuracy                           0.67        49
   macro avg       0.61      0.63      0.62        49
weighted avg       0.71      0.67      0.69        49



Ok so here we printed the results of our first model after having transformed it into a classification problem. The model used here was a rather simple one to begin. I chose to use a Logistic Regression model which assumes linear relationship between features and the probability to belong to a certain class. The problem here is that the model doesn't catch the complex relationships between features. We can easily see it by looking at the accuracy score that there is a lot of room for improvement. The precision score shows that the model is good for the positive class (price rises) but struggles predicting for the negative class surely because the model doesn't capture complex enough relationships. The low recall score also points out that the model still fails to miss some elements from the positive class. Thus, I will try to enhance my results by implementing a bagging and then a boosting model.

In [74]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=5),
    n_estimators=10,
    random_state=42
)
bagging_model.fit(X_train, y_train)

y_pred_bagging = bagging_model.predict(X_test)

print(classification_report(y_test, y_pred_bagging))
print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))
print("Bagging Precision:", precision_score(y_test, y_pred_bagging))
print("Bagging Recall:", recall_score(y_test, y_pred_bagging))
print("Bagging F1 Score:", f1_score(y_test, y_pred_bagging))

              precision    recall  f1-score   support

           0       0.33      0.46      0.39        13
           1       0.77      0.67      0.72        36

    accuracy                           0.61        49
   macro avg       0.55      0.56      0.55        49
weighted avg       0.66      0.61      0.63        49

Bagging Accuracy: 0.6122448979591837
Bagging Precision: 0.7741935483870968
Bagging Recall: 0.6666666666666666
Bagging F1 Score: 0.7164179104477612


Here we understand that the bagging model, even though it is able to capture more complex relationships, seems less appropriate to this dataset. Here the problem might come from an imbalance between classes, maybe because of trees not deep enough. One issue might also be overfitting and that would be why Logistic Regression performs better because it doesn't overadjust sub-samples.

In [76]:
from sklearn.ensemble import AdaBoostClassifier

boosting_model = AdaBoostClassifier(
    n_estimators=50,
    random_state=42
)
boosting_model.fit(X_train, y_train)

y_pred_boosting = boosting_model.predict(X_test)
print(classification_report(y_test, y_pred_boosting))
print("Boosting Accuracy:", accuracy_score(y_test, y_pred_boosting))
print("Boosting Precision:", precision_score(y_test, y_pred_boosting))
print("Boosting Recall:", recall_score(y_test, y_pred_boosting))
print("Boosting F1 Score:", f1_score(y_test, y_pred_boosting))

              precision    recall  f1-score   support

           0       0.35      0.54      0.42        13
           1       0.79      0.64      0.71        36

    accuracy                           0.61        49
   macro avg       0.57      0.59      0.57        49
weighted avg       0.68      0.61      0.63        49

Boosting Accuracy: 0.6122448979591837
Boosting Precision: 0.7931034482758621
Boosting Recall: 0.6388888888888888
Boosting F1 Score: 0.7076923076923077




Okay so let's try to understand why the boosting model isn't capturing well the relationships between features. Among those models, Logistic Regression is the more adapted but is limited for the minority class. The problem of bagging is that it doesn't provide any significative improvement and is less performing for the minority class. The boosting model shows clear improvements for the minority class thanks to the bias and error reduction provided by this type of model. One idea for the next improvements would be to try to improve the boosting parameters and hyperparameters to see how the results evolve. Another idea would be to try to improve the Logistic Regression by also changing the parameters and adjsuting the weights of the under represented class. Also, I could also think of implementing other advanced models like XGBoost or GradientBoosting. 