In [1]:
import pandas as pd
import yfinance as yf
import statsmodels.api as sm

### Download and Preprocess Data from Yahoo Finance

In [6]:
data = yf.download("^GSPC", start='1990-01-01', end='2023-9-15')

[*********************100%%**********************]  1 of 1 completed


In [7]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990-01-02,353.399994,359.690002,351.980011,359.690002,359.690002,162070000
1990-01-03,359.690002,360.589996,357.890015,358.760010,358.760010,192330000
1990-01-04,358.760010,358.760010,352.890015,355.670013,355.670013,177000000
1990-01-05,355.670013,355.670013,351.350006,352.200012,352.200012,158530000
1990-01-08,352.200012,354.239990,350.540009,353.790009,353.790009,140110000
...,...,...,...,...,...,...
2023-09-08,4451.299805,4473.529785,4448.379883,4457.490234,4457.490234,3259290000
2023-09-11,4480.979980,4490.770020,4467.890137,4487.459961,4487.459961,3369920000
2023-09-12,4473.270020,4487.109863,4456.830078,4461.899902,4461.899902,3435740000
2023-09-13,4462.649902,4479.390137,4453.520020,4467.439941,4467.439941,3529430000


In [8]:
df = data['Adj Close'].pct_change() * 100

In [10]:
df = df.rename("Today")

In [12]:
df = df.reset_index()

In [13]:
df

Unnamed: 0,Date,Today
0,1990-01-02,
1,1990-01-03,-0.258554
2,1990-01-04,-0.861299
3,1990-01-05,-0.975624
4,1990-01-08,0.451447
...,...,...
8486,2023-09-08,0.142662
8487,2023-09-11,0.672345
8488,2023-09-12,-0.569589
8489,2023-09-13,0.124163


In [14]:
for i in range(1,6): 
    df['Lag '+str(i)] = df['Today'].shift(i)

In [15]:
df

Unnamed: 0,Date,Today,Lag 1,Lag 2,Lag 3,Lag 4,Lag 5
0,1990-01-02,,,,,,
1,1990-01-03,-0.258554,,,,,
2,1990-01-04,-0.861299,-0.258554,,,,
3,1990-01-05,-0.975624,-0.861299,-0.258554,,,
4,1990-01-08,0.451447,-0.975624,-0.861299,-0.258554,,
...,...,...,...,...,...,...,...
8486,2023-09-08,0.142662,-0.321127,-0.697160,-0.419418,0.179913,-0.159694
8487,2023-09-11,0.672345,0.142662,-0.321127,-0.697160,-0.419418,0.179913
8488,2023-09-12,-0.569589,0.672345,0.142662,-0.321127,-0.697160,-0.419418
8489,2023-09-13,0.124163,-0.569589,0.672345,0.142662,-0.321127,-0.697160


In [16]:
df['Volume'] = data.Volume.shift(1).values/1000_000_000

In [17]:
df = df.dropna()

In [19]:
df = sm.add_constant(df)

In [22]:
df['Direction'] = [1 if i > 0 else 0 for i in df['Today']]

In [23]:
X = df[['const','Lag 1','Lag 2','Lag 3','Lag 4','Lag 5','Volume']]

In [24]:
y = df.Direction

### Create Logistic Regression Model

In [25]:
model = sm.Logit(y,X)

In [26]:
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.689595
         Iterations 4


In [27]:
result.summary()

0,1,2,3
Dep. Variable:,Direction,No. Observations:,8485.0
Model:,Logit,Df Residuals:,8478.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 15 Sep 2023",Pseudo R-squ.:,0.001699
Time:,13:24:07,Log-Likelihood:,-5851.2
converged:,True,LL-Null:,-5861.2
Covariance Type:,nonrobust,LLR p-value:,0.00287

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.1298,0.036,3.573,0.000,0.059,0.201
Lag 1,-0.0784,0.019,-4.060,0.000,-0.116,-0.041
Lag 2,-0.0211,0.019,-1.101,0.271,-0.059,0.017
Lag 3,-0.0195,0.019,-1.013,0.311,-0.057,0.018
Lag 4,-0.0140,0.019,-0.728,0.467,-0.052,0.024
Lag 5,-0.0233,0.019,-1.214,0.225,-0.061,0.014
Volume,0.0059,0.012,0.499,0.618,-0.017,0.029


In [28]:
prediction = result.predict(X)

In [29]:
prediction

6       0.562340
7       0.557845
8       0.539044
9       0.583168
10      0.569590
          ...   
8486    0.550152
8487    0.539887
8488    0.529886
8489    0.549443
8490    0.536252
Length: 8485, dtype: float64

### Test Results

In [33]:
def confusion_matrix(act,pred):
    predtrans = ['Up' if i > 0.5 else 'Down' for i in pred]
    actuals = ['Up' if i > 0 else 'Down' for i in act]
    confusion_matrix = pd.crosstab(pd.Series(actuals),
                                  pd.Series(predtrans),
                                  rownames = ['Acutal'],
                                  colnames = ['Predicted'])
    return confusion_matrix

In [34]:
confusion_matrix(y,prediction)

Predicted,Down,Up
Acutal,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,204,3746
Up,179,4356


In [35]:
len(df)

8485

In [36]:
(204 + 4356) / 8585

0.5311589982527665

In [37]:
x_train = df[df.Date.dt.year < 2023][['const','Lag 1','Lag 2','Lag 3','Lag 4','Lag 5','Volume']]
y_train = df[df.Date.dt.year < 2023]['Direction']
x_test = df[df.Date.dt.year == 2023][['const','Lag 1','Lag 2','Lag 3','Lag 4','Lag 5','Volume']]
y_test = df[df.Date.dt.year == 2023]['Direction']

In [38]:
model = sm.Logit(y_train, x_train)

In [39]:
result = model.fit() 

Optimization terminated successfully.
         Current function value: 0.689520
         Iterations 4


In [41]:
prediction = result.predict(x_test)

In [42]:
confusion_matrix(y_test, prediction)

Predicted,Down,Up
Acutal,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,1,82
Up,1,92


In [44]:
(1+92) / len(x_test)

0.5284090909090909