In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

stock = pd.read_csv('StockMarket.csv', sep=',', header=0)
stock['intercept'] = 1.0


In [2]:
"""
Question 1.1: find summary statistic
"""
stock.describe(include='all').T


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Year,1250,,,,2003.02,1.40902,2001.0,2002.0,2003.0,2004.0,2005.0
Lag1,1250,,,,0.0038344,1.1363,-4.922,-0.6395,0.039,0.59675,5.733
Lag2,1250,,,,0.0039192,1.13628,-4.922,-0.6395,0.039,0.59675,5.733
Lag3,1250,,,,0.001716,1.1387,-4.922,-0.64,0.0385,0.59675,5.733
Lag4,1250,,,,0.001636,1.13877,-4.922,-0.64,0.0385,0.59675,5.733
Lag5,1250,,,,0.0056096,1.14755,-4.922,-0.64,0.0385,0.597,5.733
Volume,1250,,,,1.47831,0.360357,0.35607,1.2574,1.42295,1.64167,3.15247
Today,1250,,,,0.0031384,1.13633,-4.922,-0.6395,0.0385,0.59675,5.733
Direction,1250,2.0,Up,648.0,,,,,,,
intercept,1250,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [3]:
""" 
Q 1.2: find all of the pairwaise correlations in the data set. 
Can you find significant correlations between variables?
"""
stock.corr(method='pearson')

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,intercept
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095,
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155,
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025,
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448,
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069,
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486,
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592,
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0,
intercept,,,,,,,,,


In [4]:
"""
Question 2.1: Try to fit a linear regression model in order to predict 
Today using other variables (excluding Direction).   
Explain the regression result. 
How can you interpret coefficients, their p-values, and R-squared? 

Use OLS function to fit a linear regression model
X is a DataFrame (or numpy array) containing exogenous (independent) variables.
Y is a Series (or numpy array) of dependent variable.
"""
# y = [insert variables] + intercept
X=stock[['Year','Lag1','Lag2','Lag3','Lag4','Lag5','Volume','intercept']]
y=stock[['Today']]
X = sm.add_constant(X)


# fit model
model = sm.OLS(y, X).fit()

# evaluate model
predictions = model.predict(X) 
print_model = model.summary()
print_model


0,1,2,3
Dep. Variable:,Today,R-squared:,0.003
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.5715
Date:,"Tue, 14 Sep 2021",Prob (F-statistic):,0.78
Time:,22:38:41,Log-Likelihood:,-1930.9
No. Observations:,1250,AIC:,3878.0
Df Residuals:,1242,BIC:,3919.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,0.0291,0.027,1.062,0.289,-0.025,0.083
Lag1,-0.0276,0.028,-0.974,0.330,-0.083,0.028
Lag2,-0.0127,0.028,-0.447,0.655,-0.069,0.043
Lag3,-0.0054,0.028,-0.191,0.849,-0.061,0.050
Lag4,-0.0098,0.028,-0.344,0.731,-0.066,0.046
Lag5,-0.0363,0.028,-1.290,0.197,-0.091,0.019
Volume,-0.0182,0.107,-0.169,0.865,-0.228,0.192
intercept,-58.1705,54.727,-1.063,0.288,-165.537,49.196

0,1,2,3
Omnibus:,83.164,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,302.241
Skew:,0.215,Prob(JB):,2.3400000000000002e-66
Kurtosis:,5.37,Cond. No.,3410000.0


In [5]:
""" 
Question 2.2: . Try to fit a linear regression model in order to predict 
Direction using other variables (excluding Today).  Note that you need to 
convert the factor data – Direction into numeric data type. 
Explain the regression result. How can you interpret coefficients, 
their p-values, and adjusted R-squared?
"""
# Convert Direction data into categorical codes and include intercept dummy
Xs=stock[['Year','Lag1','Lag2','Lag3','Lag4','Lag5','Volume','intercept']]
Xs = sm.add_constant(Xs)
y = stock['Direction'].astype('category').cat.codes

model2 = sm.OLS(y, Xs).fit()
predictions = model2.predict(Xs) 
print_model = model2.summary()
print_model

0,1,2,3
Dep. Variable:,y,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,1.529
Date:,"Tue, 14 Sep 2021",Prob (F-statistic):,0.153
Time:,22:38:42,Log-Likelihood:,-901.03
No. Observations:,1250,AIC:,1818.0
Df Residuals:,1242,BIC:,1859.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,0.0320,0.012,2.667,0.008,0.008,0.056
Lag1,-0.0185,0.012,-1.490,0.136,-0.043,0.006
Lag2,-0.0128,0.012,-1.023,0.307,-0.037,0.012
Lag3,0.0004,0.012,0.032,0.975,-0.024,0.025
Lag4,-0.0003,0.012,-0.021,0.983,-0.025,0.024
Lag5,0.0008,0.012,0.065,0.948,-0.023,0.025
Volume,-0.0349,0.047,-0.743,0.458,-0.127,0.057
intercept,-63.5673,24.009,-2.648,0.008,-110.670,-16.464

0,1,2,3
Omnibus:,4726.666,Durbin-Watson:,2.116
Prob(Omnibus):,0.0,Jarque-Bera (JB):,201.216
Skew:,-0.072,Prob(JB):,2.03e-44
Kurtosis:,1.04,Cond. No.,3410000.0


In [6]:
"""
Try to fit a linear regression model in order to predict Direction 
using Lag1, and Lag2. Explain the regression result. How can you interpret
coefficients, their p-values, and adjusted R-squared?
"""
Xd = stock[['Lag1','Lag2']]
Xd = sm.add_constant(Xd)
y = stock['Direction'].astype('category').cat.codes

model = sm.OLS(y, Xd).fit()
predictions = model.predict(Xd) 
print_model = model.summary()
print_model


0,1,2,3
Dep. Variable:,y,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.383
Date:,"Tue, 14 Sep 2021",Prob (F-statistic):,0.251
Time:,22:38:43,Log-Likelihood:,-905.01
No. Observations:,1250,AIC:,1816.0
Df Residuals:,1247,BIC:,1831.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5185,0.014,36.685,0.000,0.491,0.546
Lag1,-0.0178,0.012,-1.428,0.153,-0.042,0.007
Lag2,-0.0111,0.012,-0.889,0.374,-0.035,0.013

0,1,2,3
Omnibus:,4653.347,Durbin-Watson:,2.106
Prob(Omnibus):,0.0,Jarque-Bera (JB):,206.488
Skew:,-0.074,Prob(JB):,1.45e-45
Kurtosis:,1.014,Cond. No.,1.15


In [7]:
"""
Q 3-1. Try to fit a logistic regression model in order to predict 
Direction using other variables (excluding Today).  
Explain the regression result. How can you interpret coefficients and 
their p-values?
"""

stock['intercept'] = 1

Xi = stock[['Year','Lag1','Lag2','Lag3','Lag4','Lag5','Volume', 'intercept']]
y = stock['Direction'].astype('category').cat.codes


logit_model = sm.Logit(y, Xi).fit()
print_model = logit_model.summary()
print_model

Optimization terminated successfully.
         Current function value: 0.688186
         Iterations 5


0,1,2,3
Dep. Variable:,y,No. Observations:,1250.0
Model:,Logit,Df Residuals:,1242.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 14 Sep 2021",Pseudo R-squ.:,0.006187
Time:,22:38:43,Log-Likelihood:,-860.23
converged:,True,LL-Null:,-865.59
Covariance Type:,nonrobust,LLR p-value:,0.1518

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Year,0.1290,0.049,2.660,0.008,0.034,0.224
Lag1,-0.0749,0.050,-1.490,0.136,-0.173,0.024
Lag2,-0.0515,0.050,-1.024,0.306,-0.150,0.047
Lag3,0.0016,0.050,0.032,0.974,-0.097,0.100
Lag4,-0.0011,0.050,-0.022,0.982,-0.100,0.097
Lag5,0.0033,0.050,0.066,0.947,-0.094,0.101
Volume,-0.1413,0.190,-0.745,0.456,-0.513,0.230
intercept,-258.1635,96.995,-2.662,0.008,-448.269,-68.058


In [8]:
"""
Try to fit a logistic regression model in order to predict Direction 
using Lag1, and Lag2. Explain the regression result. How can you interpret
coefficients and their p-values?
"""

X = stock[['Lag1','Lag2','intercept']]
y = stock['Direction'].astype('category').cat.codes

logit_model1 = sm.Logit(y,X).fit()
logit_model1.summary()


Optimization terminated successfully.
         Current function value: 0.691361
         Iterations 4


0,1,2,3
Dep. Variable:,y,No. Observations:,1250.0
Model:,Logit,Df Residuals:,1247.0
Method:,MLE,Df Model:,2.0
Date:,"Tue, 14 Sep 2021",Pseudo R-squ.:,0.001601
Time:,22:38:44,Log-Likelihood:,-864.2
converged:,True,LL-Null:,-865.59
Covariance Type:,nonrobust,LLR p-value:,0.2502

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Lag1,-0.0715,0.050,-1.427,0.153,-0.170,0.027
Lag2,-0.0445,0.050,-0.890,0.374,-0.142,0.054
intercept,0.0742,0.057,1.310,0.190,-0.037,0.185


In [9]:
"""
Q 3-3.  Predict the probability that the market will go up, given values 
of the predictors. Then, in order to make a prediction as to whether the 
market will go up or down on a particular day, we must convert these 
predicted probabilities into class labels, Up or Down. So create a new 
vector of predictions based on whether the predicted probability of a 
market increase is greater than or less than 0.5. Then, tabulate the 
prediction vector to determine how many observations were correctly or 
incorrectly classified. What does this table imply about predictions of 
the logistic regression model? 
"""

X=stock[['Lag1','Lag2','intercept']]
y = stock['Direction'].astype('category').cat.codes
logit_model3 = sm.Logit(y, X).fit()


pred_prob = logit_model3.predict(X)
X.loc[:,'prediction']=0
X.loc[pred_prob > 0.41,'prediction']=1
pd.crosstab(y,X['prediction'],rownames =['actual'],colnames=['predicted'])


Optimization terminated successfully.
         Current function value: 0.691361
         Iterations 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,602
1,1,647


In [10]:
"""
Q 3-4. The previous result in Q3-3 is misleading because we trained and 
tested the model on the same set of 1,250 observations. In order to better
assess the accuracy of the logistic regression, we can fit the model using
part of the data, and then examine how well it predicts the held out data. Let’s use data before 2005 for training and compute predictions for 2005.
Now fit a logistic regression model using training data set, using the 
subset argument.Obtain predicted probabilities on the validation data set,
that is for days in 2005. Tabulate the prediction vector to determine how
many observations were correctly or incorrectly classified. How can you 
interpret the prediction accuracy?

"""
# split data
y=stock['Direction'].astype('category').cat.codes
X=stock[['Lag1','Lag2','intercept']]

X_train = X[stock['Year'] < 2005]
X_test = X[stock['Year'] == 2005]
Y_train = y[stock['Year'] < 2005]
Y_test = y[stock['Year'] == 2005]

# fit model
logit_model4 = sm.Logit(Y_train, X_train).fit()

# evaluate model
pred_prob = logit_model4.predict(X_test)
X_test.loc[:,'prediction'] = 0
X_test.loc[pred_prob > 0.5, 'prediction'] = 1
pd.crosstab(Y_test,X_test['prediction'],rownames=['actual'],colnames=['prediction'])

Optimization terminated successfully.
         Current function value: 0.692085
         Iterations 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,35,76
1,35,106


In [11]:
"""
Q 3-5 
Repeat Q3-4 varying the threshold (other than 0.5) to determine 
predictions based on whether the predicted probability of a market 
increase is greater than or less than the threshold.
"""

# split data
y=stock['Direction'].astype('category').cat.codes
X=stock[['Lag1','Lag2','intercept']]

X_train = X[stock['Year'] < 2005]
X_test = X[stock['Year'] == 2005]
Y_train = y[stock['Year'] < 2005]
Y_test = y[stock['Year'] == 2005]

# fit model
logit_model4 = sm.Logit(Y_train, X_train).fit()

# evaluate model
pred_prob = logit_model4.predict(X_test)
X_test.loc[:,'prediction'] = 0
X_test.loc[pred_prob > 0.47, 'prediction'] = 1
pd.crosstab(Y_test,X_test['prediction'],rownames=['actual'],colnames=['prediction'])

Optimization terminated successfully.
         Current function value: 0.692085
         Iterations 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


prediction,1
actual,Unnamed: 1_level_1
0,111
1,141
