In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
data = pd.read_csv("BANKNIFTYFUTALL.csv")
feature_cols_JUL = ['trade', 'event', 'ob', 'spreadreturn','btrade','bevent','bspreadeurn','bindex','niftytrade','niftyidevent', 'niftyidob','niftyidspreadreturn','niftyidindex']

data.drop([data.index[457897]],inplace=True)
data.isnull().sum(axis = 0)

noofticks               0
timestamp               0
lotsize                 0
asset                   0
instrumentname          0
bid                     0
ask                     0
spreadbps               0
tradeinn                0
trade                   0
event                   0
ob                      0
spreadreturn            0
btrade                  0
bevent                  0
bob                     0
bspreadeurn             0
bindex                  0
niftytrade              0
niftyidevent            0
niftyidob               0
niftyidspreadreturn     0
niftyidindex            0
niftyidbidaskt          0
bbidaskt                0
bspreadreturn.tValue    0
mid                     0
shiftmid200             0
shiftmid400             0
shiftmid600             0
shiftmid800             0
return200               0
return400               0
return600               0
return800               0
dtype: int64

##### Performing Linear Regression

In [5]:
X= data[feature_cols_JUL]
Y = data.return800
reg= LinearRegression()
reg.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [6]:
r_sq = reg.score(X,Y)
r_sq

0.003509212358210667

In [7]:
data["predict"] = reg.predict(X)
print(data[['predict','return800']].corr())

            predict  return800
predict    1.000000   0.059239
return800  0.059239   1.000000


In [8]:
Coef = pd.Series(reg.coef_,index=feature_cols_JUL)
print(Coef)

trade                  0.080230
event                  0.071107
ob                     0.128572
spreadreturn           0.085134
btrade                -0.016945
bevent                 0.088479
bspreadeurn           -0.048780
bindex                 0.007328
niftytrade             0.004392
niftyidevent           0.046049
niftyidob             -0.013798
niftyidspreadreturn    0.000364
niftyidindex           0.019068
dtype: float64


##### Performing Linear Regression by splitting the data into train and test set

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.2,random_state=88)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((366317, 13), (91580, 13), (366317,), (91580,))

In [10]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train,y_train)
reg.intercept_

0.03495223161830502

In [11]:
reg.coef_

array([ 0.07868083,  0.07191404,  0.1331837 ,  0.08280525, -0.01827936,
        0.09151793, -0.05455947,  0.00935661,  0.00204851,  0.04559824,
       -0.01199224,  0.00504971,  0.02110671])

In [12]:
from sklearn.metrics import r2_score,mean_squared_error
y_pred = reg.predict(x_test)
mse=mean_squared_error(y_test,y_pred)
rmse= np.sqrt(mse)
rmse
r2_score(y_test,y_pred)

0.003136894795351619

In [13]:
reg.score(x_test,y_test)

0.003136894795351619

In [14]:
reg.score(x_train,y_train)

0.003595254580356033

In [16]:
from statsmodels.formula.api import ols
reg1 = ols(formula="return800~trade+event+ob+spreadreturn+btrade+bevent+bspreadeurn+bindex+niftytrade+niftyidevent+niftyidob+niftyidspreadreturn+niftyidindex ",data=data)
fit1= reg1.fit()
print(fit1.summary())

                            OLS Regression Results                            
Dep. Variable:              return800   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     124.0
Date:                Mon, 28 Sep 2020   Prob (F-statistic):               0.00
Time:                        15:05:41   Log-Likelihood:            -1.3048e+06
No. Observations:              457897   AIC:                         2.610e+06
Df Residuals:                  457883   BIC:                         2.610e+06
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               0.0296    

#### From the above Regression Results we can conclude that:

- p-value for "bindex", 'niftytrade', niftyidob' and "niftyidspreadreturn" is greater than 0.05(signigicance level)
- Observing the confidence interval range we can see that for bindex", 'niftytrade', niftyidob' and "niftyidspreadreturn" zero comes in between which also states that these columns are not giving us best results
- So to make our model better we try to remove those columns and observe the results accordingly
- Hence, we are performing Forward Inclusion and omiiting the columns which are not giving us best results



In [31]:
import statsmodels.formula.api as smf
def forward_inclusion(data,input_col,response):
    remaining=set(input_col)
    selected=[]
    lower=[]
    for col in remaining:
        formula = "{} ~ {} ".format(response,' + '.join(selected + [col]))
        selected.append(col)
        score = smf.ols(formula, data).fit().pvalues
        #score.drop(['Intercept'])
        score2=list(score)
        for i in score2:
            if(i>0.05):
                var = score[score == i].index[0]
                lower.append(var)
    final=set(selected)-set(lower)
    formula = "{} ~ {} ".format(response,' + '.join(final))
    model = smf.ols(formula, data).fit()
    return model

In [32]:
model = forward_inclusion(data,feature_cols_JUL,'return800')
print(model.model.formula)

return800 ~ niftyidindex + event + spreadreturn + ob + bevent + trade + niftyidevent + btrade 


In [33]:
from statsmodels.formula.api import ols
reg1 = ols(formula=model.model.formula,data=data)
fit1= reg1.fit()
print(fit1.summary())

                            OLS Regression Results                            
Dep. Variable:              return800   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     196.4
Date:                Mon, 28 Sep 2020   Prob (F-statistic):               0.00
Time:                        15:29:56   Log-Likelihood:            -1.3048e+06
No. Observations:              457897   AIC:                         2.610e+06
Df Residuals:                  457888   BIC:                         2.610e+06
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.0248      0.008      2.958   

##### From tha above OLS Regression resuts we can also conclude that overall Significance(F-Statistic) of the model has increased

Performing Linear Regression for updated columns


In [34]:
feature_cols_JUL = ['trade', 'event', 'ob', 'bevent','btrade','spreadreturn', 'niftyidevent','niftyidindex']
X= data[feature_cols_JUL]
Y = data.return800

In [35]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.2,random_state=88)
x_train.shape,x_test.shape,y_train.shape,y_test.shape


((366317, 8), (91580, 8), (366317,), (91580,))

In [36]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [37]:
reg.score(x_test,y_test)

0.003127725956926341

In [38]:
reg.score(x_train,y_train)

0.0034872472944075246

In [39]:
data["predict_after_elim"] = reg.predict(X)
print(data[['predict_after_elim','return800']].corr())

                    predict_after_elim  return800
predict_after_elim            1.000000   0.058443
return800                     0.058443   1.000000


In [40]:
#Performing Regression on X and Y 
reg2= LinearRegression()
reg2.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [41]:
r_sq = reg2.score(X,Y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.0034188210916376027


In [42]:
data["predict2_after_elim"] = reg2.predict(X)
print(data[['predict2_after_elim','return800']].corr())

                     predict2_after_elim  return800
predict2_after_elim             1.000000   0.058471
return800                       0.058471   1.000000


##### Performing Backward Elimination 
Backward Elimination consists of the following steps:

- Select a significance level to stay in the model (eg. SL = 0.05)
- Fit the model with all possible predictors
- Consider the predictor with the highest P-value. If P>SL, remove the predictor.
- Fit the model without this variable and repeat the step 3 until the condition becomes false.


In [43]:
#Backward elimination
import statsmodels.formula.api as smf
def backward_elimination(data,input_col,response):
    selected=list(input_col)
    lower=[]
    formula = "{} ~ {} ".format(response,' + '.join(selected))
    score = smf.ols(formula, data).fit().pvalues
        #score.drop(['Intercept'])
    score2=list(score)
    for i in score2:
        if(i>0.05):
            var = score[score == i].index[0]
            lower.append(var)
    final=set(selected)-set(lower)
    formula = "{} ~ {} ".format(response,' + '.join(final))
    model = smf.ols(formula, data).fit()
    return model

##### Calling the Backward elimination method which performs backward elimination and returns the columns which will produce better results.

In [44]:
feature_cols_JUL = ['trade', 'event', 'ob', 'bevent','btrade','spreadreturn', 'niftyidevent','niftyidindex']
model = backward_elimination(data,feature_cols_JUL,'return800')
print(model.model.formula)

return800 ~ event + spreadreturn + niftyidindex + ob + bevent + trade + niftyidevent + btrade 


##### Conclusion : We got the same columns by performing Backward elimination as we got in Forward Inclusion

#### Fitting the model for higher returns only

In [46]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
data = pd.read_csv("BANKNIFTYFUTALL.csv")
data.drop([data.index[457897]],inplace=True)
x=data.return800

newpoints= data.index[data['return800'] >= np.percentile(x,75)].tolist()
df= data.iloc[newpoints,:]

feature_cols_JUL = ['trade', 'event', 'ob', 'spreadreturn','btrade','bevent','bspreadeurn','bindex','niftytrade','niftyidevent', 'niftyidob','niftyidspreadreturn','niftyidindex']

X= df[feature_cols_JUL]
Y = df.return800
reg= LinearRegression()
reg.fit(X,Y)
r_sq = reg.score(X,Y)
print("rsquare: ",r_sq)

rsquare:  0.04443068029821461


In [47]:
df["predict"] = reg.predict(X)
print(df[['predict','return800']].corr())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


            predict  return800
predict    1.000000   0.210786
return800  0.210786   1.000000


In [49]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
data = pd.read_csv("BANKNIFTYFUTALL.csv")
data.drop([data.index[457897]],inplace=True)
x=data.return800

newpoints= data.index[data['return800'] <= np.percentile(x,25)].tolist()
df= data.iloc[newpoints,:]


feature_cols_JUL = ['trade', 'event', 'ob', 'spreadreturn','btrade','bevent','bspreadeurn','bindex','niftytrade','niftyidevent', 'niftyidob','niftyidspreadreturn','niftyidindex']

X= df[feature_cols_JUL]
Y = df.return800
reg= LinearRegression()
reg.fit(X,Y)
r_sq = reg.score(X,Y)
print("r_square :",r_sq)

r_square : 0.04792163129190674
