In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
data = pd.read_csv("BANKNIFTYFUTALL.csv")
feature_cols_JUN = ['trade', 'event', 'ob', 'spreadreturn','niftytrade','niftyidevent', 'niftyidob','niftyidspreadreturn','niftyidindex']

#data.dropna(subset=['return800'], axis=0)

data.drop([data.index[457897]],inplace=True)
data.isnull().sum(axis = 0)


noofticks               0
timestamp               0
lotsize                 0
asset                   0
instrumentname          0
bid                     0
ask                     0
spreadbps               0
tradeinn                0
trade                   0
event                   0
ob                      0
spreadreturn            0
btrade                  0
bevent                  0
bob                     0
bspreadeurn             0
bindex                  0
niftytrade              0
niftyidevent            0
niftyidob               0
niftyidspreadreturn     0
niftyidindex            0
niftyidbidaskt          0
bbidaskt                0
bspreadreturn.tValue    0
mid                     0
shiftmid200             0
shiftmid400             0
shiftmid600             0
shiftmid800             0
return200               0
return400               0
return600               0
return800               0
dtype: int64

#### Performing Linear Regression 

In [2]:
X= data[feature_cols_JUN]
Y = data.return800
reg= LinearRegression()
reg.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [3]:
print(reg)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


In [4]:
r_sq = reg.score(X,Y)
r_sq

0.003197200419108648

In [5]:
data["predict"] = reg.predict(X)
print(data[['predict','return800']].corr())

            predict  return800
predict    1.000000   0.056544
return800  0.056544   1.000000


In [6]:
Coef = pd.Series(reg.coef_,index=feature_cols_JUN)
print(Coef)

trade                  0.073048
event                  0.136368
ob                     0.121636
spreadreturn           0.053351
niftytrade             0.008547
niftyidevent           0.055097
niftyidob             -0.017960
niftyidspreadreturn    0.002271
niftyidindex           0.021552
dtype: float64


#### Performing Linear Regression by splitting the data into train and test set

In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.2,random_state=88)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((366317, 9), (91580, 9), (366317,), (91580,))

In [8]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train,y_train)
reg.intercept_

0.02841733864926308

In [9]:
reg.coef_

array([ 0.07089065,  0.13905809,  0.12643154,  0.04780166,  0.00715076,
        0.05556982, -0.01658027,  0.00688673,  0.02410835])

In [10]:
from sklearn.metrics import r2_score,mean_squared_error
y_pred = reg.predict(x_test)
mse=mean_squared_error(y_test,y_pred)
rmse= np.sqrt(mse)
rmse
r2_score(y_test,y_pred)

0.002970227562862582

In [11]:
reg.score(x_test,y_test)

0.0029702275628625823

In [12]:
reg.score(x_train,y_train)

0.0032487944784646006

#### Observing Ordinry Least Squares Table 

In [13]:
from statsmodels.formula.api import ols
reg1 = ols(formula="return800~trade+event+ob+spreadreturn+niftytrade+niftyidevent+niftyidob+niftyidspreadreturn+niftyidindex ",data=data)
fit1= reg1.fit()
print(fit1.summary())

                            OLS Regression Results                            
Dep. Variable:              return800   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     163.2
Date:                Mon, 28 Sep 2020   Prob (F-statistic):          3.62e-310
Time:                        14:18:23   Log-Likelihood:            -1.3049e+06
No. Observations:              457897   AIC:                         2.610e+06
Df Residuals:                  457887   BIC:                         2.610e+06
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               0.0247    

#### From the above Regression Results we can conclude that:
- p-value for "niftytrade" and "niftyspreadreturn" is greater than 0.05(signigicance level)
- Observing the confidence interval range we can see that for "niftytrade" and "niftyspreadreturn" zero comes in between which also states that these columns are not giving us best results
- So to make our model better we try to remove those two columns and observe the results accordingly 
- Hence, we are performing Forward Inclusion and omiiting the columns which are not giving us best results

In [12]:
import statsmodels.formula.api as smf
def forward_inclusion(data,input_col,response):
    remaining=set(input_col)
    selected=[]
    lower=[]
    for col in remaining:
        formula = "{} ~ {} ".format(response,' + '.join(selected + [col]))
        selected.append(col)
        score = smf.ols(formula, data).fit().pvalues
        #score.drop(['Intercept'])
        score2=list(score)
        for i in score2:
            if(i>0.05):
                var = score[score == i].index[0]
                lower.append(var)
    final=set(selected)-set(lower)
    formula = "{} ~ {} ".format(response,' + '.join(final))
    model = smf.ols(formula, data).fit()
    return model

##### Calling the forward Inclusion method which performs forward inclusion and returns the columns which will produce better results.

In [13]:
model = forward_inclusion(data,feature_cols_JUN,'return800')
print(model.model.formula)

return800 ~ trade + spreadreturn + niftyidevent + niftyidob + ob + niftyidindex + event 


#### Including the columns returned by forward Inclusion and generating OLS Regression Summary Table

In [14]:
from statsmodels.formula.api import ols
reg1 = ols(formula=model.model.formula,data=data)
fit1= reg1.fit()
print(fit1.summary())

                            OLS Regression Results                            
Dep. Variable:              return800   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     209.6
Date:                Tue, 04 Aug 2020   Prob (F-statistic):          3.95e-312
Time:                        23:58:15   Log-Likelihood:            -1.3049e+06
No. Observations:              457897   AIC:                         2.610e+06
Df Residuals:                  457889   BIC:                         2.610e+06
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.0251      0.008      2.990   

##### From tha above OLS Regression resuts we can also conclude that overall Significance(F-Statistic) of the model has increased 

#### Performing Linear Regression for updated columns

In [15]:
feature_cols_JUN = ['trade', 'event', 'ob', 'spreadreturn','niftyidevent', 'niftyidob','niftyidindex']
X= data[feature_cols_JUN]
Y = data.return800

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.2,random_state=88)
x_train.shape,x_test.shape,y_train.shape,y_test.shape


((366317, 7), (91580, 7), (366317,), (91580,))

In [17]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [18]:
reg.score(x_test,y_test)

0.002971791169101179

In [19]:
reg.score(x_train,y_train)

0.0032448437094623195

In [20]:
data["predict_after_elim"] = reg.predict(X)
print(data[['predict_after_elim','return800']].corr())

                    predict_after_elim  return800
predict_after_elim            1.000000   0.056483
return800                     0.056483   1.000000


In [21]:
#Performing Regression on X and Y 
reg2= LinearRegression()
reg2.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [22]:
r_sq = reg2.score(X,Y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.0031935721278721907


In [23]:
data["predict2_after_elim"] = reg2.predict(X)
print(data[['predict2_after_elim','return800']].corr())

                     predict2_after_elim  return800
predict2_after_elim             1.000000   0.056512
return800                       0.056512   1.000000


##### Performing Backward Elimination 
Backward Elimination consists of the following steps:

- Select a significance level to stay in the model (eg. SL = 0.05)
- Fit the model with all possible predictors
- Consider the predictor with the highest P-value. If P>SL, remove the predictor.
- Fit the model without this variable and repeat the step 3 until the condition becomes false.


In [15]:
#Backward elimination
import statsmodels.formula.api as smf
def backward_elimination(data,input_col,response):
    selected=list(input_col)
    lower=[]
    formula = "{} ~ {} ".format(response,' + '.join(selected))
    score = smf.ols(formula, data).fit().pvalues
        #score.drop(['Intercept'])
    score2=list(score)
    for i in score2:
        if(i>0.05):
            var = score[score == i].index[0]
            lower.append(var)
    final=set(selected)-set(lower)
    formula = "{} ~ {} ".format(response,' + '.join(final))
    model = smf.ols(formula, data).fit()
    return model

##### Calling the Backward elimination method which performs backward elimination and returns the columns which will produce better results.

In [16]:
feature_cols_JUN = ['trade', 'event', 'ob', 'spreadreturn','niftytrade','niftyidevent', 'niftyidob','niftyidspreadreturn','niftyidindex']
model = backward_elimination(data,feature_cols_JUN,'return800')
print(model.model.formula)

return800 ~ spreadreturn + niftyidob + niftyidevent + trade + ob + niftyidindex + event 


##### Conclusion : We got the same columns by performing Backward elimination as we got in Forward Inclusion

##### Fitting the model for higher returns only

In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
data = pd.read_csv("BANKNIFTYFUTALL.csv")
data.drop([data.index[457897]],inplace=True)
x=data.return800

newpoints= data.index[data['return800'] >= np.percentile(x,75)].tolist()
df= data.iloc[newpoints,:]

feature_cols_JUN = ['trade', 'event', 'ob', 'spreadreturn','niftytrade','niftyidevent', 'niftyidob','niftyidspreadreturn','niftyidindex']

X= df[feature_cols_JUN]
Y = df.return800
reg= LinearRegression()
reg.fit(X,Y)
r_sq = reg.score(X,Y)
print("rsquare: ",r_sq)

rsquare:  0.017486619372168444


In [18]:
df["predict"] = reg.predict(X)
print(df[['predict','return800']].corr())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


            predict  return800
predict    1.000000   0.132237
return800  0.132237   1.000000


In [19]:
from statsmodels.formula.api import ols
reg1 = ols(formula="return800~trade+event+ob+spreadreturn+niftytrade+niftyidevent+niftyidob+niftyidspreadreturn+niftyidindex ",data=df)
fit1= reg1.fit()
print(fit1.summary())

                            OLS Regression Results                            
Dep. Variable:              return800   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     226.4
Date:                Mon, 28 Sep 2020   Prob (F-statistic):               0.00
Time:                        22:20:01   Log-Likelihood:            -2.3689e+05
No. Observations:              114475   AIC:                         4.738e+05
Df Residuals:                  114465   BIC:                         4.739e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               5.4488    

In [20]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
data = pd.read_csv("BANKNIFTYFUTALL.csv")
data.drop([data.index[457897]],inplace=True)
x=data.return800

newpoints= data.index[data['return800'] <= np.percentile(x,25)].tolist()
df= data.iloc[newpoints,:]

feature_cols_JUN = ['trade', 'event', 'ob', 'spreadreturn','niftytrade','niftyidevent', 'niftyidob','niftyidspreadreturn','niftyidindex']
X= df[feature_cols_JUN]
Y = df.return800
reg= LinearRegression()
reg.fit(X,Y)
r_sq = reg.score(X,Y)
print("r_square :",r_sq)

r_square : 0.028110517478668773


In [21]:
df["predict"] = reg.predict(X)
print(df[['predict','return800']].corr())

            predict  return800
predict    1.000000   0.167662
return800  0.167662   1.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
from statsmodels.formula.api import ols
reg1 = ols(formula="return800~trade+event+ob+spreadreturn+niftytrade+niftyidevent+niftyidob+niftyidspreadreturn+niftyidindex ",data=df)
fit1= reg1.fit()
print(fit1.summary())

                            OLS Regression Results                            
Dep. Variable:              return800   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     367.9
Date:                Mon, 28 Sep 2020   Prob (F-statistic):               0.00
Time:                        22:27:59   Log-Likelihood:            -2.4092e+05
No. Observations:              114475   AIC:                         4.819e+05
Df Residuals:                  114465   BIC:                         4.820e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -5.4132    