In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import statsmodels.formula.api as smf


In [2]:
train_df = pd.read_csv("option_train.csv")
train_df.head()

Unnamed: 0,Value,S,K,tau,r,BS
0,21.670404,431.623898,420,0.34127,0.03013,Under
1,0.125,427.015526,465,0.166667,0.03126,Over
2,20.691244,427.762336,415,0.265873,0.03116,Under
3,1.035002,451.711658,460,0.063492,0.02972,Over
4,39.55302,446.718974,410,0.166667,0.02962,Under


In [3]:
#create training and validation sets from df_train
train_nums = train_df.select_dtypes(["float64","int64"]) #not sure if allowed to use BS as a feature

y=train_nums["Value"].values
X=train_nums.drop("Value",axis=1).values

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.20,random_state = 1)

In [4]:
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_val_norm = mms.transform(X_val)

In [5]:
def normed_df(X,y):
    df1 = pd.DataFrame(X,columns = train_nums.columns[1:])
    df2 = pd.DataFrame(y,columns = ["Value"])
    df3 = pd.concat([df2,df1],axis=1)
    return df3

train_df_norm = normed_df(X_train_norm,y_train)
val_df_norm = normed_df(X_val_norm,y_val)

In [6]:
result1 = smf.ols('Value ~ K', data=train_df_norm).fit()
result1.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.783
Model:,OLS,Adj. R-squared:,0.783
Method:,Least Squares,F-statistic:,4849.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,0.0
Time:,19:36:51,Log-Likelihood:,-4421.0
No. Observations:,1344,AIC:,8846.0
Df Residuals:,1342,BIC:,8856.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,48.2368,0.510,94.514,0.000,47.236,49.238
K,-65.6902,0.943,-69.635,0.000,-67.541,-63.840

0,1,2,3
Omnibus:,17.326,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14.705
Skew:,0.189,Prob(JB):,0.000641
Kurtosis:,2.655,Cond. No.,6.73


In [7]:
result1_test = smf.ols('Value ~ K', data=val_df_norm).fit()
result1_test.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.791
Model:,OLS,Adj. R-squared:,0.79
Method:,Least Squares,F-statistic:,1264.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,1.39e-115
Time:,19:36:51,Log-Likelihood:,-1109.1
No. Observations:,336,AIC:,2222.0
Df Residuals:,334,BIC:,2230.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,50.4188,1.041,48.444,0.000,48.371,52.466
K,-69.3981,1.952,-35.558,0.000,-73.237,-65.559

0,1,2,3
Omnibus:,4.262,Durbin-Watson:,1.973
Prob(Omnibus):,0.119,Jarque-Bera (JB):,3.518
Skew:,0.152,Prob(JB):,0.172
Kurtosis:,2.601,Cond. No.,6.83


In [8]:
feats = ' + '.join(train_df_norm.columns[1:])
result2 = smf.ols('Value ~' + feats,data=train_df_norm).fit()       
result2.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.911
Model:,OLS,Adj. R-squared:,0.91
Method:,Least Squares,F-statistic:,3415.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,0.0
Time:,19:36:51,Log-Likelihood:,-3824.9
No. Observations:,1344,AIC:,7660.0
Df Residuals:,1339,BIC:,7686.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.0066,0.530,67.902,0.000,34.966,37.047
S,18.8558,0.529,35.641,0.000,17.818,19.894
K,-73.6662,0.637,-115.687,0.000,-74.915,-72.417
tau,12.4906,0.456,27.412,0.000,11.597,13.384
r,1.2907,0.545,2.368,0.018,0.221,2.360

0,1,2,3
Omnibus:,123.869,Durbin-Watson:,1.939
Prob(Omnibus):,0.0,Jarque-Bera (JB):,159.197
Skew:,0.843,Prob(JB):,2.7000000000000003e-35
Kurtosis:,3.005,Cond. No.,9.66


In [9]:
result2_test = smf.ols('Value ~' + feats,data=val_df_norm).fit()       
result2_test.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.916
Model:,OLS,Adj. R-squared:,0.915
Method:,Least Squares,F-statistic:,906.1
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,7.6300000000000005e-177
Time:,19:36:51,Log-Likelihood:,-955.34
No. Observations:,336,AIC:,1921.0
Df Residuals:,331,BIC:,1940.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.8872,1.114,33.113,0.000,34.696,39.079
S,19.1089,1.081,17.676,0.000,16.982,21.236
K,-74.4105,1.262,-58.949,0.000,-76.894,-71.927
tau,11.5616,0.912,12.679,0.000,9.768,13.355
r,0.9601,1.120,0.857,0.392,-1.243,3.164

0,1,2,3
Omnibus:,39.129,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.398
Skew:,0.917,Prob(JB):,1.88e-11
Kurtosis:,3.407,Cond. No.,10.1


In [10]:
result3 = smf.ols('Value ~ K + K*r + K/tau + K*tau + S ', data=train_df_norm).fit()
result3.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.918
Model:,OLS,Adj. R-squared:,0.918
Method:,Least Squares,F-statistic:,2508.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,0.0
Time:,19:36:51,Log-Likelihood:,-3764.5
No. Observations:,1344,AIC:,7543.0
Df Residuals:,1337,BIC:,7579.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,41.7522,0.915,45.652,0.000,39.958,43.546
K,-85.6810,1.692,-50.628,0.000,-89.001,-82.361
r,5.6887,1.171,4.856,0.000,3.391,7.987
K:r,-9.7073,2.343,-4.144,0.000,-14.303,-5.112
K:tau,26.2145,2.461,10.652,0.000,21.387,31.042
tau,-0.6867,1.306,-0.526,0.599,-3.249,1.875
S,19.1893,0.508,37.789,0.000,18.193,20.185

0,1,2,3
Omnibus:,124.624,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,160.002
Skew:,0.844,Prob(JB):,1.8e-35
Kurtosis:,3.088,Cond. No.,41.7


In [11]:
result3_test = smf.ols('Value ~ K + K*r + K/tau + K*tau + S ', data=val_df_norm).fit()
result3_test.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.919
Model:,OLS,Adj. R-squared:,0.917
Method:,Least Squares,F-statistic:,620.8
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,4.4599999999999996e-176
Time:,19:36:51,Log-Likelihood:,-950.18
No. Observations:,336,AIC:,1914.0
Df Residuals:,329,BIC:,1941.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,42.4252,2.050,20.696,0.000,38.393,46.458
K,-85.4535,3.668,-23.296,0.000,-92.669,-78.238
r,-3.5563,2.784,-1.278,0.202,-9.032,1.919
K:r,9.0794,5.439,1.669,0.096,-1.620,19.779
K:tau,14.4745,5.273,2.745,0.006,4.102,24.847
tau,4.2874,2.763,1.552,0.122,-1.147,9.722
S,19.0978,1.068,17.883,0.000,16.997,21.199

0,1,2,3
Omnibus:,36.301,Durbin-Watson:,1.972
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.125
Skew:,0.885,Prob(JB):,1.59e-10
Kurtosis:,3.298,Cond. No.,44.9


In [12]:
result4 = smf.ols('Value ~ K + K*r + K/tau + K*tau + S + S/K + S/r + S*tau + S*K ', data=train_df_norm).fit()
result4.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.921
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,1725.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,0.0
Time:,19:36:53,Log-Likelihood:,-3743.7
No. Observations:,1344,AIC:,7507.0
Df Residuals:,1334,BIC:,7559.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,35.5145,1.358,26.146,0.000,32.850,38.179
K,-77.2104,2.556,-30.207,0.000,-82.225,-72.196
r,9.7624,1.441,6.774,0.000,6.935,12.589
K:r,-16.6818,2.753,-6.060,0.000,-22.082,-11.281
K:tau,27.4583,2.502,10.975,0.000,22.550,32.366
tau,2.1185,1.465,1.446,0.148,-0.755,4.992
S,29.6810,1.768,16.785,0.000,26.212,33.150
S:K,-13.8209,2.711,-5.098,0.000,-19.139,-8.503
S:r,-1.1166,2.135,-0.523,0.601,-5.304,3.071

0,1,2,3
Omnibus:,121.32,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,154.769
Skew:,0.83,Prob(JB):,2.47e-34
Kurtosis:,3.077,Cond. No.,61.7


In [13]:
result4_test = smf.ols('Value ~ K + K/r + K/tau + K*tau + S + S/K + S/r + S/tau + S*K ', data=val_df_norm).fit()
result4_test.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.925
Model:,OLS,Adj. R-squared:,0.923
Method:,Least Squares,F-statistic:,502.1
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,1.31e-178
Time:,19:36:54,Log-Likelihood:,-937.56
No. Observations:,336,AIC:,1893.0
Df Residuals:,327,BIC:,1927.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,34.4903,2.182,15.810,0.000,30.199,38.782
K,-69.7740,4.531,-15.400,0.000,-78.687,-60.861
K:r,0.9099,3.407,0.267,0.790,-5.793,7.612
K:tau,13.5088,5.216,2.590,0.010,3.248,23.770
tau,5.8278,3.142,1.855,0.065,-0.353,12.009
S,32.0578,3.125,10.260,0.000,25.911,38.205
S:K,-24.8679,4.950,-5.023,0.000,-34.607,-15.129
S:r,0.5582,4.071,0.137,0.891,-7.450,8.566
S:tau,-1.8899,3.803,-0.497,0.620,-9.371,5.591

0,1,2,3
Omnibus:,33.314,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40.512
Skew:,0.839,Prob(JB):,1.6e-09
Kurtosis:,3.281,Cond. No.,49.9


In [16]:
result4 = smf.ols('Value ~ K + K/r + K*r + K/tau + K*tau + S + S/K + S/r + S*tau + S*K ', data=train_df_norm).fit()
result4.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.921
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,1725.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,0.0
Time:,19:49:32,Log-Likelihood:,-3743.7
No. Observations:,1344,AIC:,7507.0
Df Residuals:,1334,BIC:,7559.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,35.5145,1.358,26.146,0.000,32.850,38.179
K,-77.2104,2.556,-30.207,0.000,-82.225,-72.196
K:r,-16.6818,2.753,-6.060,0.000,-22.082,-11.281
r,9.7624,1.441,6.774,0.000,6.935,12.589
K:tau,27.4583,2.502,10.975,0.000,22.550,32.366
tau,2.1185,1.465,1.446,0.148,-0.755,4.992
S,29.6810,1.768,16.785,0.000,26.212,33.150
S:K,-13.8209,2.711,-5.098,0.000,-19.139,-8.503
S:r,-1.1166,2.135,-0.523,0.601,-5.304,3.071

0,1,2,3
Omnibus:,121.32,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,154.769
Skew:,0.83,Prob(JB):,2.47e-34
Kurtosis:,3.077,Cond. No.,61.7


In [17]:
result4 = smf.ols('Value ~ K + K/r + K*r + K/tau + K*tau + S + S/K + S/r + S*tau + S*K ', data=val_df_norm).fit()
result4.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.925
Model:,OLS,Adj. R-squared:,0.923
Method:,Least Squares,F-statistic:,448.9
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,7.92e-178
Time:,19:49:32,Log-Likelihood:,-936.18
No. Observations:,336,AIC:,1892.0
Df Residuals:,326,BIC:,1931.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,31.2524,2.936,10.645,0.000,25.477,37.028
K,-64.6606,5.487,-11.783,0.000,-75.456,-53.865
K:r,-7.6297,6.211,-1.228,0.220,-19.848,4.589
r,5.4594,3.324,1.643,0.101,-1.079,11.998
K:tau,13.1498,5.207,2.525,0.012,2.906,23.393
tau,6.1050,3.138,1.945,0.053,-0.069,12.279
S,35.3157,3.694,9.560,0.000,28.049,42.583
S:K,-29.3194,5.632,-5.205,0.000,-40.400,-18.239
S:r,-1.7997,4.307,-0.418,0.676,-10.272,6.672

0,1,2,3
Omnibus:,31.13,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37.421
Skew:,0.811,Prob(JB):,7.48e-09
Kurtosis:,3.206,Cond. No.,66.6


In [None]:
# build this again in sklearn 