In [56]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

In [2]:
# Read the HDF5 file into `df`. The path is relative, so the file has to sit in
# the kernel's working directory. No `key` is passed, which pandas accepts only
# when the file holds a single stored object.
df = pd.read_hdf('SP500_Options_Monthly.h5')

In [3]:
# Print the frame's index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 180814 entries, 0 to 180813
Data columns (total 37 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   optid                            180814 non-null  int64         
 1   date                             180814 non-null  datetime64[ns]
 2   secid                            180814 non-null  int64         
 3   cp_flag                          180814 non-null  object        
 4   strike                           180814 non-null  float64       
 5   bid                              180814 non-null  float64       
 6   ask                              180814 non-null  float64       
 7   volume                           180814 non-null  float64       
 8   openint                          180814 non-null  float64       
 9   impvol                           180814 non-null  float64       
 10  delta                            180814 non-null 

In [4]:
def get_summary(df_col):
    """Print rounded descriptive statistics for a single column.

    Writes a blank line and a header carrying ``df_col.name`` to stdout,
    followed by the mean, median, standard deviation and non-null count,
    each passed through ``np.round(..., 2)``.

    Parameters
    ----------
    df_col : pandas.Series
        Column to summarise; it must expose ``name``, ``mean``, ``median``,
        ``std`` and ``count``.

    Returns
    -------
    None
        The function only prints.
    """
    print(f'\nSummary of {df_col.name}:')
    # (label, method name) pairs: each statistic is computed right before its
    # own line is printed, in the order listed here.
    for label, stat in (
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('Standard deviation', 'std'),
        ('Number of observations', 'count'),
    ):
        print(f'{label}: {np.round(getattr(df_col, stat)(), 2)}')

In [5]:
# Preview the first five rows, restricted to a hand-picked subset of columns.
df.head()[[
    'cp_flag', 'date', 'strike', 'midprice', 'spot_close', 'mness',
    'divrate', 'delta', 'gamma', 'vega', 'theta',
]]

Unnamed: 0,cp_flag,date,strike,midprice,spot_close,mness,divrate,delta,gamma,vega,theta
0,P,1996-12-20,825.0,80.25,748.87,0.493527,0.021764,-0.499509,0.002263,392.6179,2.388857
1,P,1997-01-17,825.0,63.25,776.17,0.264026,0.022156,-0.413864,0.002102,404.5886,-1.370254
2,P,1997-02-21,825.0,57.625,801.77,0.119626,0.022782,-0.37213,0.001925,398.2385,-4.68585
3,P,1997-03-21,825.0,63.375,784.1,0.21125,0.023084,-0.400019,0.001992,388.8088,-3.081652
4,P,1997-04-18,825.0,65.875,766.34,0.32591,0.023036,-0.43677,0.002214,379.395,-0.389018


In [6]:
# Print mean / median / std / count for each selected column.
# `apply` is used here only for get_summary's printing side effect; because the
# helper returns nothing, the value displayed for this cell is a Series of None.
(
    df[['mness', 'ttm', 'embed_lev', 'impvol', 'gamma', 'vega', 'theta', 'midprice']]
    .apply(get_summary)  # column-wise: axis=0 is the default
)


Summary of mness:
Mean: -0.37
Median: -0.42
Standard deviation: 1.54
Number of observations: 180814

Summary of ttm:
Mean: 204.96
Median: 91.0
Standard deviation: 222.96
Number of observations: 180814

Summary of embed_lev:
Mean: 17.53
Median: 12.67
Standard deviation: 16.29
Number of observations: 180814

Summary of impvol:
Mean: 0.27
Median: 0.22
Standard deviation: 0.18
Number of observations: 180814

Summary of gamma:
Mean: 0.0
Median: 0.0
Standard deviation: 0.0
Number of observations: 180814

Summary of vega:
Mean: 202.41
Median: 131.15
Standard deviation: 217.08
Number of observations: 180814

Summary of theta:
Mean: -51.82
Median: -39.13
Standard deviation: 50.28
Number of observations: 180814

Summary of midprice:
Mean: 79.15
Median: 20.75
Standard deviation: 158.05
Number of observations: 180814


mness        None
ttm          None
embed_lev    None
impvol       None
gamma        None
vega         None
theta        None
midprice     None
dtype: object

In [95]:
# Show the 60 dates with the most rows (value_counts orders by count, descending).
df['date'].value_counts().head(60)

2017-05-19    1718
2016-02-19    1664
2015-09-18    1656
2015-10-16    1643
2017-08-18    1613
2015-08-21    1571
2015-11-20    1550
2015-12-18    1545
2016-01-15    1522
2016-05-20    1511
2015-04-17    1510
2017-06-16    1495
2015-05-15    1467
2017-07-21    1463
2015-03-20    1454
2015-01-16    1422
2015-02-20    1414
2015-06-19    1406
2015-07-17    1396
2017-09-15    1376
2014-12-19    1374
2016-03-18    1372
2016-04-15    1360
2014-10-17    1350
2016-06-17    1348
2014-11-21    1339
2017-04-21    1337
2016-07-15    1314
2016-08-19    1302
2016-09-16    1286
2014-08-15    1267
2016-11-18    1264
2016-10-21    1232
2014-09-19    1231
2014-02-21    1223
2016-12-16    1206
2014-04-17    1199
2014-05-16    1199
2014-07-18    1197
2014-06-20    1165
2014-03-21    1160
2013-06-21    1160
2017-01-20    1144
2014-01-17    1141
2013-08-16    1135
2017-02-17    1126
2013-12-20    1120
2013-09-20    1116
2013-07-19    1093
2013-05-17    1085
2013-04-19    1072
2013-11-15    1070
2013-10-18  

In [82]:
# Narrow `df` down to the columns used by the cells below.
clear_df = df.loc[:, [
    'optid', 'date',
    'mness', 'ttm', 'embed_lev', 'impvol',
    'gamma', 'vega', 'theta',
    'midprice',
]]

In [96]:
# Rows dated before 2007-02-20, ordered chronologically.
# NOTE(review): a later cell reassigns `first_df` with a different cutoff
# ('2007-01-01') and without the `date` column — confirm which version the
# downstream cells are meant to see.
first_df = (
    clear_df
    .loc[lambda frame: frame['date'] < '2007-02-20']
    .sort_values('date')
)
# Inspect the last 45 rows of the sorted frame.
first_df.iloc[-45:]

Unnamed: 0,optid,date,mness,ttm,embed_lev,impvol,gamma,vega,theta,midprice
72467,32391891,2007-02-16,-2.528281,28,39.43543,0.293655,0.000103,4.756402,-9.231032,0.15
72468,32391892,2007-02-16,-2.806844,28,49.342806,0.253702,5.4e-05,2.159604,-3.616476,0.05
72469,32392586,2007-02-16,-0.352688,119,14.61196,0.122753,0.003358,282.2967,-84.99206,70.1
72473,32394188,2007-02-16,1.441061,63,66.070758,0.077865,0.003873,108.5324,-29.96691,2.3
72475,32394189,2007-02-16,1.606049,63,21.550504,0.069866,0.00355,89.25993,30.77129,61.9
6479,32402039,2007-02-16,-1.188433,63,37.029746,0.137506,0.001993,98.60223,-35.13997,3.6
50255,32310551,2007-02-16,-1.436895,28,62.786797,0.133911,0.002258,47.38307,-39.81388,1.4
72476,32396677,2007-02-16,-0.93461,63,36.340415,0.127938,0.002894,133.2589,-42.89698,5.6
72478,32396683,2007-02-16,0.253592,63,34.48449,0.093829,0.007038,237.6519,-87.77573,19.6
72480,32396685,2007-02-16,0.077096,63,32.36967,0.095519,0.006911,237.5576,-42.56083,20.8


In [54]:
# Chronological hold-out: rows dated before 2007-01-01 are ordered by date and
# the `date` column is then removed, so it is not used as a feature.
# NOTE(review): `optid` is still present in the feature frames because only
# `date` and `midprice` are dropped — confirm that is intended.
first_df = (
    clear_df
    .loc[lambda frame: frame['date'] < '2007-01-01']
    .sort_values('date')
    .drop(columns='date')
)

# The first 85% of the date-ordered rows form the training set.
train_idx = int(len(first_df) * 0.85)

# Original author's note (translated): "Do not do a train/test split"
train_set = first_df.iloc[:train_idx]
test_set = first_df.iloc[train_idx:]

# `midprice` is the target; every other remaining column is a feature.
train_x, train_y = train_set.drop(columns='midprice'), train_set['midprice']
test_x, test_y = test_set.drop(columns='midprice'), test_set['midprice']

In [57]:
# Linear regression estimator created with scikit-learn's default settings.
reg = LinearRegression()

In [58]:
# Fit the estimator on the training features and `midprice` target built in the
# split cell above.
reg.fit(train_x, train_y)

LinearRegression()

In [None]:
y_pred = 