In [96]:
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

In [97]:
# Load the raw option data, order the rows chronologically, and reduce the
# parsed timestamps (column 1, "date") to monthly periods.
df = (
    pd.read_csv('data.csv', parse_dates=[1])
    .sort_values(by="date")
    .assign(date=lambda frame: frame['date'].dt.to_period('M'))
)

In [98]:
# Inspect the column names, dtypes, and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180814 entries, 135026 to 180813
Data columns (total 37 columns):
 #   Column                           Non-Null Count   Dtype    
---  ------                           --------------   -----    
 0   optid                            180814 non-null  int64    
 1   date                             180814 non-null  period[M]
 2   secid                            180814 non-null  int64    
 3   cp_flag                          180814 non-null  object   
 4   strike                           180814 non-null  float64  
 5   bid                              180814 non-null  float64  
 6   ask                              180814 non-null  float64  
 7   volume                           180814 non-null  float64  
 8   openint                          180814 non-null  float64  
 9   impvol                           180814 non-null  float64  
 10  delta                            180814 non-null  float64  
 11  gamma                            1

In [99]:
# Preview the first rows of the date-sorted frame.
df.head()

Unnamed: 0,optid,date,secid,cp_flag,strike,bid,ask,volume,openint,impvol,...,vix,dhedged_return_mid,dhedged_return_spot,dhedged_return_spot_gamma,dhedged_return_mid_delev,dhedged_return_spot_delev,dhedged_return_spot_gamma_delev,IV_mness_deriv_1,IV_ttm_deriv_1,short_rate
135026,10043558,1996-02,108105,P,575.0,0.375,0.4375,20.0,2019.0,0.233068,...,15.37,-0.395369,-0.000248,-0.171185,-0.00971,-6.087413e-06,-0.004204,-0.037859,1.418111e-05,0.05427
421,10368325,1996-02,108105,C,500.0,155.25,156.25,0.0,750.0,0.424562,...,15.37,0.007059,0.001697,2.714935,0.001731,0.0004160267,0.665643,,,0.05427
91308,11506611,1996-02,108105,P,665.0,23.875,24.875,52.0,459.0,0.122164,...,15.37,0.09959,0.003746,0.447423,0.006534,0.0002457917,0.029355,-0.02641,-7.934008e-06,0.05427
112280,10170877,1996-02,108105,P,525.0,0.0625,0.125,500.0,7725.0,0.310703,...,15.37,-0.117722,-1.7e-05,-0.06171,-0.003241,-4.689261e-07,-0.001699,-0.037859,1.418111e-05,0.05427
67985,11516393,1996-02,108105,C,685.0,7.875,8.375,0.0,458.0,0.128511,...,15.37,0.257634,0.00323,0.470434,0.011572,0.0001451034,0.021131,-0.010402,8.521277e-07,0.05427


In [59]:
def get_summary(df_col):
    """Print basic descriptive statistics for a single column.

    Writes a header containing the column's name, followed by its mean,
    median, standard deviation, and observation count, each rounded to two
    decimals. Nothing is returned.

    Args:
        df_col: A pandas Series (must expose `name`, `mean`, `median`,
            `std`, and `count`).
    """
    print(f'\nSummary of {df_col.name}:')
    # (label, Series method) pairs; each statistic is computed right before
    # it is printed, in this order.
    statistics = (
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('Standard deviation', 'std'),
        ('Number of observations', 'count'),
    )
    for label, method_name in statistics:
        value = getattr(df_col, method_name)()
        print(f'{label}: {np.round( value,2)}')

In [86]:
def add_months(date ,period_to_add=1):
    """Shift a 'YYYY-MM' month string by a whole number of months.

    Args:
        date: Month formatted as '%Y-%m', e.g. '2007-02'.
        period_to_add: Number of months to move forward; negative values
            move backward. Defaults to 1.

    Returns:
        The shifted month, formatted as '%Y-%m'.

    Raises:
        ValueError: If `date` does not match '%Y-%m', or the shifted month
            falls outside the year range `datetime` supports.
    """
    # Parse the argument itself; the result must not depend on any
    # notebook-level variable.
    parsed = datetime.strptime(date, '%Y-%m')
    # Count months from year 0 with a zero-based month so that year rollover
    # (in either direction) is a plain divmod.
    months_total = parsed.year * 12 + (parsed.month - 1) + period_to_add
    year, month_index = divmod(months_total, 12)
    return datetime(year, month_index + 1, 1).strftime('%Y-%m')

In [100]:
# Columns summarised separately for call and put options in the cells below.
features = ['mness', 'ttm', 'embed_lev', 'impvol', 'gamma', 'vega', 'theta', 'midprice']

In [61]:
# Per-column descriptive statistics restricted to call options (cp_flag 'C').
print('\nSummary statistics for Call options\n')
df.loc[df["cp_flag"] == 'C', features].apply(get_summary);


Summary statistics for Call options


Summary of mness:
Mean: 0.49
Median: 0.31
Standard deviation: 1.3
Number of observations: 69153

Summary of ttm:
Mean: 208.77
Median: 91.0
Standard deviation: 224.23
Number of observations: 69153

Summary of embed_lev:
Mean: 22.16
Median: 14.24
Standard deviation: 21.84
Number of observations: 69153

Summary of impvol:
Mean: 0.21
Median: 0.18
Standard deviation: 0.15
Number of observations: 69153

Summary of gamma:
Mean: 0.0
Median: 0.0
Standard deviation: 0.0
Number of observations: 69153

Summary of vega:
Mean: 240.56
Median: 175.74
Standard deviation: 230.0
Number of observations: 69153

Summary of theta:
Mean: -56.11
Median: -43.82
Standard deviation: 51.21
Number of observations: 69153

Summary of midprice:
Mean: 91.37
Median: 37.3
Standard deviation: 163.55
Number of observations: 69153


In [62]:
# Per-column descriptive statistics restricted to put options (cp_flag 'P').
print('\nSummary statistics for Put options\n')

df.loc[df["cp_flag"] == 'P', features].apply(get_summary);


Summary statistics for Put options


Summary of mness:
Mean: -0.9
Median: -1.03
Standard deviation: 1.43
Number of observations: 111661

Summary of ttm:
Mean: 202.6
Median: 91.0
Standard deviation: 222.15
Number of observations: 111661

Summary of embed_lev:
Mean: 14.66
Median: 11.79
Standard deviation: 10.61
Number of observations: 111661

Summary of impvol:
Mean: 0.3
Median: 0.26
Standard deviation: 0.18
Number of observations: 111661

Summary of gamma:
Mean: 0.0
Median: 0.0
Standard deviation: 0.0
Number of observations: 111661

Summary of vega:
Mean: 178.79
Median: 104.99
Standard deviation: 205.16
Number of observations: 111661

Summary of theta:
Mean: -49.17
Median: -36.14
Standard deviation: 49.51
Number of observations: 111661

Summary of midprice:
Mean: 71.58
Median: 14.2
Standard deviation: 154.07
Number of observations: 111661


In [101]:
# Extend the column list with the additional columns used downstream.
# Only names not already present are appended, so re-running this cell does
# not produce duplicate column labels (which would make `df[features]` carry
# repeated columns).
features += [
    column
    for column in ['strike', 'spot_close', 'divrate', 'vix', 'short_rate', 'delta', 'date', 'optid']
    if column not in features
]

In [102]:
# Display the complete column list after the extension.
features

['mness',
 'ttm',
 'embed_lev',
 'impvol',
 'gamma',
 'vega',
 'theta',
 'midprice',
 'strike',
 'spot_close',
 'divrate',
 'vix',
 'short_rate',
 'delta',
 'date',
 'optid']

In [103]:
# Preview the selected columns only.
df[features].head()

Unnamed: 0,mness,ttm,embed_lev,impvol,gamma,vega,theta,midprice,strike,spot_close,divrate,vix,short_rate,delta,date,optid
135026,-1.851039,28,40.719462,0.233068,0.001448,10.47927,-16.01202,0.40625,575.0,647.98,,15.37,0.05427,-0.025529,1996-02,10043558
421,-1.838855,63,4.078667,0.424562,0.000625,13.04659,-33.53668,155.75,500.0,647.98,,15.37,0.05427,0.971854,1996-02,10368325
91308,0.361222,126,15.241659,0.122164,0.008373,147.0848,-14.92941,24.375,665.0,647.98,,15.37,0.05427,-0.573344,1996-02,11506611
112280,-2.445653,28,36.321439,0.310703,0.000276,2.663379,-5.490642,0.09375,525.0,647.98,,15.37,0.05427,-0.005255,1996-02,10170877
67985,0.735826,126,22.263237,0.128511,0.006867,126.8969,-28.27068,8.125,685.0,647.98,,15.37,0.05427,0.279158,1996-02,11516393


In [105]:
# Working frame restricted to the columns listed in `features`.
clear_df = df.loc[:, features]

In [106]:
# Month used as the boundary; `y_price` holds the mid prices of all rows dated
# strictly after it.
start_date = '2007-02'
y_price = clear_df.loc[clear_df['date'] > start_date, 'midprice']
y_price

50265      80.900
141595     86.400
5521      176.300
141621      1.475
6209        3.750
           ...   
85034       0.025
85041       0.025
89125       0.900
84988       1.650
180813      2.500
Name: midprice, Length: 132500, dtype: float64

In [91]:
# Rows dated before the month that follows `start_date`; show the last 45.
first_df = clear_df.loc[clear_df['date'] < add_months(start_date)]
first_df.tail(45)

Unnamed: 0,mness,ttm,embed_lev,impvol,gamma,vega,theta,midprice,strike,spot_close,divrate,vix,short_rate,delta,date
118287,-0.148415,28,55.073037,0.092769,0.010476,152.3094,-75.307,10.45,1450.0,1455.54,0.020587,10.02,0.05372,-0.395395,2007-02
6463,-0.259409,119,15.629547,0.119515,0.003609,295.4131,-84.8316,62.6,1430.0,1455.54,0.020587,10.02,0.05372,0.672197,2007-02
164502,-0.333664,490,9.004965,0.147241,0.001248,521.3753,-15.08405,39.35,1375.0,1455.54,0.020587,10.02,0.05372,-0.243446,2007-02
164184,-2.828219,28,45.995064,0.273331,4.7e-05,2.025466,-3.660911,0.05,1175.0,1455.54,0.020587,10.02,0.05372,-0.00158,2007-02
50020,-0.954472,490,9.548246,0.174569,0.000588,291.5377,-13.52153,15.1,1200.0,1455.54,0.020587,10.02,0.05372,-0.099055,2007-02
6455,0.757556,63,46.865544,0.084991,0.006592,201.6322,-63.96737,8.7,1495.0,1455.54,0.020587,10.02,0.05372,0.280123,2007-02
72492,-1.90948,63,35.278865,0.181854,0.000455,29.76996,-14.86059,0.85,1260.0,1455.54,0.020587,10.02,0.05372,-0.020602,2007-02
118600,-1.757421,490,10.344038,0.209539,0.000132,78.43915,-5.085775,2.7,950.0,1455.54,0.020587,10.02,0.05372,-0.019188,2007-02
118590,-0.370208,217,9.67452,0.136293,0.002138,365.2758,-72.04021,108.7,1400.0,1455.54,0.020587,10.02,0.05372,0.722495,2007-02
50175,2.050035,28,15.632868,0.11074,0.001258,21.82507,36.9827,90.8,1550.0,1455.54,0.020587,10.02,0.05372,-0.975215,2007-02


In [None]:
# Rows dated before '2007-01-01', ordered by date; the date column is then
# dropped so it is not among the model inputs.
# NOTE(review): this literal cutoff differs from `start_date` ('2007-02') used
# for `y_price` — confirm which boundary is intended.
first_df = (
    clear_df.loc[clear_df['date'] < '2007-01-01']
    .sort_values(by='date')
    .drop(columns=['date'])
)

# Position of the cut: the first 85% of the date-ordered rows.
train_idx = int(len(first_df) * 0.85)

# Original author's note (translated): do not do a train/test split
train_set = first_df.iloc[:train_idx]
test_set = first_df.iloc[train_idx:]

# Target is the mid price; every remaining column is a regressor.
train_x = train_set.drop(columns=['midprice'])
train_y = train_set['midprice']

test_x = test_set.drop(columns=['midprice'])
test_y = test_set['midprice']

In [None]:
# Linear regression model with scikit-learn's default settings.
reg = LinearRegression()

In [None]:
# Fit the model on the training rows.
# NOTE(review): `divrate` is NaN in the 1996 rows displayed earlier in this
# notebook, and LinearRegression rejects NaN inputs — confirm missing values
# are imputed or dropped before this cell runs.
# NOTE(review): `optid` is still a column of train_x; presumably it is an
# identifier rather than a predictor — verify it should be a regressor.
reg.fit(train_x, train_y)

In [None]:
y_pred = 