In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as r2

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
# Load the option panel; column 1 is parsed as a date.
df = pd.read_csv('data.csv', parse_dates=[1])
# Order rows by date so later positional slices and splits follow time.
df = df.sort_values(by="date")
# Collapse each date to its calendar month (period[M]).
df['date'] = df['date'].dt.to_period('M')

In [3]:
# Show column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180814 entries, 135026 to 180813
Data columns (total 37 columns):
optid                              180814 non-null int64
date                               180814 non-null period[M]
secid                              180814 non-null int64
cp_flag                            180814 non-null object
strike                             180814 non-null float64
bid                                180814 non-null float64
ask                                180814 non-null float64
volume                             180814 non-null float64
openint                            180814 non-null float64
impvol                             180814 non-null float64
delta                              180814 non-null float64
gamma                              180814 non-null float64
vega                               180814 non-null float64
theta                              180814 non-null float64
midprice                           180814 non-null float64
la

In [4]:
# Preview the first rows (the earliest dates, since df was sorted by date at load time).
df.head()

Unnamed: 0,optid,date,secid,cp_flag,strike,bid,ask,volume,openint,impvol,...,vix,dhedged_return_mid,dhedged_return_spot,dhedged_return_spot_gamma,dhedged_return_mid_delev,dhedged_return_spot_delev,dhedged_return_spot_gamma_delev,IV_mness_deriv_1,IV_ttm_deriv_1,short_rate
135026,10043558,1996-02,108105,P,575.0,0.375,0.4375,20.0,2019.0,0.233068,...,15.37,-0.395369,-0.000248,-0.171185,-0.00971,-6.087413e-06,-0.004204,-0.037859,1.418111e-05,0.05427
421,10368325,1996-02,108105,C,500.0,155.25,156.25,0.0,750.0,0.424562,...,15.37,0.007059,0.001697,2.714935,0.001731,0.0004160267,0.665643,,,0.05427
91308,11506611,1996-02,108105,P,665.0,23.875,24.875,52.0,459.0,0.122164,...,15.37,0.09959,0.003746,0.447423,0.006534,0.0002457917,0.029355,-0.02641,-7.934008e-06,0.05427
112280,10170877,1996-02,108105,P,525.0,0.0625,0.125,500.0,7725.0,0.310703,...,15.37,-0.117722,-1.7e-05,-0.06171,-0.003241,-4.689261e-07,-0.001699,-0.037859,1.418111e-05,0.05427
67985,11516393,1996-02,108105,C,685.0,7.875,8.375,0.0,458.0,0.128511,...,15.37,0.257634,0.00323,0.470434,0.011572,0.0001451034,0.021131,-0.010402,8.521277e-07,0.05427


In [5]:
def get_summary(df_col):
    """Print mean, median, standard deviation and count of one column.

    Parameters
    ----------
    df_col : pandas.Series
        Column to summarise; its ``name`` is used in the header line.

    Returns
    -------
    None
        The statistics are printed, each rounded to two decimals.
    """
    print(f'\nSummary of {df_col.name}:')
    # Bound methods are called one at a time, right before their line is printed.
    statistics = (
        ('Mean', df_col.mean),
        ('Median', df_col.median),
        ('Standard deviation', df_col.std),
        ('Number of observations', df_col.count),
    )
    for label, compute in statistics:
        print(f'{label}: {np.round(compute(), 2)}')

In [6]:
def add_months(date, period_to_add=1):
    """Shift a calendar month by a whole number of months.

    Parameters
    ----------
    date : str or period/datetime-like
        Either a ``'YYYY-MM'`` string (parsed strictly with ``'%Y-%m'``) or
        any value ``pd.Period(..., freq='M')`` accepts, e.g. the monthly
        ``Period`` values stored in the notebook's ``date`` column.
    period_to_add : int, default 1
        Number of months to add; negative values move backwards.

    Returns
    -------
    str
        The shifted month formatted as ``'YYYY-MM'``.

    Raises
    ------
    ValueError
        If a string ``date`` does not match ``'%Y-%m'`` or ``period_to_add``
        is not a whole number.
    """
    months = int(period_to_add)
    if months != period_to_add:
        raise ValueError('period_to_add must be a whole number of months')

    if isinstance(date, str):
        # Keep the strict '%Y-%m' validation for string input.
        parsed = datetime.strptime(date, '%Y-%m')
        month = pd.Period(year=parsed.year, month=parsed.month, freq='M')
    else:
        # The date column is period[M] after loading, so accept those values too.
        month = pd.Period(date, freq='M')

    return (month + months).strftime('%Y-%m')

In [7]:
# Columns reported in the descriptive statistics.
features = [
    'midprice',
    'mness',
    'ttm',
    'embed_lev',
    'impvol',
    'gamma',
    'vega',
    'theta',
]

In [104]:
# Descriptive statistics of every column in `features`, call options only.
print('\nSummary statistics for Call options\n')
# get_summary prints as a side effect; the trailing ';' hides apply()'s result.
df.loc[df["cp_flag"] == 'C', features].apply(get_summary, axis=0);


Summary statistics for Call options


Summary of midprice:
Mean: 91.37
Median: 37.3
Standard deviation: 163.55
Number of observations: 69153

Summary of mness:
Mean: 0.49
Median: 0.31
Standard deviation: 1.3
Number of observations: 69153

Summary of ttm:
Mean: 208.77
Median: 91.0
Standard deviation: 224.23
Number of observations: 69153

Summary of embed_lev:
Mean: 22.16
Median: 14.24
Standard deviation: 21.84
Number of observations: 69153

Summary of impvol:
Mean: 0.21
Median: 0.18
Standard deviation: 0.15
Number of observations: 69153

Summary of gamma:
Mean: 0.0
Median: 0.0
Standard deviation: 0.0
Number of observations: 69153

Summary of vega:
Mean: 240.56
Median: 175.74
Standard deviation: 230.0
Number of observations: 69153

Summary of theta:
Mean: -56.11
Median: -43.82
Standard deviation: 51.21
Number of observations: 69153


In [8]:
# Descriptive statistics of every column in `features`, put options only.
print('\nSummary statistics for Put options\n')

# get_summary prints as a side effect; the trailing ';' hides apply()'s result.
df.loc[df["cp_flag"] == 'P', features].apply(get_summary, axis=0);


Summary statistics for Put options


Summary of midprice:
Mean: 71.58
Median: 14.2
Standard deviation: 154.07
Number of observations: 111661

Summary of mness:
Mean: -0.9
Median: -1.03
Standard deviation: 1.43
Number of observations: 111661

Summary of ttm:
Mean: 202.6
Median: 91.0
Standard deviation: 222.15
Number of observations: 111661

Summary of embed_lev:
Mean: 14.66
Median: 11.79
Standard deviation: 10.61
Number of observations: 111661

Summary of impvol:
Mean: 0.3
Median: 0.26
Standard deviation: 0.18
Number of observations: 111661

Summary of gamma:
Mean: 0.0
Median: 0.0
Standard deviation: 0.0
Number of observations: 111661

Summary of vega:
Mean: 178.79
Median: 104.99
Standard deviation: 205.16
Number of observations: 111661

Summary of theta:
Mean: -49.17
Median: -36.14
Standard deviation: 49.51
Number of observations: 111661


In [9]:
# Modelling columns: the target first, the predictors in between, the date last.
features = [
    'midprice',
    'mness', 'ttm', 'embed_lev', 'impvol',
    'gamma', 'vega', 'theta',
    'strike', 'spot_close', 'divrate', 'vix', 'short_rate', 'delta',
    'date',
]  # + ['optid']
# Predictors are everything except the target and the date.
train_features = [column for column in features if column not in ('midprice', 'date')]

# Without Put-Call Separation

## Validations

In [10]:
# Month that separates the validation history from the prediction months.
start_date = '2007-01'

# Modelling frame without its first 279 rows (positional slice).
# NOTE(review): 279 and 519 are unexplained offsets; confirm what they are meant to drop.
clear_df = df[features].iloc[279:]

# Rows dated before start_date, minus the first 519 of them; the validation
# loops iterate over the months found here.
date_df = df.loc[df["date"] < start_date].iloc[519:]

# Target values dated strictly after start_date.
y_valid = clear_df.loc[clear_df['date'] > start_date, 'midprice']

### Random Forests

In [12]:
# Expanding-window validation of the random forest (no put/call separation).
# For every month in date_df, the rows of clear_df dated strictly before that
# month are split by position: the first 85 % fit the model, the last 15 %
# score it. Assumes clear_df is still in the date-sorted order set at load time.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(date_df[date_df['date'] < start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    history = clear_df[clear_df['date'] < current_date]
    x_train = history[train_features]
    y_train = history['midprice']

    # 85 and 15 %
    bound = int(round(0.85 * len(x_train), 0))

    # data for training -- slice from row 0; the previous `iloc[1:bound]`
    # silently dropped the first observation of every fold
    x_train_train = x_train.iloc[:bound]
    y_train_train = y_train.iloc[:bound]
    x_train_validation = x_train.iloc[bound:]
    y_train_validation = y_train.iloc[bound:]

    # training -- random_state pins the bootstrap/feature sampling so the
    # scores are reproducible (0 matches the ExtraTrees cells)
    rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=0.05, random_state=0)
    rf.fit(x_train_train, y_train_train)

    y_train_pred = rf.predict(x_train_validation)

    mse_scores.append(MSE(y_train_validation, y_train_pred))
    r2_scores.append(r2(y_train_validation, y_train_pred))

# One score per validated month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Boosted Regression Trees

In [None]:
# Expanding-window validation of the stochastic gradient boosting model
# (no put/call separation). For every month in date_df, the rows of clear_df
# dated strictly before that month are split by position: the first 85 % fit
# the model, the last 15 % score it. Assumes clear_df is still date-sorted.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(date_df[date_df['date'] < start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    history = clear_df[clear_df['date'] < current_date]
    x_train = history[train_features]
    y_train = history['midprice']

    # 85 and 15 %
    bound = int(round(0.85 * len(x_train), 0))

    # data for training -- slice from row 0; the previous `iloc[1:bound]`
    # silently dropped the first observation of every fold
    x_train_train = x_train.iloc[:bound]
    y_train_train = y_train.iloc[:bound]
    x_train_validation = x_train.iloc[bound:]
    y_train_validation = y_train.iloc[bound:]

    # training -- subsample/max_features make the fit stochastic, so
    # random_state is pinned for reproducible scores
    sgbt = GradientBoostingRegressor(max_depth=4, subsample=0.8, max_features=0.2,
                                     n_estimators=300, random_state=0)
    sgbt.fit(x_train_train, y_train_train)

    y_train_pred = sgbt.predict(x_train_validation)

    mse_scores.append(MSE(y_train_validation, y_train_pred))
    r2_scores.append(r2(y_train_validation, y_train_pred))

# One score per validated month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Extremely Randomized Regression Trees

In [None]:
# Expanding-window validation of the extremely randomized trees
# (no put/call separation). For every month in date_df, the rows of clear_df
# dated strictly before that month are split by position: the first 85 % fit
# the model, the last 15 % score it. Assumes clear_df is still date-sorted.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(date_df[date_df['date'] < start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    history = clear_df[clear_df['date'] < current_date]
    x_train = history[train_features]
    y_train = history['midprice']

    # 85 and 15 %
    bound = int(round(0.85 * len(x_train), 0))

    # data for training -- slice from row 0; the previous `iloc[1:bound]`
    # silently dropped the first observation of every fold
    x_train_train = x_train.iloc[:bound]
    y_train_train = y_train.iloc[:bound]
    x_train_validation = x_train.iloc[bound:]
    y_train_validation = y_train.iloc[bound:]

    # training
    clf = ExtraTreesRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
    clf.fit(x_train_train, y_train_train)

    y_train_pred = clf.predict(x_train_validation)

    mse_scores.append(MSE(y_train_validation, y_train_pred))
    r2_scores.append(r2(y_train_validation, y_train_pred))

# One score per validated month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

## Predictions

In [13]:
# WARNING: `df` is rebound to a slice of itself, so this cell is not
# idempotent -- every re-run drops another 277 rows. Restart the kernel and
# run from the top before re-executing it.
df = df.iloc[277:]
start_date = '2007-01'
clear_df = df.loc[:, features]

# Target values dated strictly after start_date.
y_valid = clear_df.loc[clear_df['date'] > start_date, 'midprice']

### Random Forests

In [None]:
# Walk-forward test of the random forest (no put/call separation): for each
# month after start_date, fit on all rows of clear_df dated strictly before
# the month and score on that month's rows.
# NOTE(review): the strict `>` skips the start_date month itself, while the
# validation loops use `< start_date`; confirm that leaving that month
# unevaluated is intended.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(df[df['date'] > start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    is_past = clear_df['date'] < current_date
    is_current = clear_df['date'] == current_date

    x_train = clear_df[is_past][train_features]
    y_train = clear_df[is_past]['midprice']

    # data for prediction
    x_test = clear_df[is_current][train_features]
    y_test = clear_df[is_current]['midprice']

    # prediction -- random_state pins the bootstrap/feature sampling so the
    # scores are reproducible (0 matches the ExtraTrees cells)
    rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=0.05, random_state=0)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)

    mse_scores.append(MSE(y_test, y_pred))
    r2_scores.append(r2(y_test, y_pred))

# One score per predicted month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Boosted Regression Trees

In [113]:
# Walk-forward test of the stochastic gradient boosting model (no put/call
# separation): for each month after start_date, fit on all rows of clear_df
# dated strictly before the month and score on that month's rows.
# NOTE(review): the strict `>` skips the start_date month itself, while the
# validation loops use `< start_date`; confirm that leaving that month
# unevaluated is intended.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(df[df['date'] > start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    is_past = clear_df['date'] < current_date
    is_current = clear_df['date'] == current_date

    x_train = clear_df[is_past][train_features]
    y_train = clear_df[is_past]['midprice']

    # data for prediction
    x_test = clear_df[is_current][train_features]
    y_test = clear_df[is_current]['midprice']

    # prediction -- subsample/max_features make the fit stochastic, so
    # random_state is pinned for reproducible scores
    sgbt = GradientBoostingRegressor(max_depth=4, subsample=0.8, max_features=0.2,
                                     n_estimators=300, random_state=0)
    sgbt.fit(x_train, y_train)
    y_pred = sgbt.predict(x_test)

    mse_scores.append(MSE(y_test, y_pred))
    r2_scores.append(r2(y_test, y_pred))

# One score per predicted month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Extremely Randomized Regression Trees

In [None]:
# Walk-forward test of the extremely randomized trees (no put/call
# separation): for each month after start_date, fit on all rows of clear_df
# dated strictly before the month and score on that month's rows.
# NOTE(review): the strict `>` skips the start_date month itself; confirm
# that this is intended.
monthly_mse = []
monthly_r2 = []

test_months = df.loc[df['date'] > start_date, 'date'].unique()
for i, current_date in enumerate(test_months, start=1):
    print(i)
    print(current_date)

    is_past = clear_df['date'] < current_date
    is_current = clear_df['date'] == current_date

    # history used for fitting
    x_train = clear_df.loc[is_past, train_features]
    y_train = clear_df.loc[is_past, 'midprice']

    # the month being predicted
    x_test = clear_df.loc[is_current, train_features]
    y_test = clear_df.loc[is_current, 'midprice']

    # fit and predict
    clf = ExtraTreesRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    monthly_mse.append(MSE(y_test, y_pred))
    monthly_r2.append(r2(y_test, y_pred))

# One score per predicted month.
mse = np.array(monthly_mse)
r_2 = np.array(monthly_r2)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

# With Put-Call Separation

## Validations

In [14]:
# Load the put/call-separated panel; column 15 is parsed as a date.
total_df = pd.read_csv('total_df.csv', parse_dates=[15])
total_df = total_df.sort_values(by="date")
# Collapse each date to its calendar month (period[M]).
total_df['date'] = total_df['date'].dt.to_period('M')
# Drop the first 277 rows (positional).
total_df = total_df.iloc[277:]

start_date = '2007-01'
# NOTE(review): the next cell rebuilds clear_df from total_df[features]
# without this extra 279-row offset; confirm which of the two is intended.
clear_df = total_df[features].iloc[279:]
# Rows dated before start_date, minus the first 519 of them; the validation
# loops iterate over the months found here.
date_df = total_df.loc[total_df["date"] < start_date].iloc[519:]

# Target values dated strictly after start_date.
y_valid = clear_df.loc[clear_df['date'] > start_date, 'midprice']

In [15]:
# Base predictor names; each one also exists in total_df with an
# '_additional' suffix.
_option_columns = [
    'mness', 'ttm', 'embed_lev', 'impvol', 'gamma', 'vega', 'theta',
    'strike', 'spot_close', 'divrate', 'vix', 'short_rate', 'delta',
]
# Target first, then the '_additional' block, then the plain block, date last.
features = (
    ['midprice']
    + [f'{column}_additional' for column in _option_columns]
    + _option_columns
    + ['date']
)  # + ['optid']
# Predictors are everything except the target and the date.
train_features = [column for column in features if column not in ('midprice', 'date')]
clear_df = total_df.loc[:, features]

### Random Forests

In [None]:
# Expanding-window validation of the random forest on the put/call-separated
# feature set. For every month in date_df, the rows of clear_df dated strictly
# before that month are split by position: the first 85 % fit the model, the
# last 15 % score it. Assumes clear_df is still in date-sorted order.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(date_df[date_df['date'] < start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    history = clear_df[clear_df['date'] < current_date]
    x_train = history[train_features]
    y_train = history['midprice']

    # 85 and 15 %
    bound = int(round(0.85 * len(x_train), 0))

    # data for training -- slice from row 0; the previous `iloc[1:bound]`
    # silently dropped the first observation of every fold
    x_train_train = x_train.iloc[:bound]
    y_train_train = y_train.iloc[:bound]
    x_train_validation = x_train.iloc[bound:]
    y_train_validation = y_train.iloc[bound:]

    # training -- random_state pins the bootstrap/feature sampling so the
    # scores are reproducible (0 matches the ExtraTrees cells)
    rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=0.05, random_state=0)
    rf.fit(x_train_train, y_train_train)
    y_train_pred = rf.predict(x_train_validation)

    mse_scores.append(MSE(y_train_validation, y_train_pred))
    r2_scores.append(r2(y_train_validation, y_train_pred))

# One score per validated month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Boosted Regression Trees

In [None]:
# Expanding-window validation of the stochastic gradient boosting model on the
# put/call-separated feature set. For every month in date_df, the rows of
# clear_df dated strictly before that month are split by position: the first
# 85 % fit the model, the last 15 % score it. Assumes clear_df is date-sorted.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(date_df[date_df['date'] < start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    history = clear_df[clear_df['date'] < current_date]
    x_train = history[train_features]
    y_train = history['midprice']

    # 85 and 15 %
    bound = int(round(0.85 * len(x_train), 0))

    # data for training -- slice from row 0; the previous `iloc[1:bound]`
    # silently dropped the first observation of every fold
    x_train_train = x_train.iloc[:bound]
    y_train_train = y_train.iloc[:bound]
    x_train_validation = x_train.iloc[bound:]
    y_train_validation = y_train.iloc[bound:]

    # training -- subsample/max_features make the fit stochastic, so
    # random_state is pinned for reproducible scores
    sgbt = GradientBoostingRegressor(max_depth=4, subsample=0.8, max_features=0.2,
                                     n_estimators=300, random_state=0)
    sgbt.fit(x_train_train, y_train_train)
    y_train_pred = sgbt.predict(x_train_validation)

    mse_scores.append(MSE(y_train_validation, y_train_pred))
    r2_scores.append(r2(y_train_validation, y_train_pred))

# One score per validated month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Extremely Randomized Regression Trees

In [None]:
# Expanding-window validation of the extremely randomized trees on the
# put/call-separated feature set. For every month in date_df, the rows of
# clear_df dated strictly before that month are split by position: the first
# 85 % fit the model, the last 15 % score it. Assumes clear_df is date-sorted.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(date_df[date_df['date'] < start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    history = clear_df[clear_df['date'] < current_date]
    x_train = history[train_features]
    y_train = history['midprice']

    # 85 and 15 %
    bound = int(round(0.85 * len(x_train), 0))

    # data for training -- slice from row 0; the previous `iloc[1:bound]`
    # silently dropped the first observation of every fold
    x_train_train = x_train.iloc[:bound]
    y_train_train = y_train.iloc[:bound]
    x_train_validation = x_train.iloc[bound:]
    y_train_validation = y_train.iloc[bound:]

    # training
    clf = ExtraTreesRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
    clf.fit(x_train_train, y_train_train)
    y_train_pred = clf.predict(x_train_validation)

    mse_scores.append(MSE(y_train_validation, y_train_pred))
    r2_scores.append(r2(y_train_validation, y_train_pred))

# One score per validated month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

## Predictions

In [93]:
# Reload the put/call-separated panel for the prediction runs; column 15 is
# parsed as a date.
total_df = pd.read_csv('total_df.csv', parse_dates=[15])
total_df = total_df.sort_values(by="date")
# Collapse each date to its calendar month (period[M]).
total_df['date'] = total_df['date'].dt.to_period('M')
# Drop the first 277 rows (positional).
total_df = total_df.iloc[277:]

start_date = '2007-01'
# NOTE(review): the next cell rebuilds clear_df from total_df[features]
# without this extra 279-row offset; confirm which of the two is intended.
clear_df = total_df[features].iloc[279:]
# Rows dated before start_date, minus the first 519 of them.
date_df = total_df.loc[total_df["date"] < start_date].iloc[519:]

# Target values dated strictly after start_date.
y_valid = clear_df.loc[clear_df['date'] > start_date, 'midprice']

In [94]:
# Base predictor names; each one also exists in total_df with an
# '_additional' suffix.
_option_columns = [
    'mness', 'ttm', 'embed_lev', 'impvol', 'gamma', 'vega', 'theta',
    'strike', 'spot_close', 'divrate', 'vix', 'short_rate', 'delta',
]
# Target first, then the '_additional' block, then the plain block, date last.
features = (
    ['midprice']
    + [f'{column}_additional' for column in _option_columns]
    + _option_columns
    + ['date']
)  # + ['optid']
# Predictors are everything except the target and the date.
train_features = [column for column in features if column not in ('midprice', 'date')]
clear_df = total_df.loc[:, features]

### Random Forests

In [None]:
# Walk-forward test of the random forest on the put/call-separated feature
# set: for each month after start_date, fit on all rows of clear_df dated
# strictly before the month and score on that month's rows.
# NOTE(review): the strict `>` skips the start_date month itself, while the
# validation loops use `< start_date`; confirm that leaving that month
# unevaluated is intended.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(total_df[total_df['date'] > start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    is_past = clear_df['date'] < current_date
    is_current = clear_df['date'] == current_date

    x_train = clear_df[is_past][train_features]
    y_train = clear_df[is_past]['midprice']

    # data for prediction
    x_test = clear_df[is_current][train_features]
    y_test = clear_df[is_current]['midprice']

    # prediction -- random_state pins the bootstrap/feature sampling so the
    # scores are reproducible (0 matches the ExtraTrees cells)
    rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=0.05, random_state=0)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)

    mse_scores.append(MSE(y_test, y_pred))
    r2_scores.append(r2(y_test, y_pred))

# One score per predicted month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Boosted Regression Trees

In [None]:
# Walk-forward test of the stochastic gradient boosting model on the
# put/call-separated feature set: for each month after start_date, fit on all
# rows of clear_df dated strictly before the month and score on that month's rows.
# NOTE(review): the strict `>` skips the start_date month itself, while the
# validation loops use `< start_date`; confirm that leaving that month
# unevaluated is intended.
mse_scores = []
r2_scores = []

for i, current_date in enumerate(total_df[total_df['date'] > start_date]['date'].unique(), start=1):
    print(i)
    print(current_date)

    is_past = clear_df['date'] < current_date
    is_current = clear_df['date'] == current_date

    x_train = clear_df[is_past][train_features]
    y_train = clear_df[is_past]['midprice']

    # data for prediction
    x_test = clear_df[is_current][train_features]
    y_test = clear_df[is_current]['midprice']

    # prediction -- subsample/max_features make the fit stochastic, so
    # random_state is pinned for reproducible scores
    sgbt = GradientBoostingRegressor(max_depth=4, subsample=0.8, max_features=0.2,
                                     n_estimators=300, random_state=0)
    sgbt.fit(x_train, y_train)
    y_pred = sgbt.predict(x_test)

    mse_scores.append(MSE(y_test, y_pred))
    r2_scores.append(r2(y_test, y_pred))

# One score per predicted month.
mse = np.array(mse_scores)
r_2 = np.array(r2_scores)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))

### Extremely Randomized Regression Trees

In [110]:
# Walk-forward test of the extremely randomized trees on the
# put/call-separated feature set: for each month after start_date, fit on all
# rows of clear_df dated strictly before the month and score on that month's rows.
# NOTE(review): the strict `>` skips the start_date month itself; confirm
# that this is intended.
monthly_mse = []
monthly_r2 = []

test_months = total_df.loc[total_df['date'] > start_date, 'date'].unique()
for i, current_date in enumerate(test_months, start=1):
    print(i)
    print(current_date)

    is_past = clear_df['date'] < current_date
    is_current = clear_df['date'] == current_date

    # history used for fitting
    x_train = clear_df.loc[is_past, train_features]
    y_train = clear_df.loc[is_past, 'midprice']

    # the month being predicted
    x_test = clear_df.loc[is_current, train_features]
    y_test = clear_df.loc[is_current, 'midprice']

    # fit and predict
    clf = ExtraTreesRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    monthly_mse.append(MSE(y_test, y_pred))
    monthly_r2.append(r2(y_test, y_pred))

# One score per predicted month.
mse = np.array(monthly_mse)
r_2 = np.array(monthly_r2)

print(mse)
print(r_2)

print('MSE: {:.2f}'.format(np.mean(mse)))
print('R2: {:.2f}'.format(np.mean(r_2)))