In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta
from scipy.stats import loguniform
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
# Load the option panel, order it chronologically, collapse the timestamp to a
# monthly period, then discard the first 277 rows of the sorted frame.
# NOTE(review): 277 is a positional magic number — presumably it trims an
# incomplete leading month; confirm against the data before changing the file.
df = (
    pd.read_csv('data.csv', parse_dates=[1])
    .sort_values(by="date")
    .assign(date=lambda frame: frame['date'].dt.to_period('M'))
    .iloc[277:]
    .reset_index(drop=True)
)

In [None]:
# For testing: keep only observations dated before January 2008 so the
# expanding-window loops below run quickly. Skip this cell for a full run.
before_2008 = df['date'] < '2008-01'
df = df.loc[before_2008]

In [8]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,optid,date,secid,cp_flag,strike,bid,ask,volume,openint,impvol,...,vix,dhedged_return_mid,dhedged_return_spot,dhedged_return_spot_gamma,dhedged_return_mid_delev,dhedged_return_spot_delev,dhedged_return_spot_gamma_delev,IV_mness_deriv_1,IV_ttm_deriv_1,short_rate
135026,10043558,1996-02,108105,P,575.0,0.375,0.4375,20.0,2019.0,0.233068,...,15.37,-0.395369,-0.000248,-0.171185,-0.00971,-6.087413e-06,-0.004204,-0.037859,1.418111e-05,0.05427
421,10368325,1996-02,108105,C,500.0,155.25,156.25,0.0,750.0,0.424562,...,15.37,0.007059,0.001697,2.714935,0.001731,0.0004160267,0.665643,,,0.05427
91308,11506611,1996-02,108105,P,665.0,23.875,24.875,52.0,459.0,0.122164,...,15.37,0.09959,0.003746,0.447423,0.006534,0.0002457917,0.029355,-0.02641,-7.934008e-06,0.05427
112280,10170877,1996-02,108105,P,525.0,0.0625,0.125,500.0,7725.0,0.310703,...,15.37,-0.117722,-1.7e-05,-0.06171,-0.003241,-4.689261e-07,-0.001699,-0.037859,1.418111e-05,0.05427
67985,11516393,1996-02,108105,C,685.0,7.875,8.375,0.0,458.0,0.128511,...,15.37,0.257634,0.00323,0.470434,0.011572,0.0001451034,0.021131,-0.010402,8.521277e-07,0.05427


In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
def get_summary(df_col):
    """Print mean, median, standard deviation and count of one column.

    Each statistic is rounded to two decimals before printing. The function
    only prints; it returns ``None``.

    Parameters
    ----------
    df_col : pandas.Series
        Column to summarise; its ``name`` is used in the header line.
    """
    statistics = (
        ('Mean', df_col.mean()),
        ('Median', df_col.median()),
        ('Standard deviation', df_col.std()),
        ('Number of observations', df_col.count()),
    )

    print(f'\nSummary of {df_col.name}:')
    for label, value in statistics:
        print(f'{label}: {np.round(value, 2)}')

In [None]:
def add_months(date, period_to_add=1):
    """Shift a year-month by a whole number of months.

    The notebook stores its ``date`` column as monthly ``pandas.Period``
    values, so both ``'YYYY-MM'`` strings and ``Period`` objects are accepted
    (the string-only version raised ``TypeError`` on a ``Period``).

    Parameters
    ----------
    date : str or pandas.Period
        Month to shift, e.g. ``'2007-01'``.
    period_to_add : int, default 1
        Number of months to add; negative values move backwards.

    Returns
    -------
    str
        The shifted month formatted as ``'YYYY-MM'``.
    """
    shifted = pd.Period(date, freq='M') + period_to_add
    return shifted.strftime('%Y-%m')

In [36]:
# Columns summarised in the descriptive-statistics cells; the first entry,
# 'midprice', is the regression target used later in the notebook.
features = [
    'midprice',
    'mness',
    'ttm',
    'embed_lev',
    'impvol',
    'gamma',
    'vega',
    'theta',
]

In [None]:
print('\nSummary statistics for Call options\n')
# Rows flagged 'C' in cp_flag; get_summary prints one block per feature column.
call_rows = df["cp_flag"] == 'C'
df.loc[call_rows, features].apply(get_summary, axis=0);

In [None]:
print('\nSummary statistics for Put options\n')

# Rows flagged 'P' in cp_flag; get_summary prints one block per feature column.
put_rows = df["cp_flag"] == 'P'
df.loc[put_rows, features].apply(get_summary, axis=0);

In [37]:
# Extend the summary feature list with the remaining model inputs plus 'date'.
# Built with a membership check instead of `+=` so that re-running this cell
# does not append the extra columns a second time (which would duplicate
# columns in clear_df and shift the train_features slice).
extra_features = ['strike', 'spot_close', 'divrate', 'vix', 'short_rate', 'delta', 'date'] #+ ['optid']
features = features + [col for col in extra_features if col not in features]

# Regressors: everything except the target 'midprice' (first) and 'date' (last).
train_features = features[1:-1]

In [None]:
# Keep only the target, the regressors and the date column.
clear_df = df.loc[:, features]

In [None]:
# Months strictly after start_date form the out-of-sample evaluation period.
start_date = '2007-01'

# Realised mid prices over the evaluation period, in clear_df row order.
after_start = clear_df['date'] > start_date
y_prices = clear_df.loc[after_start, 'midprice']

In [None]:
def get_splits(df, num=False, train_pc=0.85):
    """Split ``df`` by row position into a leading train part and a trailing
    validation part.

    Parameters
    ----------
    df : DataFrame or ndarray
        Anything exposing ``.shape[0]`` and positional slicing.
    num : bool, default False
        If True, return the two slices themselves. If False, return a fold
        indicator array suitable for ``sklearn.model_selection.PredefinedSplit``.
    train_pc : float, default 0.85
        Fraction of rows (taken from the start) assigned to the train part.

    Returns
    -------
    tuple or numpy.ndarray
        ``(train, validation)`` when ``num`` is True; otherwise a float array of
        length ``df.shape[0]`` holding ``-1`` for train rows (never used as a
        test fold) and ``0`` for validation rows (test fold 0).

    Raises
    ------
    ValueError
        If ``train_pc`` is not within ``[0, 1]``.
    """
    if not 0 <= train_pc <= 1:
        raise ValueError(f'train_pc must be within [0, 1], got {train_pc}')

    df_len = df.shape[0]
    separator = int(np.floor(df_len * train_pc))

    if num:
        return df[:separator], df[separator:]

    # -1.0 / 0.0 keeps the float dtype the indicator array has always had.
    return np.concatenate([np.full(separator, -1.0), np.zeros(df_len - separator)])

## OLS Regression

In [None]:
# Empty accumulators for the OLS loop: stacked predictions, per-refit
# training R^2, and per-month test R^2.
ols_predictions, ols_r_2, ols_r2_better = (np.array([]) for _ in range(3))

In [None]:
# OLS: expanding-window evaluation. For every month after start_date, fit on
# all earlier months and predict that month's mid prices.

# Reset the accumulators here so that re-running this cell does not append a
# second copy of the predictions (their length must match y_prices below).
ols_predictions = np.array([])
ols_r_2 = np.array([])
ols_r2_better = np.array([])

for current_date in df[df['date'] > start_date]['date'].unique():

    # Build each row mask once instead of re-filtering for X and y separately.
    train_mask = clear_df['date'] < current_date
    test_mask = clear_df['date'] == current_date

    x_train = clear_df.loc[train_mask, train_features]
    y_train = clear_df.loc[train_mask, 'midprice']

    x_test = clear_df.loc[test_mask, train_features]
    y_test = clear_df.loc[test_mask, 'midprice']

    ols = LinearRegression()
    ols.fit(x_train, y_train)

    y_pred = ols.predict(x_test)

    # Write predictions to array
    ols_predictions = np.append(ols_predictions, y_pred)

    # In-sample R^2: the model scored on the window it was fitted on.
    ols_r_2 = np.append(ols_r_2, ols.score(x_train, y_train))

    # Out-of-sample R^2 for the predicted month.
    ols_r2_better = np.append(ols_r2_better, r2_score(y_test, y_pred))

In [None]:
# Average of the in-sample (training-window) R^2 values over all OLS refits.
np.round(np.mean(ols_r_2), 3)

In [None]:
# MSE of the stacked monthly OLS predictions against the realised mid prices
# of the whole evaluation period.
np.round(mean_squared_error(y_prices, ols_predictions), 3)

In [None]:
# Pooled out-of-sample R^2: all monthly OLS predictions scored at once.
np.round(r2_score(y_prices, ols_predictions), 3)

In [None]:
# Average of the per-month out-of-sample R^2 values of the OLS model.
np.round(np.mean(ols_r2_better), 3)

## Penalized Linear Regressions

### Ridge

In [None]:
# Empty accumulators for the Ridge loop: stacked predictions, per-refit
# training R^2, and per-month test R^2.
ridge_predictions, ridge_r_2, ridge_r2_better = (np.array([]) for _ in range(3))

In [None]:
# TODO: visualize the grid-search results after the Ridge loop has run, e.g.:

# ridge_cv.best_params_
# ridge_cv.best_score_


In [None]:
# Ridge: expanding-window evaluation with hyper-parameters chosen on a single
# predefined train/validation split of each training window.
space = dict()

space['solver'] = ['auto','svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
space['alpha'] = np.array([10.0, 1.0, 0.1, 0.01, 0.001, 0])

# Reset the accumulators here so that re-running this cell does not append a
# second copy of the predictions (their length must match y_prices below).
ridge_predictions = np.array([])
ridge_r_2 = np.array([])
ridge_r2_better = np.array([])

for current_date in df[df['date'] > start_date]['date'].unique():
    temp = clear_df[clear_df['date'] < current_date]
    X, y = temp[train_features], temp['midprice']

    # get_splits marks the leading rows as train-only (-1) and the trailing
    # rows as validation fold 0, giving exactly one (train, validation) pair.
    ps = PredefinedSplit(get_splits(temp))
    cv_scheme = list(ps.split())

    # `scoring` is a GridSearchCV argument, not a Ridge one: passing it to
    # Ridge(...) raises TypeError, so it is only given to the search below.
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; on newer versions replace it with an explicit scaling step.
    ridge = Ridge(fit_intercept=True, normalize=True)

    ridge_cv = GridSearchCV(ridge, param_grid=space, scoring='r2', cv = cv_scheme)

    ridge_cv.fit(X, y)

    test_mask = clear_df['date'] == current_date
    x_test = clear_df.loc[test_mask, train_features]
    y_test = clear_df.loc[test_mask, 'midprice']

    y_pred = ridge_cv.predict(x_test)

    # Write predictions to array
    ridge_predictions = np.append(ridge_predictions, y_pred)

    # R^2 of the search's final estimator on the full training window.
    ridge_r_2 = np.append(ridge_r_2, ridge_cv.score(X, y))

    # Out-of-sample R^2 for the predicted month.
    ridge_r2_better = np.append(ridge_r2_better, r2_score(y_test, y_pred))

In [None]:
# Average R^2 of the selected Ridge models on their own training windows.
np.round(np.mean(ridge_r_2), 3)

In [None]:
# MSE of the stacked monthly Ridge predictions against the realised mid prices.
np.round(mean_squared_error(y_prices, ridge_predictions), 3)

In [None]:
# Pooled out-of-sample R^2: all monthly Ridge predictions scored at once.
np.round(r2_score(y_prices, ridge_predictions), 3)

In [None]:
# Average of the per-month out-of-sample R^2 values of the Ridge model.
np.round(np.mean(ridge_r2_better), 3)

### Lasso

In [None]:
# Empty accumulators for the Lasso loop: stacked predictions, per-refit
# training R^2, and per-month test R^2.
lasso_predictions, lasso_r_2, lasso_r2_better = (np.array([]) for _ in range(3))

In [None]:
# Lasso: expanding-window evaluation with alpha chosen on a single predefined
# train/validation split of each training window.
space = dict()

space['alpha'] = np.array([10.0, 1.0, 0.1, 0.01, 0.001])

# Reset the accumulators here so that re-running this cell does not append a
# second copy of the predictions.
lasso_predictions = np.array([])
lasso_r_2 = np.array([])
lasso_r2_better = np.array([])

for current_date in df[df['date'] > start_date]['date'].unique():
    temp = clear_df[clear_df['date'] < current_date]
    X, y = temp[train_features], temp['midprice']

    # get_splits marks the leading rows as train-only (-1) and the trailing
    # rows as validation fold 0, giving exactly one (train, validation) pair.
    ps = PredefinedSplit(get_splits(temp))
    cv_scheme = list(ps.split())

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; on newer versions replace it with an explicit scaling step.
    lasso = Lasso(fit_intercept=True, normalize=True)

    lasso_cv = GridSearchCV(lasso, param_grid=space, scoring='r2', cv = cv_scheme)

    lasso_cv.fit(X, y)

    test_mask = clear_df['date'] == current_date
    x_test = clear_df.loc[test_mask, train_features]
    y_test = clear_df.loc[test_mask, 'midprice']

    y_pred = lasso_cv.predict(x_test)

    # Write predictions to array
    lasso_predictions = np.append(lasso_predictions, y_pred)

    # R^2 of the search's final estimator on the full training window.
    lasso_r_2 = np.append(lasso_r_2, lasso_cv.score(X, y))

    # Out-of-sample R^2 for the predicted month.
    lasso_r2_better = np.append(lasso_r2_better, r2_score(y_test, y_pred))

### ElasticNet

In [None]:
# Empty accumulators for the ElasticNet loop: stacked predictions, per-refit
# training R^2, and per-month test R^2.
elastic_net_predictions, elastic_net_r_2, elastic_net_r2_better = (
    np.array([]) for _ in range(3)
)

In [None]:
# ElasticNet: expanding-window evaluation with (alpha, l1_ratio) chosen on a
# single predefined train/validation split of each training window.
space = dict()

space['alpha'] = np.array([1.0, 0.01, 0.001, 0.0001])
# Ten evenly spaced ratios 0.95, 0.955, ..., 0.995. np.arange with a float
# step produced an extra 11th value at ~1.0 here because of rounding, i.e. a
# pure-Lasso candidate the half-open range was meant to exclude.
space['l1_ratio'] = np.linspace(0.95, 1.0, num=10, endpoint=False)

# Reset the accumulators here so that re-running this cell does not append a
# second copy of the predictions.
elastic_net_predictions = np.array([])
elastic_net_r_2 = np.array([])
elastic_net_r2_better = np.array([])

for current_date in df[df['date'] > start_date]['date'].unique():
    temp = clear_df[clear_df['date'] < current_date]
    X, y = temp[train_features], temp['midprice']

    # get_splits marks the leading rows as train-only (-1) and the trailing
    # rows as validation fold 0, giving exactly one (train, validation) pair.
    ps = PredefinedSplit(get_splits(temp))
    cv_scheme = list(ps.split())

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; on newer versions replace it with an explicit scaling step.
    elastic_net = ElasticNet(fit_intercept=True, normalize=True)

    elastic_net_cv = GridSearchCV(elastic_net, param_grid=space, scoring='r2', cv = cv_scheme)

    elastic_net_cv.fit(X, y)

    test_mask = clear_df['date'] == current_date
    x_test = clear_df.loc[test_mask, train_features]
    y_test = clear_df.loc[test_mask, 'midprice']

    y_pred = elastic_net_cv.predict(x_test)

    # Write predictions to array
    elastic_net_predictions = np.append(elastic_net_predictions, y_pred)

    # R^2 of the search's final estimator on the full training window.
    elastic_net_r_2 = np.append(elastic_net_r_2, elastic_net_cv.score(X, y))

    # Out-of-sample R^2 for the predicted month.
    elastic_net_r2_better = np.append(elastic_net_r2_better, r2_score(y_test, y_pred))



## Put-Call Separation

In [28]:
# Load the second dataset, order it chronologically, collapse the timestamp to
# a monthly period, then discard the first 277 rows of the sorted frame.
# NOTE(review): this rebinds `df` from the first half of the notebook, and 277
# is a positional magic number — confirm it is still right for this file.
df = (
    pd.read_csv('total_df.csv', parse_dates=[15])
    .sort_values(by="date")
    .assign(date=lambda frame: frame['date'].dt.to_period('M'))
    .iloc[277:]
    .reset_index(drop=True)
)

In [25]:
# For testing: keep only observations dated before January 2008 so the
# expanding-window loop below runs quickly. Skip this cell for a full run.
before_2008 = df['date'] < '2008-01'
df = df.loc[before_2008]

Unnamed: 0,index,mness,ttm,embed_lev,impvol,gamma,vega,theta,strike,delta,spot_close,divrate,vix,short_rate,midprice,date,mness_additional,ttm_additional,embed_lev_additional,impvol_additional
0,66404,1.794424,35,13.04902,0.131359,0.003453,17.38113,8.350084,690.0,-0.956151,641.43,0.026814,16.54,0.054624,47.0,1996-03,1.794424,35.0,13.04902,0.131359
1,113497,0.951309,189,8.926721,0.127646,0.0051,137.9684,0.565281,700.0,-0.760211,641.43,0.026814,16.54,0.054624,54.625,1996-03,0.951309,189.0,8.926721,0.127646
2,89829,0.976241,98,11.907258,0.129917,0.00654,92.90605,-5.226152,685.0,-0.791274,641.43,0.026814,16.54,0.054624,42.625,1996-03,0.976241,98.0,11.907258,0.129917
3,23951,1.582517,63,10.630817,0.132905,0.003805,35.34675,6.868342,700.0,-0.926051,641.43,0.026814,16.54,0.054624,55.875,1996-03,1.582517,63.0,10.630817,0.132905
4,22836,-0.61551,189,13.662393,0.169646,0.003647,131.1072,-17.05127,595.0,-0.207674,641.43,0.026814,16.54,0.054624,9.75,1996-03,-0.61551,189.0,13.662393,0.169646


In [32]:
# Keep only the target, the regressors and the date column of the new frame.
# Relies on `features` as extended earlier in the notebook.
clear_df = df.loc[:, features]

# Months strictly after start_date form the out-of-sample evaluation period.
start_date = '2007-01'

# Realised mid prices over the evaluation period, in clear_df row order.
after_start = clear_df['date'] > start_date
y_prices = clear_df.loc[after_start, 'midprice']

## OLS Regression

In [33]:
# Empty accumulators for the second OLS loop: stacked predictions and
# per-month test R^2.
ols_sep_predictions, ols_sep_r_2 = (np.array([]) for _ in range(2))

In [38]:
# OLS: expanding-window evaluation on the second dataset. For every month
# after start_date, fit on all earlier months and predict that month.

# Reset the accumulators here so that re-running this cell does not append a
# second copy of the predictions (their length must match y_prices below).
ols_sep_predictions = np.array([])
ols_sep_r_2 = np.array([])

for current_date in df[df['date'] > start_date]['date'].unique():

    # Build each row mask once instead of re-filtering for X and y separately.
    train_mask = clear_df['date'] < current_date
    test_mask = clear_df['date'] == current_date

    x_train = clear_df.loc[train_mask, train_features]
    y_train = clear_df.loc[train_mask, 'midprice']

    x_test = clear_df.loc[test_mask, train_features]
    y_test = clear_df.loc[test_mask, 'midprice']

    ols = LinearRegression()
    ols.fit(x_train, y_train)

    y_pred = ols.predict(x_test)

    # Write predictions to array
    ols_sep_predictions = np.append(ols_sep_predictions, y_pred)

    # Out-of-sample R^2 for the predicted month.
    ols_sep_r_2 = np.append(ols_sep_r_2, r2_score(y_test, y_pred))

In [39]:
# Average of the per-month out-of-sample R^2 values of the OLS model.
np.round(np.mean(ols_sep_r_2), 3)

0.289

In [40]:
# Pooled out-of-sample R^2: all monthly OLS predictions scored at once.
np.round(r2_score(y_prices, ols_sep_predictions), 3)

0.329

In [41]:
# MSE of the stacked monthly OLS predictions against the realised mid prices
# of the whole evaluation period.
np.round(mean_squared_error(y_prices, ols_sep_predictions), 3)

19784.398