In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta
from scipy.stats import loguniform
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the option panel; column 1 (the 'date' column) is parsed as datetimes.
df = pd.read_csv('data.csv', parse_dates=[1])
df = df.sort_values(by="date")
# Collapse each timestamp to its calendar month.
df['date'] = df['date'].dt.to_period('M')
# Drop the first 277 rows (by position, after sorting) and renumber the index.
# NOTE(review): the reason for 277 is not documented here - confirm and name the constant.
df = df.iloc[277:].reset_index(drop=True)

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head()

Unnamed: 0,optid,date,secid,cp_flag,strike,bid,ask,volume,openint,impvol,...,vix,dhedged_return_mid,dhedged_return_spot,dhedged_return_spot_gamma,dhedged_return_mid_delev,dhedged_return_spot_delev,dhedged_return_spot_gamma_delev,IV_mness_deriv_1,IV_ttm_deriv_1,short_rate
0,10758593,1996-03,108105,C,630.0,20.0,20.5,7.0,2885.0,0.163628,...,16.54,-0.162051,-0.005116,-0.453583,-0.007654,-0.000242,-0.021423,-0.034178,4.5e-05,0.054624
1,10093854,1996-03,108105,P,650.0,17.5,18.25,126.0,2383.0,0.143119,...,16.54,-0.161114,-0.00449,-0.430225,-0.008292,-0.000231,-0.022141,-0.025132,5e-06,0.054624
2,10453184,1996-03,108105,P,675.0,33.625,34.625,0.0,202.0,0.13378,...,16.54,-0.032381,-0.001723,-0.21226,-0.00219,-0.000117,-0.014357,-0.008622,-1e-05,0.054624
3,11075738,1996-03,108105,P,650.0,35.0,36.5,13.0,2376.0,0.153038,...,16.54,-0.034062,-0.001898,-0.684611,-0.005364,-0.000299,-0.10781,-0.029598,1.5e-05,0.054624
4,11727207,1996-03,108105,C,655.0,30.25,31.25,7120.0,8000.0,0.146335,...,16.54,-0.019832,-0.000951,-0.160711,-0.001621,-7.8e-05,-0.013133,,,0.054624


In [4]:
# Column dtypes and non-null counts of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180537 entries, 0 to 180536
Data columns (total 37 columns):
 #   Column                           Non-Null Count   Dtype    
---  ------                           --------------   -----    
 0   optid                            180537 non-null  int64    
 1   date                             180537 non-null  period[M]
 2   secid                            180537 non-null  int64    
 3   cp_flag                          180537 non-null  object   
 4   strike                           180537 non-null  float64  
 5   bid                              180537 non-null  float64  
 6   ask                              180537 non-null  float64  
 7   volume                           180537 non-null  float64  
 8   openint                          180537 non-null  float64  
 9   impvol                           180537 non-null  float64  
 10  delta                            180537 non-null  float64  
 11  gamma                            180537

In [5]:
def get_summary(df_col, decimals=2):
    """Print the mean, median, standard deviation and count of one column.

    Parameters
    ----------
    df_col : pandas.Series
        Column to summarise; its ``name`` is used in the heading line.
    decimals : int, default 2
        Decimal places kept for the mean, median and standard deviation.
        Columns with very small magnitudes print as ``0.0`` at the default
        precision; pass a larger value to see their actual scale.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    statistics = (
        ('Mean', df_col.mean()),
        ('Median', df_col.median()),
        ('Standard deviation', df_col.std()),
    )

    print(f'\nSummary of {df_col.name}:')
    for label, value in statistics:
        print(f'{label}: {np.round(value, decimals)}')
    # The count is an integer, so rounding it would be a no-op.
    print(f'Number of observations: {df_col.count()}')

In [6]:
def add_months(date, period_to_add=1):
    """Shift a year-month by a whole number of months.

    Parameters
    ----------
    date : str or pandas.Period
        Month to shift, e.g. ``'2007-01'``. Monthly ``Period`` values (the
        dtype of this notebook's ``date`` column) are accepted as well.
    period_to_add : int, default 1
        Number of months to add; a negative value moves backwards.

    Returns
    -------
    str
        The shifted month formatted as ``'YYYY-MM'``.

    Raises
    ------
    ValueError
        If ``date`` cannot be interpreted as a month.
    """
    month = pd.Period(date, freq='M')
    return (month + period_to_add).strftime('%Y-%m')

In [7]:
# Option characteristics summarised in the call/put tables that follow.
features = [ 'midprice', 'mness', 'ttm', 'embed_lev', 'impvol', 'gamma', 'vega', 'theta']

In [8]:
print('\nSummary statistics for Call options\n')
# One printed summary block per feature column, restricted to call rows.
call_rows = df.loc[df["cp_flag"] == 'C', features]
for _column_name, column in call_rows.items():
    get_summary(column)


Summary statistics for Call options


Summary of midprice:
Mean: 91.46
Median: 37.39
Standard deviation: 163.66
Number of observations: 69044

Summary of mness:
Mean: 0.49
Median: 0.31
Standard deviation: 1.3
Number of observations: 69044

Summary of ttm:
Mean: 208.87
Median: 91.0
Standard deviation: 224.3
Number of observations: 69044

Summary of embed_lev:
Mean: 22.17
Median: 14.24
Standard deviation: 21.85
Number of observations: 69044

Summary of impvol:
Mean: 0.21
Median: 0.18
Standard deviation: 0.15
Number of observations: 69044

Summary of gamma:
Mean: 0.0
Median: 0.0
Standard deviation: 0.0
Number of observations: 69044

Summary of vega:
Mean: 240.76
Median: 176.03
Standard deviation: 230.11
Number of observations: 69044

Summary of theta:
Mean: -56.13
Median: -43.84
Standard deviation: 51.24
Number of observations: 69044


In [9]:
print('\nSummary statistics for Put options\n')

# One printed summary block per feature column, restricted to put rows.
put_rows = df.loc[df["cp_flag"] == 'P', features]
for _column_name, column in put_rows.items():
    get_summary(column)


Summary statistics for Put options


Summary of midprice:
Mean: 71.67
Median: 14.25
Standard deviation: 154.16
Number of observations: 111493

Summary of mness:
Mean: -0.9
Median: -1.03
Standard deviation: 1.43
Number of observations: 111493

Summary of ttm:
Mean: 202.65
Median: 91.0
Standard deviation: 222.21
Number of observations: 111493

Summary of embed_lev:
Mean: 14.65
Median: 11.78
Standard deviation: 10.6
Number of observations: 111493

Summary of impvol:
Mean: 0.3
Median: 0.26
Standard deviation: 0.18
Number of observations: 111493

Summary of gamma:
Mean: 0.0
Median: 0.0
Standard deviation: 0.0
Number of observations: 111493

Summary of vega:
Mean: 178.92
Median: 105.07
Standard deviation: 205.27
Number of observations: 111493

Summary of theta:
Mean: -49.22
Median: -36.21
Standard deviation: 49.53
Number of observations: 111493


In [10]:
# Extend the column list with the remaining model inputs and the monthly date key.
# The list is rebuilt instead of extended in place with `+=`, so re-running this
# cell cannot append the same columns a second time.
extra_columns = ['strike', 'spot_close', 'divrate', 'vix', 'short_rate', 'delta', 'date']
features = features + [col for col in extra_columns if col not in features]

# Regressors are every listed column except the target ('midprice') and the
# date key; selecting by name keeps this correct wherever those two sit in the list.
train_features = [col for col in features if col not in ('midprice', 'date')]

In [11]:
# Keep only the columns named in `features` for modelling.
clear_df = df[features]

In [12]:
# Months strictly after start_date are evaluated out of sample.
start_date = '2007-01'

# Actual mid prices of every row dated after start_date, in frame order.
y_prices = clear_df.loc[clear_df['date'] > start_date, 'midprice']

In [13]:
def get_splits(df, num=False, train_pc=0.85):
    """Split rows by position into a leading training part and a trailing validation part.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        Rows in the order they should be split. The first
        ``floor(len(df) * train_pc)`` rows form the training part.
    num : bool, default False
        If true, return the two row slices themselves. Otherwise return a
        fold-label array with one entry per row.
    train_pc : float, default 0.85
        Fraction of rows assigned to the training part; must lie in [0, 1].

    Returns
    -------
    tuple or numpy.ndarray
        ``(train_rows, validation_rows)`` when ``num`` is true. Otherwise a
        float array holding -1 for training rows and 0 for validation rows,
        which is the ``test_fold`` encoding this notebook passes to
        ``PredefinedSplit``.

    Raises
    ------
    ValueError
        If ``train_pc`` is outside [0, 1].
    """
    if not 0 <= train_pc <= 1:
        raise ValueError(f'train_pc must be between 0 and 1, got {train_pc!r}')

    df_len = df.shape[0]
    separator = int(np.floor(df_len * train_pc))

    if num:
        return df[:separator], df[separator:]

    # -1 marks rows that are never used for validation, 0 marks the single validation fold.
    return np.concatenate([np.full(separator, -1.0), np.zeros(df_len - separator)])

## OLS Regression

In [2]:
# Empty accumulators that the OLS loop appends to, one batch per test month.
ols_predictions = np.empty(0)   # stacked test-month predictions
ols_r_2 = np.empty(0)           # R^2 on each training window
ols_r2_better = np.empty(0)     # R^2 on each test month

In [15]:
# OLS
for current_date in df[df['date'] > start_date]['date'].unique():
    
    x_train = clear_df[clear_df['date'] < current_date][train_features]
    y_train = clear_df[clear_df['date'] < current_date]['midprice']

    x_test = clear_df[clear_df['date'] == current_date][train_features]
    y_test = clear_df[clear_df['date'] == current_date]['midprice']
    
    ols = LinearRegression()
    ols.fit(x_train, y_train)
    
    y_pred = ols.predict(x_test)
    
    # Write predictions to array
    ols_predictions = np.append(ols_predictions, y_pred)
    
    # Write R^2 to array
    ols_r_2 = np.append(ols_r_2, ols.score(x_train, y_train))
    
    ols_r2_better = np.append(ols_r2_better, r2_score(y_test, y_pred))

In [16]:
# Average of the per-month R^2 values in ols_r_2, each scored on that month's training window.
np.round(np.mean(ols_r_2), 3)

0.408

In [17]:
# MSE of the stacked monthly OLS predictions against the actual mid prices in y_prices.
np.round(mean_squared_error(y_prices, ols_predictions), 3)

19784.398

In [3]:
# R^2 of the stacked monthly OLS predictions against y_prices.
# Needs y_prices and ols_predictions from the earlier cells, so run the notebook top to bottom.
np.round(r2_score(y_prices, ols_predictions), 3)

NameError: name 'y_prices' is not defined

In [None]:
# Average of the per-month R^2 values in ols_r2_better, each scored on the predicted month.
np.round(np.mean(ols_r2_better), 3)

## Penalized Linear Regressions

### Ridge

In [None]:
# Empty accumulators that the Ridge loop appends to, one batch per test month.
ridge_predictions = np.empty(0)   # stacked test-month predictions
ridge_r_2 = np.empty(0)           # R^2 on each fitting window
ridge_r2_better = np.empty(0)     # R^2 on each test month

In [None]:
# TODO: make visualization
# Best hyper-parameters and their validation score from the most recent grid search.
# Both are returned as one tuple because a cell only displays its last expression.
# NOTE: `ridge_cv` is created inside the Ridge grid-search loop, so this cell
# only works after that loop has been run.
ridge_cv.best_params_, ridge_cv.best_score_


In [None]:
# Ridge
space = dict()

space['solver'] = ['auto','svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
space['alpha'] = np.array([100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0])
space['fit_intercept'] = [True, False]
space['normalize'] = [True, False]

for current_date in df[df['date'] > start_date]['date'].unique():
    
    temp = clear_df[clear_df['date'] < current_date]
    X, y = temp[train_features], temp['midprice']

    ps = PredefinedSplit(get_splits(temp))
    cv_scheme = list(ps.split())

    ridge = Ridge()

    ridge_cv = GridSearchCV(ridge, param_grid=space, cv = cv_scheme)

    ridge_cv.fit(X, y)
    # train, validation = get_splits(clear_df[clear_df['date'] < '2007-02'], num = True)

    # x_train = train[train_features]
    # y_train = train['midprice']

    # x_valid = validation[train_features]
    # y_valid = validation['midprice']
    
    x_test = clear_df[clear_df['date'] == '2007-02'][train_features]
    y_test = clear_df[clear_df['date'] == current_date]['midprice']

    y_pred = ridge_cv.predict(x_test)

    # Write predictions to array
    ridge_predictions = np.append(ridge_predictions, y_pred)

    # Write R^2 to array
    ridge_r_2 = np.append(ridge_r_2, ridge_cv.score(X, y))
    
    ridge_r2_better = np.append(ridge_r2_better, r2_score(y_test, y_pred))

In [None]:
# Average of the per-month R^2 values in ridge_r_2, each scored on the data that month's search was fitted on.
np.round(np.mean(ridge_r_2), 3)

In [None]:
# MSE of the stacked monthly Ridge predictions against the actual mid prices in y_prices.
np.round(mean_squared_error(y_prices, ridge_predictions), 3)

In [None]:
# R^2 of the stacked monthly Ridge predictions against y_prices.
np.round(r2_score(y_prices, ridge_predictions), 3)

In [None]:
# Average of the per-month R^2 values in ridge_r2_better, each scored on the predicted month.
np.round(np.mean(ridge_r2_better), 3)