Hi everyone,

Attached you have a dropbox link with the option data. A few things to note:
- You only have end of month date
- Option returns are delta hedged ONLY ONCE not on a daily basis. I went for this approach as it was easier to code so you will have the data faster.
- Only options which have prices at the next end of month are considered, again I did this because it was faster to code but it will probably not inhibit your results too much. 
- I have pre cleaned the option data for you. See for example Goyal (2009) for more details

For now the data seems super small (700mb) so I send you the whole data.

Best,
Patrick

In [None]:
# enable auto complete
%config Completer.use_jedi = False

In [3]:
from collections import *

In [1]:
import pandas as pd 

In [5]:
import datetime

In [6]:
import os

In [7]:
pd.set_option('display.max_columns', 100)

In [9]:
DATAROOT = "data"

# Data Exploration

In [10]:
sp500_op_ret = pd.read_csv(os.path.join(DATAROOT, "sp500_op_ret.csv"))

In [11]:
mapping_table = pd.read_csv(os.path.join(DATAROOT, "mapping_table.csv"))

In [12]:
sp500_const_list = pd.read_csv(os.path.join(DATAROOT, "sp500_const_list.csv"))

## mapping_table and sp500_const_list

In [10]:
# PERMNO is a unique stock (share class) level 
# identifier used by WRDS
mapping_table.sample(5)

Unnamed: 0,secid,sdate,edate,permno
22885,205127,2014-08-27 00:00:00,2020-12-31 00:00:00,14845
13746,113188,2001-11-20 00:00:00,2012-02-22 00:00:00,89208
22228,190028,2014-01-09 00:00:00,2020-12-31 00:00:00,14412
18832,140393,2010-01-14 00:00:00,2020-12-31 00:00:00,93221
21404,161471,2013-02-15 00:00:00,2020-12-31 00:00:00,13779


In [12]:
mapping_table.shape

(28276, 4)

In [14]:
len(set(mapping_table['permno']))

22368

In [19]:
mapping_table[mapping_table["secid"]==5007]

Unnamed: 0,secid,sdate,edate,permno
5,5007,2001-04-09 00:00:00,2002-08-01 00:00:00,19692
6,5007,2012-04-12 00:00:00,2016-08-17 00:00:00,13343


In [18]:
Counter(mapping_table["secid"])

Counter({5001: 1,
         5002: 1,
         5004: 1,
         5005: 1,
         5006: 1,
         5007: 2,
         5008: 1,
         5009: 1,
         5010: 1,
         5012: 1,
         5014: 1,
         5015: 1,
         5016: 1,
         5017: 1,
         5019: 1,
         5021: 1,
         5022: 1,
         5023: 1,
         5024: 2,
         5025: 1,
         5027: 1,
         5028: 1,
         5029: 1,
         5030: 1,
         5031: 1,
         5033: 1,
         5034: 1,
         5035: 1,
         5036: 1,
         5037: 1,
         5038: 1,
         5040: 1,
         5041: 1,
         5042: 1,
         5043: 1,
         5044: 1,
         5045: 1,
         5046: 1,
         5047: 2,
         5048: 1,
         5049: 1,
         5051: 1,
         5052: 1,
         5053: 1,
         5056: 1,
         5057: 1,
         5058: 1,
         5059: 1,
         5060: 1,
         5061: 1,
         5064: 1,
         5065: 1,
         5066: 1,
         5067: 1,
         5068: 1,
         5

In [104]:
secids = set(sp500_op_ret.secid)
len(
    set(
        mapping_table[mapping_table.secid.apply(lambda x : x in secids)].permno
    )
)

1247

In [107]:
pd.DataFrame(set(
        mapping_table[mapping_table.secid.apply(lambda x : x in secids)].permno
    )).to_csv("permno_list.csv")

In [100]:
mapping_table.permno.unique().size

22368

In [24]:
sp500_const_list.sample(5)

Unnamed: 0,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending
1217,12570,INTERNATIONAL TEL & TELEG CORP,46047010,11,1,3561,IIN,5860,1,1957-03-01 00:00:00,2011-10-31 00:00:00
1852,22613,PUBLIC SERVICE CO CO,74444810,11,1,4931,PSR,8804,1,1966-12-15 00:00:00,1976-06-30 00:00:00
1001,25902,GENERAL STEEL INDUSTRIES INC,37085610,10,1,3229,GSI,5088,1,1965-01-07 00:00:00,1973-05-30 00:00:00
1964,68049,RYANS FAMILY STEAK HOUSES INC,78351910,11,3,5812,RYAN,9298,1,1989-09-14 00:00:00,1996-12-31 00:00:00
1984,59440,SAFECO CORP,78642910,11,1,6331,SAF,9351,1,1976-07-01 00:00:00,2008-09-22 00:00:00


In [25]:
sp500_const_list.describe()

Unnamed: 0,permno,shrcd,exchcd,hsiccd,gvkey
count,2501.0,2501.0,2501.0,2501.0,2501.0
mean,45047.839664,11.27509,1.316273,4466.340664,24298.312275
std,27185.64255,3.373588,0.939112,1899.85037,44243.266082
min,10006.0,10.0,-2.0,0.0,1010.0
25%,20853.0,11.0,1.0,3011.0,4699.0
50%,40221.0,11.0,1.0,4011.0,8325.0
75%,71175.0,11.0,1.0,6021.0,13714.0
max,93436.0,72.0,31.0,9999.0,316056.0


## sp500_op_ret
Below are summary statistics (refer to page 15)

### general statistics

In [None]:
sp500_op_ret.head(5)
# sp500_op_ret.sample(5)

Unnamed: 0,secid,date,exdate,cp_flag,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,theta,optionid,cfadj,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,option_ret
0,5015,1996-01-31 00:00:00,1996-07-20 00:00:00,C,20.0,4.625,5.0,15,25,0.380062,0.793831,0.044693,4.456725,-2.603545,10980421,1,0,171,24.142672,24.0,24.0,0.052253,4.8125,-0.025367
1,5048,1996-01-31 00:00:00,1996-08-17 00:00:00,P,40.0,3.0,3.375,5,37,0.245716,-0.507195,0.058756,11.18604,-1.508168,11613022,1,0,199,39.34493,39.0,39.0,0.05178,3.1875,0.002347
2,5049,1996-01-31 00:00:00,1996-05-18 00:00:00,C,65.0,5.875,6.375,4,420,0.167298,0.833206,0.0392,9.020193,-5.51529,11618802,1,0,108,70.168661,70.375,35.1875,0.053274,6.125,-0.009721
3,5049,1996-01-31 00:00:00,1996-08-17 00:00:00,C,70.0,3.25,3.625,13,3806,0.149214,0.559395,0.050652,19.82352,-4.653782,11770703,1,0,199,70.393348,70.375,35.1875,0.05178,3.4375,0.001186
4,5061,1996-01-31 00:00:00,1996-06-22 00:00:00,C,30.0,3.375,3.75,5,254,0.324523,0.657941,0.056807,7.146084,-3.929316,10914516,1,0,143,32.001209,31.75,31.75,0.052726,3.5625,-0.006649


In [13]:
sp500_op_ret[sp500_op_ret["days_no_trading"] >= sp500_op_ret["days_no_trading"].max()]

Unnamed: 0,secid,date,exdate,cp_flag,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,theta,optionid,cfadj,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,option_ret
788893,112026,2006-06-30 00:00:00,2007-01-20 00:00:00,P,40.0,0.30,0.40,1,599,0.418724,-0.038458,0.004153,4.013214,-1.362128,21578802,2,2,204,66.224434,64.64,32.32000,0.056074,0.350,0.100732
802099,110777,2006-08-31 00:00:00,2007-01-20 00:00:00,C,32.5,34.00,34.30,11,620,0.409338,0.998763,0.000259,0.170730,-1.834437,54377206,2,2,142,67.194600,66.21,66.21000,0.054385,34.150,-0.008407
811239,111560,2006-09-29 00:00:00,2007-01-20 00:00:00,C,37.5,14.70,14.90,100,3862,0.402982,0.943461,0.009681,3.170974,-3.908508,24023264,2,2,113,52.250023,51.47,51.47000,0.054082,14.800,-0.010432
866726,121812,2007-02-28 00:00:00,2008-01-19 00:00:00,P,195.0,0.15,0.25,50,577,0.373078,-0.003701,0.000070,4.674209,-0.893325,29807414,1,2,325,470.605325,449.45,449.45000,0.051807,0.200,0.055904
868369,104918,2007-03-30 00:00:00,2008-01-19 00:00:00,P,40.0,15.90,16.20,250,261,0.443765,-0.945869,0.042397,1.886990,-0.379697,26162498,1,2,295,24.895519,23.99,23.99000,0.052204,16.050,-0.005980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964699,110649,2015-01-30 00:00:00,2015-07-17 00:00:00,P,9.0,0.50,0.65,2,237,0.356604,-0.325607,0.152940,2.379124,-0.911734,106045591,1,2,168,9.752932,9.74,68.18002,0.002883,0.575,0.007425
1966470,116416,2015-01-30 00:00:00,2015-03-06 00:00:00,C,167.5,0.73,1.78,1,29,0.385098,0.145554,0.012932,10.431750,-21.029280,106678183,1,2,35,146.727067,147.95,147.95000,0.001924,1.255,-0.021535
1967657,152875,2015-01-30 00:00:00,2015-03-20 00:00:00,P,50.0,0.05,0.30,15,18,0.517489,-0.031383,0.005296,1.817854,-3.501009,106596997,1,2,49,69.840664,70.32,70.32000,0.002204,0.175,0.075764
1968004,189845,2015-01-30 00:00:00,2015-02-27 00:00:00,C,57.5,0.28,0.63,225,232,0.501938,0.139157,0.032471,3.006958,-9.864370,106458697,1,2,28,48.986649,49.08,49.08000,0.001769,0.455,-0.035420


In [17]:
mapping_table

Unnamed: 0,secid,sdate,edate,permno
0,5001,1996-01-02 00:00:00,1996-03-13 00:00:00,10074
1,5002,1996-01-01 00:00:00,1996-02-22 00:00:00,10154
2,5004,1996-01-01 00:00:00,2000-01-27 00:00:00,80071
3,5005,1996-01-01 00:00:00,1997-08-12 00:00:00,85041
4,5006,1996-01-01 00:00:00,1996-08-28 00:00:00,10496
...,...,...,...,...
28271,215155,2020-12-31 00:00:00,2020-12-31 00:00:00,20200
28272,215156,2020-12-31 00:00:00,2020-12-31 00:00:00,20223
28273,215158,2020-12-31 00:00:00,2020-12-31 00:00:00,20285
28274,215165,2020-12-31 00:00:00,2020-12-31 00:00:00,20257


In [18]:
sp500_op_ret

Unnamed: 0,secid,date,exdate,cp_flag,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,theta,optionid,cfadj,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,option_ret
0,5015,1996-01-31 00:00:00,1996-07-20 00:00:00,C,20.0,4.625,5.000,15,25,0.380062,0.793831,0.044693,4.456725,-2.603545,10980421,1,0,171,24.142672,24.000,24.0000,0.052253,4.8125,-0.025367
1,5048,1996-01-31 00:00:00,1996-08-17 00:00:00,P,40.0,3.000,3.375,5,37,0.245716,-0.507195,0.058756,11.186040,-1.508168,11613022,1,0,199,39.344930,39.000,39.0000,0.051780,3.1875,0.002347
2,5049,1996-01-31 00:00:00,1996-05-18 00:00:00,C,65.0,5.875,6.375,4,420,0.167298,0.833206,0.039200,9.020193,-5.515290,11618802,1,0,108,70.168661,70.375,35.1875,0.053274,6.1250,-0.009721
3,5049,1996-01-31 00:00:00,1996-08-17 00:00:00,C,70.0,3.250,3.625,13,3806,0.149214,0.559395,0.050652,19.823520,-4.653782,11770703,1,0,199,70.393348,70.375,35.1875,0.051780,3.4375,0.001186
4,5061,1996-01-31 00:00:00,1996-06-22 00:00:00,C,30.0,3.375,3.750,5,254,0.324523,0.657941,0.056807,7.146084,-3.929316,10914516,1,0,143,32.001209,31.750,31.7500,0.052726,3.5625,-0.006649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3825526,214905,2021-11-30 00:00:00,2022-07-15 00:00:00,C,13.0,1.150,1.500,134,665,0.437066,0.487561,0.095777,3.813131,-1.399446,143779767,1,0,227,12.112223,12.310,12.3100,0.002822,1.3250,-0.068384
3825527,214905,2021-11-30 00:00:00,2022-07-15 00:00:00,C,15.0,0.100,0.750,5,14,0.335686,0.249430,0.098387,3.066143,-0.847491,143779769,1,0,227,12.112223,12.310,12.3100,0.002822,0.4250,0.055073
3825528,214905,2021-11-30 00:00:00,2022-07-15 00:00:00,P,11.0,0.200,1.100,3,111,0.308935,-0.297605,0.114422,3.328838,-0.815255,143779782,1,0,227,12.112223,12.310,12.3100,0.002822,0.6500,0.020419
3825529,214905,2021-11-30 00:00:00,2022-07-15 00:00:00,P,12.0,1.100,1.600,53,35,0.372075,-0.422035,0.106850,3.744469,-1.102309,143779783,1,0,227,12.112223,12.310,12.3100,0.002822,1.3500,-0.016824


In [19]:
sp500_op_ret = sp500_op_ret.merge(mapping_table, on="secid", how="left")

In [24]:
mapping_table

Unnamed: 0,secid,sdate,edate,permno
0,5001,1996-01-02 00:00:00,1996-03-13 00:00:00,10074
1,5002,1996-01-01 00:00:00,1996-02-22 00:00:00,10154
2,5004,1996-01-01 00:00:00,2000-01-27 00:00:00,80071
3,5005,1996-01-01 00:00:00,1997-08-12 00:00:00,85041
4,5006,1996-01-01 00:00:00,1996-08-28 00:00:00,10496
...,...,...,...,...
28271,215155,2020-12-31 00:00:00,2020-12-31 00:00:00,20200
28272,215156,2020-12-31 00:00:00,2020-12-31 00:00:00,20223
28273,215158,2020-12-31 00:00:00,2020-12-31 00:00:00,20285
28274,215165,2020-12-31 00:00:00,2020-12-31 00:00:00,20257


In [41]:
import random

In [53]:
tmp_date = sp500_op_ret["date"][0]
sp500_op_ret[(sp500_op_ret["date"] == tmp_date) & (sp500_op_ret["permno"] == 56266)]

Unnamed: 0,secid,date,exdate,cp_flag,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,theta,optionid,cfadj,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,option_ret,sdate,edate,permno
0,5015,1996-01-31 00:00:00,1996-07-20 00:00:00,C,20.0,4.625,5.0,15,25,0.380062,0.793831,0.044693,4.456725,-2.603545,10980421,1,0,171,24.142672,24.0,24.0,0.052253,4.8125,-0.025367,1996-01-01 00:00:00,1998-10-01 00:00:00,56266
2103,5015,1996-01-31 00:00:00,1996-07-20 00:00:00,C,25.0,1.4375,1.625,16,403,0.294709,0.462826,0.081279,6.401439,-2.533654,10385280,1,0,171,24.142672,24.0,24.0,0.052253,1.53125,-0.003343,1996-01-01 00:00:00,1998-10-01 00:00:00,56266
2104,5015,1996-01-31 00:00:00,1996-07-20 00:00:00,C,27.5,0.9375,1.1875,1,55,0.343211,0.324547,0.063405,5.831637,-2.502579,10440757,1,0,171,24.142672,24.0,24.0,0.052253,1.0625,-0.049677,1996-01-01 00:00:00,1998-10-01 00:00:00,56266
2105,5015,1996-01-31 00:00:00,1996-04-20 00:00:00,P,25.0,1.8125,2.0625,10,49,0.307167,-0.591168,0.120232,4.230949,-2.400755,10752997,1,0,80,24.058134,24.0,24.0,0.05371,1.9375,-0.012452,1996-01-01 00:00:00,1998-10-01 00:00:00,56266
2106,5015,1996-01-31 00:00:00,1996-04-20 00:00:00,P,22.5,0.5,0.6875,1,11,0.281269,-0.285548,0.109995,3.762847,-2.10608,11214327,1,0,80,24.058134,24.0,24.0,0.05371,0.59375,-0.032655,1996-01-01 00:00:00,1998-10-01 00:00:00,56266


In [63]:
tmp_df = pd.DataFrame()

def foo(group):
    num = len(group) / len(group.permno.unique())
    if abs(num - 73.547546) < 0.1:
        print(group)
        global tmp_df
        tmp_df = group 
        group.to_csv("tmp.csv")
    return num 
    
    
sp500_op_ret.groupby("date").apply(foo)

          secid                 date               exdate cp_flag  \
3578059  100892  2021-01-29 00:00:00  2021-03-19 00:00:00       C   
3578060  100892  2021-01-29 00:00:00  2021-03-19 00:00:00       P   
3578061  100892  2021-01-29 00:00:00  2021-05-21 00:00:00       P   
3578062  100892  2021-01-29 00:00:00  2021-08-20 00:00:00       C   
3578063  100892  2021-01-29 00:00:00  2021-08-20 00:00:00       C   
...         ...                  ...                  ...     ...   
3626007  214905  2021-01-29 00:00:00  2022-01-21 00:00:00       C   
3626008  214905  2021-01-29 00:00:00  2022-01-21 00:00:00       C   
3626009  214905  2021-01-29 00:00:00  2022-01-21 00:00:00       P   
3626010  214905  2021-01-29 00:00:00  2022-01-21 00:00:00       P   
3626011  214905  2021-01-29 00:00:00  2022-01-21 00:00:00       P   

         strike_price  best_bid  best_offer  volume  open_interest  \
3578059          42.5      3.50        3.90       8              9   
3578060          47.5      3.30

date
1996-01-31 00:00:00     7.419301
1996-02-29 00:00:00     6.561620
1996-03-29 00:00:00     6.149909
1996-04-30 00:00:00     6.401060
1996-05-31 00:00:00     6.028902
                         ...    
2021-07-30 00:00:00    50.585404
2021-08-31 00:00:00    51.908385
2021-09-30 00:00:00    54.171384
2021-10-29 00:00:00    53.621451
2021-11-30 00:00:00    60.872075
Length: 311, dtype: float64

In [70]:
tmp_df = pd.read_csv("tmp.csv")
tmp_df[tmp_df.permno == 61241].strike_price.describe()

count    584.000000
mean      93.481164
std       34.521112
min        5.000000
25%       75.000000
50%       89.250000
75%      105.000000
max      190.000000
Name: strike_price, dtype: float64

In [62]:
tmp_df

ValueError: Empty data passed with indices specified.

ValueError: Empty data passed with indices specified.

In [14]:
sp500_op_ret["ir_rate"].describe()

count    3.825531e+06
mean     1.801193e-02
std      1.915548e-02
min      8.914000e-04
25%      2.672931e-03
50%      1.074308e-02
75%      2.571517e-02
max      7.199510e-02
Name: ir_rate, dtype: float64

In [97]:
sp500_op_ret[
    (sp500_op_ret["secid"] == 5049) & (sp500_op_ret["date"] == "1996-01-31 00:00:00")
]

Unnamed: 0,secid,date,exdate,cp_flag,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,theta,optionid,cfadj,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,option_ret
2,5049,1996-01-31 00:00:00,1996-05-18 00:00:00,C,65.0,5.875,6.375,4,420,0.167298,0.833206,0.0392,9.020193,-5.51529,11618802,1,0,108,70.168661,70.375,35.1875,0.053274,6.125,-0.009721
3,5049,1996-01-31 00:00:00,1996-08-17 00:00:00,C,70.0,3.25,3.625,13,3806,0.149214,0.559395,0.050652,19.82352,-4.653782,11770703,1,0,199,70.393348,70.375,35.1875,0.05178,3.4375,0.001186
2065,5049,1996-01-31 00:00:00,1997-01-18 00:00:00,C,75.0,2.25,2.625,3,189,0.147161,0.371697,0.036495,25.50448,-3.143041,10637037,1,0,353,71.160224,70.375,35.1875,0.049987,2.4375,-0.000829
2066,5049,1996-01-31 00:00:00,1996-03-16 00:00:00,P,70.0,1.375,1.625,27,27,0.160386,-0.484632,0.107657,9.54487,-4.919761,11022992,1,0,45,70.183981,70.375,35.1875,0.054401,1.5,-0.026413
2067,5049,1996-01-31 00:00:00,1996-05-18 00:00:00,P,70.0,2.0,2.375,1,67,0.148886,-0.473366,0.069962,14.87514,-1.94855,11185240,1,0,108,70.168661,70.375,35.1875,0.053274,2.1875,-0.009183
2068,5049,1996-01-31 00:00:00,1996-03-16 00:00:00,C,70.0,1.375,1.625,10,31,0.145853,0.526418,0.111908,9.628968,-7.829927,11680701,1,0,45,70.183981,70.375,35.1875,0.054401,1.5,-0.029019


In [27]:
sp500_op_ret.describe()

Unnamed: 0,secid,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,theta,optionid,cfadj,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,option_ret
count,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825531.0,3825393.0
mean,111809.1,158.3552,10.48743,10.99611,82.8853,1758.055,0.3903697,0.1159395,0.0380009,24.67463,-16.87396,82876110.0,1.053916,0.002337976,125.7383,161.5546,161.287,216.0093,0.01801193,10.74177,0.07061732
std,19568.78,353.4783,38.74286,39.29525,571.676,5857.573,0.2047089,0.480732,0.04315247,60.98045,43.13121,46495040.0,0.526351,0.05339109,82.9173,352.8938,352.1467,4140.947,0.01915548,39.01709,46.27202
min,5015.0,0.5,0.01,0.07,1.0,1.0,0.011147,-0.999679,0.0,0.00049,-1061.745,10000010.0,1.0,0.0,28.0,5e-05,0.15,0.1171875,0.0008914,0.04,-16010.24
25%,103434.0,37.0,1.15,1.35,3.0,68.0,0.25947,-0.271804,0.010479,5.147422,-13.07261,32648210.0,1.0,0.0,50.0,37.57191,37.54,31.97,0.002672931,1.25,-0.03809121
50%,107398.0,65.0,2.9,3.2,11.0,316.0,0.339646,0.160491,0.026374,9.966705,-6.169045,103308200.0,1.0,0.0,109.0,64.988,64.86,56.73,0.01074308,3.05,-0.0102676
75%,111652.0,129.0,7.4,7.9,40.0,1264.0,0.458008,0.506883,0.05078,20.19212,-3.281715,124080000.0,1.0,0.0,173.0,131.2548,131.15,117.485,0.02571517,7.65,0.01373601
max,214905.0,5500.0,2692.95,2707.55,256169.0,779540.0,2.998749,1.00003,2.643942,1354.885,17.71834,143923000.0,50.0,2.0,359.0,3516.114,3507.07,439054.4,0.0719951,2700.25,74252.89


### number of put/call

In [33]:
num_call = sum(sp500_op_ret["cp_flag"] == "C")
num_put =  sum(sp500_op_ret["cp_flag"] == "P")
print(f"number of calls: {num_call}, comprising {round(num_call/(num_call + num_put) * 100)}% options")

number of calls: 2179336, comprising 57% options


### Moneyness

In [34]:
moneyness = sp500_op_ret["strike_price"] / sp500_op_ret["spotprice"]

In [60]:
sp500_op_ret["moneyness"] = moneyness

In [72]:
itm = sp500_op_ret[
    (
        (sp500_op_ret["moneyness"] > 1.1) & (sp500_op_ret["cp_flag"] == "P")
    )
    | 
    (
        (sp500_op_ret["moneyness"] < 0.9) & (sp500_op_ret["cp_flag"] == "C")
    )
]

In [73]:
otm = sp500_op_ret[
    (
        (sp500_op_ret["moneyness"] < 0.9) & (sp500_op_ret["cp_flag"] == "P")
    )
    | 
    (
        (sp500_op_ret["moneyness"] > 1.1) & (sp500_op_ret["cp_flag"] == "C")
    )
]

In [74]:
atm = sp500_op_ret[
        (sp500_op_ret["moneyness"] >= 0.9) & (sp500_op_ret["moneyness"] <= 1.1)
]

In [84]:
ioatm = [itm, otm, atm]
[round(i.shape[0] / sp500_op_ret.shape[0], 2) for i in ioatm]

[0.14, 0.33, 0.52]

In [80]:
sp500_op_ret.shape

(3825531, 25)

### average IV

In [15]:
sp500_op_ret["impl_volatility"].mean()

0.3903697014422311

### days to maturity

In [49]:
days2maturity = (    
    sp500_op_ret["exdate"].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
  - sp500_op_ret["date"].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
)

In [45]:
days2maturity.describe()

count                        3825531
mean     125 days 17:43:05.652344728
std       82 days 22:00:54.621799602
min                 28 days 00:00:00
25%                 50 days 00:00:00
50%                109 days 00:00:00
75%                173 days 00:00:00
max                359 days 00:00:00
dtype: object

In [46]:
days2maturity[0]

Timedelta('171 days 00:00:00')

In [52]:
datetime.timedelta(900) - days2maturity[0]

Timedelta('729 days 00:00:00')

In [55]:
short_term_options = sum(days2maturity <= datetime.timedelta(90))
long_term_options = sum(days2maturity > datetime.timedelta(90))

In [56]:
short_term_options, long_term_options

(1764052, 2061479)

In [59]:
f"number of short_term_options {short_term_options} and its proportion {round(short_term_options / (short_term_options + long_term_options) * 100)}%"

'number of short_term_options 1764052 and its proportion 46%'

In [18]:
days2maturity.median()

Timedelta('109 days 00:00:00')

## Prediction

### just use very basic features --- almost no predictive power

In [12]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [43]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [54]:
from sklearn import linear_model

In [45]:
subsample = sp500_op_ret

In [46]:
subsample = subsample.dropna()

In [82]:
X = subsample[
    ["impl_volatility", "delta", "gamma", "vega", "theta"]
]

y = subsample['option_ret']

X_train, X_test, y_train, y_test = train_test_split(
    X, y
)

# regressor = LinearRegression()
regressor = linear_model.Lasso(alpha=0.1)
regressor.fit(X_train, y_train)

Lasso(alpha=0.1)

In [83]:
y_pred = regressor.predict(X_test)
mean_squared_error(y_test, y_pred)

1486.3248747034686

In [84]:
r2_score(y_test, y_pred)

-2.0995210787688734e-06

In [85]:
sp500_op_ret.shape

(3825531, 24)