In [32]:
import pandas as pd
import numpy as np
import vectorbt as vbt

np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)                 

In [33]:
# loading data
path = r"Data\AAPL_options.csv"
options_df = pd.read_csv(path)
options_df.head(2)

Unnamed: 0,date,expiration date,type,strike price,ask price,ask size,bid price,bid size,last price,volume,open interest,closing price,exp closing price,date div,exp date div
0,2016-01-04,2016-01-08,call,15.0,11.375,20.0,11.3,12.0,0.0,0.0,0.0,26.337,24.24,0.085246,0.09377
1,2016-01-04,2016-01-08,put,15.0,0.005,1400.0,0.0,0.0,0.0,0.0,0.0,26.337,24.24,0.085246,0.09377


In [34]:
# creating mid price
options_df.loc[:, "mid price"] = round((options_df["ask price"] + options_df["bid price"])/2, 5)

In [35]:
# concatenating call and put rows for the same price    
# call mid, put mid are columns 16 and 17 respectively
options_df["call mid"] = 0
options_df["put mid"] = 0
    
# boolean masks for call and put options in every second row
call_mask = (options_df.iloc[::2, 2] == "call").values
put_mask = (options_df.iloc[::2, 2] == "put").values

# indices of every second row
indices = np.arange(0, options_df.shape[0], 2)

# assign values based on call options
# if call, call mid will be assigned from mid price and put mid will be assigned from next mid price
options_df.iloc[indices[call_mask], 16] = options_df.iloc[indices[call_mask], 15].values
options_df.iloc[indices[call_mask], 17] = options_df.iloc[indices[call_mask] + 1, 15].values
# assign values based on put options
options_df.iloc[indices[put_mask], 17] = options_df.iloc[indices[put_mask], 15].values
options_df.iloc[indices[put_mask], 16] = options_df.iloc[indices[put_mask] + 1, 15].values

In [36]:
# reducing rows to every other and fixing index
options_df = options_df[options_df.index % 2 == 0]
options_df.index = np.arange(0, len(options_df))

In [37]:
# dropping unnecessary columns
options_df = options_df.drop(["bid price", "ask price", "last price",
     "exp closing price", "date div", "exp date div", "type"], axis=1)

In [38]:
# creating DTE
T = (pd.to_datetime(options_df["expiration date"]) - pd.to_datetime(options_df["date"])).dt.days
options_df["DTE"] = T

In [39]:
# dropping rows with zero call and put mid
mask1 = options_df["call mid"] > 0
mask2 = options_df["put mid"] > 0
options_df = options_df[mask1 | mask2]
options_df.head(2)

Unnamed: 0,date,expiration date,strike price,ask size,bid size,volume,open interest,closing price,mid price,call mid,put mid,DTE
0,2016-01-04,2016-01-08,15.0,20.0,12.0,0.0,0.0,26.337,11.3375,11.3375,0.0025,4
1,2016-01-04,2016-01-08,16.25,1412.0,0.0,0.0,0.0,26.337,0.0025,10.0875,0.0025,4


In [47]:
# creating new dataframe for date organization
df = pd.DataFrame()

# organizing based on dates
for header in ["DTE", "strike price", "closing price", "call mid", "put mid", "open interest", "volume"]:
    df[header] = options_df.groupby("date")[header].apply(np.array)

# converting index to date column
if "date" not in df.columns:
    df = df.reset_index()

df["date"] = pd.to_datetime(df["date"])
df["terms"] = df["DTE"].apply(lambda x: sorted(set(x)))
df.head(1)

Unnamed: 0,date,DTE,strike price,closing price,call mid,put mid,open interest,volume,terms
0,2016-01-04,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5, ...","[26.337, 26.337, 26.337, 26.337, 26.337, 26.33...","[11.3375, 10.0875, 8.8375, 7.5875, 6.3375, 5.0...","[0.0025, 0.0025, 0.0025, 0.0025, 0.00125, 0.00...","[0.0, 0.0, 0.0, 0.0, 2592.0, 8.0, 15908.0, 362...","[0.0, 0.0, 0.0, 0.0, 4836.0, 0.0, 35040.0, 157...","[4, 11, 18, 25, 32, 39, 46, 74, 102, 165, 193,..."


In [53]:
#grouping by DTE
def group_by_dte(dte, val, terms):
    grouped_mask = {}
    for i in terms:
        if i not in grouped_mask:
            grouped_mask[i] = []
        grouped_mask[i] = list(np.array(val)[dte == i])

    return list(grouped_mask.values())

df.loc[:, "strikes by DTE"] = [group_by_dte(row["DTE"], row["strike price"], row["terms"]) for idx, row in df.iterrows()]
df.loc[:, "open interest by DTE"] = [group_by_dte(row["DTE"], row["open interest"], row["terms"]) for idx, row in df.iterrows()]
df.loc[:, "volume by DTE"] = [group_by_dte(row["DTE"], row["volume"], row["terms"]) for idx, row in df.iterrows()]
df.head(2)

Unnamed: 0,date,DTE,strike price,closing price,call mid,put mid,open interest,volume,terms,abs moneyness,moneyness by DTE,strike by DTE,open interest by DTE,volume by DTE,max oi index by DTE,max volume index by DTE,strikes by DTE
0,2016-01-04,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5, ...","[26.337, 26.337, 26.337, 26.337, 26.337, 26.33...","[11.3375, 10.0875, 8.8375, 7.5875, 6.3375, 5.0...","[0.0025, 0.0025, 0.0025, 0.0025, 0.00125, 0.00...","[0.0, 0.0, 0.0, 0.0, 2592.0, 8.0, 15908.0, 362...","[0.0, 0.0, 0.0, 0.0, 4836.0, 0.0, 35040.0, 157...","[4, 11, 18, 25, 32, 39, 46, 74, 102, 165, 193,...","[11.337, 10.087, 8.837, 7.587, 6.337, 5.087, 3...","[[11.337, 10.087, 8.837, 7.587, 6.337, 5.087, ...","[[15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5,...","[[0.0, 0.0, 0.0, 0.0, 2592.0, 8.0, 15908.0, 36...","[[0.0, 0.0, 0.0, 0.0, 4836.0, 0.0, 35040.0, 15...","[21, 127, 14, 16, 24, 0, 14, 12, 14, 18, 11, 1...","[19, 82, 6, 16, 13, 11, 13, 10, 14, 18, 5, 16,...","[[15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5,..."
1,2016-01-05,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[15.0, 16.25, 17.5, 18.75, 20.0, 20.125, 20.25...","[25.677, 25.677, 25.677, 25.677, 25.677, 25.67...","[10.67875, 9.42875, 8.17875, 6.92875, 5.67875,...","[0.0025, 0.0025, 0.0025, 0.0025, 0.00125, 0.00...","[0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3, 10, 17, 24, 31, 38, 45, 73, 101, 164, 192,...","[10.677, 9.427, 8.177, 6.927, 5.677, 5.552, 5....","[[10.677, 9.427, 8.177, 6.927, 5.677, 5.552, 5...","[[15.0, 16.25, 17.5, 18.75, 20.0, 20.125, 20.2...","[[0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[44, 127, 14, 16, 3, 15, 13, 12, 14, 18, 11, 1...","[40, 73, 9, 16, 16, 15, 11, 10, 14, 18, 11, 16...","[[15.0, 16.25, 17.5, 18.75, 20.0, 20.125, 20.2..."


In [54]:
#finding ATM strike for each expiration date
def max_by_dte(row):
    return [np.argmax(x) for x in row]

df.loc[:, "max oi index by DTE"] = list(map(max_by_dte, df.loc[:, "open interest by DTE"]))
df.loc[:, "max volume index by DTE"] = list(map(max_by_dte, df.loc[:, "volume by DTE"]))

In [67]:
# finding strike prices with the greatest volume and open interest for each DTE
def max_of(row, col):
    return [row[i][col[i]] for i in range(len(row))]

df.loc[:, "max oi strikes by DTE"] = list(map(max_of, df["strikes by DTE"], df["max oi index by DTE"]))
df.loc[:, "max oi by DTE"] = list(map(max_of, df["open interest by DTE"], df["max oi index by DTE"]))

df.loc[:, "max volume strikes by DTE"] = list(map(max_of, df["strikes by DTE"], df["max volume index by DTE"]))
df.loc[:, "max volume by DTE"] = list(map(max_of, df["open interest by DTE"], df["max volume index by DTE"]))

df.head(1)

Unnamed: 0,date,DTE,strike price,closing price,call mid,put mid,open interest,volume,terms,abs moneyness,...,strike by DTE,open interest by DTE,volume by DTE,max oi index by DTE,max volume index by DTE,strikes by DTE,max oi strikes by DTE,max oi by DTE,max volume strikes by DTE,max volume by DTE
0,2016-01-04,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5, ...","[26.337, 26.337, 26.337, 26.337, 26.337, 26.33...","[11.3375, 10.0875, 8.8375, 7.5875, 6.3375, 5.0...","[0.0025, 0.0025, 0.0025, 0.0025, 0.00125, 0.00...","[0.0, 0.0, 0.0, 0.0, 2592.0, 8.0, 15908.0, 362...","[0.0, 0.0, 0.0, 0.0, 4836.0, 0.0, 35040.0, 157...","[4, 11, 18, 25, 32, 39, 46, 74, 102, 165, 193,...","[11.337, 10.087, 8.837, 7.587, 6.337, 5.087, 3...",...,"[[15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5,...","[[0.0, 0.0, 0.0, 0.0, 2592.0, 8.0, 15908.0, 36...","[[0.0, 0.0, 0.0, 0.0, 4836.0, 0.0, 35040.0, 15...","[21, 127, 14, 16, 24, 0, 14, 12, 14, 18, 11, 1...","[19, 82, 6, 16, 13, 11, 13, 10, 14, 18, 5, 16,...","[[15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5,...","[26.25, 35.0, 27.5, 27.5, 28.5, 23.75, 30.0, 2...","[24300.0, 561236.0, 11120.0, 34928.0, 2032.0, ...","[25.75, 27.5, 25.0, 27.5, 25.75, 25.75, 28.75,...","[10868.0, 415412.0, 8404.0, 34928.0, 240.0, 53..."


In [68]:
#reorganizing df
max_df = pd.DataFrame({"date": df["date"],
                       "terms": df["terms"],
                       "max_volumes": df["max volume by DTE"],
                       "max_ois": df["max oi by DTE"],
                       "max_volume_strikes": df["max volume strikes by DTE"],
                       "max_oi_strikes": df["max oi strikes by DTE"],
                       "max_volume_idx": df["max volume index by DTE"],
                       "max_oi_idx": df["max oi index by DTE"]
                      })
max_df.head(1)

Unnamed: 0,date,terms,max_volumes,max_ois,max_volume_strikes,max_oi_strikes,max_volume_idx,max_oi_idx
0,2016-01-04,"[4, 11, 18, 25, 32, 39, 46, 74, 102, 165, 193,...","[10868.0, 415412.0, 8404.0, 34928.0, 240.0, 53...","[24300.0, 561236.0, 11120.0, 34928.0, 2032.0, ...","[25.75, 27.5, 25.0, 27.5, 25.75, 25.75, 28.75,...","[26.25, 35.0, 27.5, 27.5, 28.5, 23.75, 30.0, 2...","[19, 82, 6, 16, 13, 11, 13, 10, 14, 18, 5, 16,...","[21, 127, 14, 16, 24, 0, 14, 12, 14, 18, 11, 1..."


In [69]:
#finding which terms have the highest open interest and volume
#and finding the z score for these terms
def high_val_term(row, val):
    return val[np.argmax(row)]

def max_zscore_by_date(row):
    return (np.max(row) - np.mean(row)) / np.std(row)

max_df.loc[:, "max_oi_zscore"] = list(map(max_zscore_by_date, max_df["max_ois"]))
max_df.loc[:, "max_volume_zscore"] = list(map(max_zscore_by_date, max_df["max_volumes"]))

max_df.loc[:, "max_oi_term"] = list(map(high_val_term, max_df["max_ois"], max_df["terms"]))
max_df.loc[:, "max_volume_term"] = list(map(high_val_term, max_df["max_volumes"], max_df["terms"]))

print(max_df[["max_oi_term", "max_volume_term"]])
print(max_df[["max_oi_zscore", "max_volume_zscore"]])

     max_oi_term  max_volume_term
0             11               11
1             10              381
2              9                9
3              8                8
4              7              287
..           ...              ...
247           28               28
248           24               24
249           23               23
250           22               22
251           21              385

[252 rows x 2 columns]
     max_oi_zscore  max_volume_zscore
0         3.139783           3.141636
1         2.783677           2.038177
2         2.785092           3.080206
3         3.042789           3.131000
4         3.048106           1.912309
..             ...                ...
247       3.595408           3.626721
248       3.494901           3.531546
249       3.464081           3.696101
250       3.567284           3.705158
251       3.432017           3.549666

[252 rows x 2 columns]


In [70]:
max_df[["max_oi_zscore", "max_volume_zscore"]].describe()

Unnamed: 0,max_oi_zscore,max_volume_zscore
count,252.0,252.0
mean,2.70722,2.848688
std,0.40639,0.521349
min,1.728948,1.700033
25%,2.460373,2.419166
50%,2.707611,2.849778
75%,2.991579,3.260007
max,3.626086,3.88143
