## Description

Analyzes one day of TAQ bar prices from IG for a single ticker (AAPL) and compares them with a reference price series.

In [2]:
import datetime

import pandas as pd
import s3fs
import seaborn as sns
import statsmodels
import statsmodels.api
from pyarrow import parquet

# IG data

In [5]:
# Read one day of IG TAQ bars into a dataframe: 2019-01-07 is a Monday.
# The active path points at the `60` bar directory; alternative inputs from
# the `300` directory are kept here for convenience:
# path = "s3://iglp-core-data/ds/ext/bars/taq/v1.0-prod/300/20210603/data.parquet"
# path = "s3://iglp-core-data/ds/ext/bars/taq/v1.0-prod/300/20190107/data.parquet"
path = "s3://iglp-core-data/ds/ext/bars/taq/v1.0-prod/60/20190107/data.parquet"
# Passed straight to `read()`; presumably `None` means "all columns" - confirm.
columns = None
# S3 URIs get an explicit s3fs filesystem; any other path lets pyarrow pick
# its default.
if path.startswith("s3://"):
    filesystem = s3fs.S3FileSystem()
else:
    filesystem = None
dataset = parquet.ParquetDataset(path, filesystem=filesystem)
table = dataset.read(columns=columns)
df_taq_bars = table.to_pandas()

In [6]:
# Sanity summary of the loaded bars: dimensions, schema, number of distinct
# identifiers, and the currencies present.
print("df_taq_bars.shape=", df_taq_bars.shape)

print(df_taq_bars.columns)

# `dropna=False` counts a missing value as one distinct entry, exactly like
# `len(series.unique())`.
for label, id_col in (("tickers=", "ticker"), ("igid=", "igid")):
    print(label, df_taq_bars[id_col].nunique(dropna=False))

print("currency=", df_taq_bars["currency"].unique())

df_taq_bars.shape= (4079164, 50)
Index(['vendor_date', 'interval', 'start_time', 'end_time', 'ticker',
       'currency', 'open', 'close', 'low', 'high', 'volume', 'notional',
       'last_trade_time', 'all_day_volume', 'all_day_notional', 'day_volume',
       'day_notional', 'day_vol_prc_sqr', 'day_num_trade', 'bid', 'ask',
       'bid_size', 'ask_size', 'good_bid', 'good_ask', 'good_bid_size',
       'good_ask_size', 'day_spread', 'day_num_spread', 'day_low', 'day_high',
       'last_trade', 'last_trade_volume', 'bid_high', 'ask_high', 'bid_low',
       'ask_low', 'sided_bid_count', 'sided_bid_shares', 'sided_bid_notional',
       'day_sided_bid_count', 'day_sided_bid_shares', 'day_sided_bid_notional',
       'sided_ask_count', 'sided_ask_shares', 'sided_ask_notional',
       'day_sided_ask_count', 'day_sided_ask_shares', 'day_sided_ask_notional',
       'igid'],
      dtype='object')
tickers= 8516
igid= 8476
currency= ['USD']


In [12]:
# print("\n".join(map(str, df_taq_bars.iloc[0].values)))

In [13]:
# Show the first and last three rows of the raw bars.
display(df_taq_bars.head(3), df_taq_bars.tail(3))

Unnamed: 0,vendor_date,interval,start_time,end_time,ticker,currency,open,close,low,high,...,day_sided_bid_count,day_sided_bid_shares,day_sided_bid_notional,sided_ask_count,sided_ask_shares,sided_ask_notional,day_sided_ask_count,day_sided_ask_shares,day_sided_ask_notional,igid
0,2019-01-07,60,1546869600,1546869660,A,USD,,,,,...,0,0,0.0,0,0,0.0,0,0,0.0,16572.0
1,2019-01-07,60,1546869600,1546869660,AA,USD,,,,,...,0,0,0.0,0,0,0.0,0,0,0.0,1218568.0
2,2019-01-07,60,1546869600,1546869660,AAAU,USD,,,,,...,0,0,0.0,0,0,0.0,0,0,0.0,1428781.0


Unnamed: 0,vendor_date,interval,start_time,end_time,ticker,currency,open,close,low,high,...,day_sided_bid_count,day_sided_bid_shares,day_sided_bid_notional,sided_ask_count,sided_ask_shares,sided_ask_notional,day_sided_ask_count,day_sided_ask_shares,day_sided_ask_notional,igid
4079161,2019-01-07,60,1546898280,1546898340,ZXZZ.T,USD,,,,,...,8,1000,55009.0,0,0,0.0,6,700,39400.0,926565.0
4079162,2019-01-07,60,1546898280,1546898340,ZYME,USD,,,,,...,366,69447,1060614.0,0,0,0.0,722,120109,1843956.0,1228878.0
4079163,2019-01-07,60,1546898280,1546898340,ZYNE,USD,,,,,...,1036,224384,902623.6,0,0,0.0,1331,223640,904867.8,927406.0


In [14]:
# Get AAPL data for a subset of columns.
mask = df_taq_bars["ticker"] == "AAPL"
df_ig = df_taq_bars[mask]
print(df_ig.shape)

columns = [
    "start_time",
    "end_time",
    "ticker",
    "currency",
    "open",
    "close",
    "low",
    "high",
    "volume",
]
df_ig = df_ig[columns]

df_ig.head(3)

(479, 50)


Unnamed: 0,start_time,end_time,ticker,currency,open,close,low,high,volume
13,1546869600,1546869660,AAPL,USD,,,,,0
8529,1546869660,1546869720,AAPL,USD,,,,,0
17045,1546869720,1546869780,AAPL,USD,,,,,0


In [15]:
def to_et(df, col_name):
    """
    Convert a column of Unix epoch seconds to tz-aware Eastern Time.

    The epoch values are interpreted as UTC instants and then expressed in
    the "America/New_York" time zone. The conversion goes through
    `pd.to_datetime(..., unit="s", utc=True)` rather than
    `datetime.datetime.fromtimestamp()`: the latter returns naive timestamps
    in the local time zone of the machine running the kernel, so labelling
    them as UTC gives shifted results whenever that zone is not UTC.

    :param df: dataframe containing `col_name` as epoch seconds
    :param col_name: name of the column to convert
    :return: copy of `df` with `col_name` replaced by tz-aware
        "America/New_York" timestamps; the input dataframe is not modified
    """
    df = df.copy()
    # Vectorized and independent of the kernel's local time zone; also works
    # on an empty column.
    vals = pd.to_datetime(df[col_name], unit="s", utc=True)
    df[col_name] = vals.dt.tz_convert("America/New_York")
    return df


# Convert both epoch columns to Eastern Time, then index the bars by their
# start time.
df_ig2 = to_et(to_et(df_ig, "start_time"), "end_time").set_index(
    "start_time", drop=True
)

display(df_ig2.head())

Unnamed: 0_level_0,end_time,ticker,currency,open,close,low,high,volume
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-07 09:00:00-05:00,2019-01-07 09:01:00-05:00,AAPL,USD,,,,,0
2019-01-07 09:01:00-05:00,2019-01-07 09:02:00-05:00,AAPL,USD,,,,,0
2019-01-07 09:02:00-05:00,2019-01-07 09:03:00-05:00,AAPL,USD,,,,,0
2019-01-07 09:03:00-05:00,2019-01-07 09:04:00-05:00,AAPL,USD,,,,,0
2019-01-07 09:04:00-05:00,2019-01-07 09:05:00-05:00,AAPL,USD,,,,,0


In [None]:
# First few IG bars whose start_time (the index) is later than 09:27.
display(df_ig2[df_ig2.index > "2019-01-07 09:27"].head())

In [None]:
# Last few IG bars whose start_time (the index) is earlier than 16:03.
display(df_ig2[df_ig2.index < "2019-01-07 16:03"].tail())

In [None]:
# Plot the IG close price against bar start time.
df_ig2["close"].plot()

In [None]:
# Plot the IG per-bar volume against bar start time.
df_ig2["volume"].plot()

# Load ref data

In [None]:
# Load the reference AAPL series and show it as read from disk.
file_name = "/app/aapl.csv"
df_ref = pd.read_csv(file_name)
display(df_ref.head())

# Parse the `datetime` column, label it as Eastern Time (no conversion is
# applied, so the values are assumed to already be ET wall-clock times), and
# use it as the index.
df_ref = df_ref.assign(
    datetime=pd.to_datetime(df_ref["datetime"]).dt.tz_localize(
        "America/New_York"
    )
).set_index("datetime", drop=True)

# Keep 2019-01-07 09:30 through 16:00 (both ends inclusive) and align the
# volume column name with the IG data.
in_window = (df_ref.index >= "2019-01-07 09:30") & (
    df_ref.index <= "2019-01-07 16:00"
)
df_ref = df_ref[in_window].rename({"vol": "volume"}, axis="columns")

display(df_ref.head())
display(df_ref.tail())

In [None]:
# display(df_ref[df_ref.index > "2019-01-07 09:27"].head())

# display(df_ref[df_ref.index < "2019-01-07 16:03"].tail())

In [None]:
# Plot the reference close price over the selected window.
df_ref["close"].plot()

In [None]:
# Plot the reference volume over the selected window.
df_ref["volume"].plot()

# Comparison

In [None]:
# Show the start of the IG and reference frames, one after the other.
display(df_ig2.head(), df_ref.head())

In [None]:
# Notice how the 16:00:00 bars differ
# (rows containing any missing value are dropped before taking the tail).
display(df_ig2.dropna().tail(), df_ref.dropna().tail())

In [None]:
# Column compared between the IG and reference data in the cells below.
target_col_name = "close"

In [None]:
# Align the IG and reference series on their timestamp indices. The outer
# join keeps timestamps present in only one source (as NaN in the other), and
# the suffixes tell the two columns apart.
col_names = [target_col_name]
ig_part = df_ig2[col_names]
ref_part = df_ref[col_names]
df_all = ig_part.merge(
    ref_part,
    how="outer",
    left_index=True,
    right_index=True,
    suffixes=["_ig", "_ref"],
)
df_all.head()

In [None]:
# Notice that the precisions appear to be different across the two columns
#   (and within the same day for the "_ig" column)
# Only timestamps where both sources have a value are shown.
df_all_complete = df_all.dropna()
display(df_all_complete.head(), df_all_complete.tail())

In [None]:
# Last rows where both sources have a value (same view as the tail displayed
# in the previous cell).
df_all.dropna().tail()

In [None]:
# Overlay the IG and reference series on one chart.
df_all.plot()

In [None]:
# Name of the first column, i.e. the "_ig" series given the merge order.
df_all.columns[0]

In [None]:
def calculate_diffs(df, shifts=0):
    """
    Difference each column and compare the changes of the first two columns.

    :param df: dataframe with at least two numeric columns
    :param shifts: number of rows by which the second column's changes are
        shifted before being subtracted from the first column's changes
    :return: new dataframe holding the row-over-row differences of every
        column plus a `diff_of_diffs` column; the input is not modified
    """
    deltas = df.diff()
    first_col, second_col = deltas.columns[0], deltas.columns[1]
    lagged_second = deltas[second_col].shift(shifts)
    deltas["diff_of_diffs"] = deltas[first_col] - lagged_second
    return deltas

In [None]:
# Bar-over-bar changes of both series with no lag applied to the second one.
diffs = calculate_diffs(df_all, 0)

In [None]:
# Rows where both changes and their difference are defined.
diffs.dropna()

In [None]:
# Per-bar disagreement between the changes of the two series.
diffs["diff_of_diffs"].plot()

In [None]:
# Running sum of the per-bar disagreement.
diffs["diff_of_diffs"].cumsum().plot()

In [None]:
# Distribution of the per-bar disagreement, using 30 bins.
diffs["diff_of_diffs"].hist(bins=30)

In [None]:
# (mean, standard deviation) of the per-bar disagreement.
diffs["diff_of_diffs"].mean(), diffs["diff_of_diffs"].std()

In [None]:
# Total absolute per-bar disagreement between the two series.
diffs["diff_of_diffs"].abs().sum()

In [None]:
# Force all the data to be centered around 100.
# Each column has its own mean subtracted and then 100 added, so both series
# end up with mean 100 regardless of any level offset between the sources.
# NOTE: `df_all` itself is updated here, so every cell executed after this one
# sees the re-centered values rather than the original prices.
df_all -= df_all.mean(axis=0)
df_all += 100.0

df_all.plot()

In [None]:
# Row-over-row percentage changes of `df_all`.
# NOTE(review): when run in notebook order `df_all` has already been
# re-centered around 100, so these are not returns on the raw prices -
# confirm this is intended.
rets = df_all.pct_change()

rets.plot()

In [None]:
# Zoom into a short window to compare the two series bar by bar; the
# commented lines are alternative windows.
# df_all[col_names].loc["2019-01-07 09:30":"2019-01-07 12:00"].plot()
# df_all[col_names].loc["2019-01-07 09:30":"2019-01-07 09:35"].plot()
df_all.loc["2019-01-07 09:35":"2019-01-07 09:40"].plot()

In [None]:
# Regress the changes of the first series in `diffs` on the changes of the
# second one and show the fit next to a joint plot of the two.
predicted_var = diffs.columns[0]
predictor_var = diffs.columns[1]

df = diffs[[predicted_var, predictor_var]].copy()
# A shift of 0 leaves the target untouched; use a non-zero value to regress a
# lagged or led target instead.
df[predicted_var] = df[predicted_var].shift(0)
df = df.dropna()

intercept = True
# `hasconst=True` only tells statsmodels that the regressors already contain a
# constant column; it does not add one. Build the design matrix explicitly so
# that the flag matches the data and the summary statistics are computed for
# the model that is actually fitted.
exog = df[[predictor_var]]
if intercept:
    exog = statsmodels.api.add_constant(exog)
model = statsmodels.api.OLS(df[predicted_var], exog, hasconst=intercept).fit()
print(model.summary().as_text())

sns.jointplot(x=predictor_var, y=predicted_var, data=df)