In [1]:
import numpy as np
import pandas as pd
import datetime
import pyarrow.parquet as pq
import os
import matplotlib.pyplot as plt

#### Revisit Data Quality issue



In [2]:
### Preparation:
# 1. read data file names
folder_path = "F:/volume_prediction"
filenames = os.listdir(folder_path)
filenames = [x for x in filenames if x.endswith(".parquet")]

filenames

['barra_factor_exposures.parquet',
 'earnings.parquet',
 'pricing_and_volumes.parquet',
 'r3k_sec_mapping.parquet']

In [3]:
# 2. load data
from model_training_utils import read_data
# using pyarrow.parquet.read_table to load data
# columns and filters are useful to save memory


### [1] Find the universe after mapping
use the intersection of universe

In [4]:
# final universe
predictors_path = "F:/predictors"
final_universe = read_data(filename="final_universe",
                               columns=['date', 'barrid', 'isin', 'fsym_id'], 
                               folder_path=predictors_path)

In [5]:
# check the time range of the final universe
dates_list = final_universe.date.unique()
min(dates_list), max(dates_list)

(datetime.date(2019, 1, 2), datetime.date(2023, 12, 29))

### Features/Predictors

### 1. Technical signals (“tech”)
From paper:  
Technical signals (“tech”): lagged moving averages of returns and log dollar volume over the past 1, 5, 22, and 252 days.

In [6]:
pv_df = read_data(filename="pricing_and_volumes",
                  columns=['date', 'barrid', 'adj_price', 'adj_volume', 'is_adj_date'], 
                  folder_path=folder_path)

pv_df = final_universe.merge(pv_df, on=['date', 'barrid'], how="left")
pv_df = pv_df.sort_values(by=["isin", "date"], ignore_index=True)
pv_df.loc[:, "adj_price"]=pv_df["adj_price"].replace(0, np.nan)
pv_df.loc[:, "log_adj_volume"]=np.log(pv_df["adj_volume"].replace(0, np.nan).replace(np.float32("inf"), np.nan))
pv_df.loc[:, "return"]=pv_df.groupby(["barrid"])['adj_price'].pct_change(fill_method=None)

pv_df = pv_df[['barrid', 'date', 'isin', 'log_adj_volume', 'return', 'is_adj_date']]

In [7]:
# https://www.isin.org/can-a-company-have-more-than-one-isin/
multiple_isin = pv_df[["barrid", "isin"]].drop_duplicates().groupby("barrid").count()

In [8]:
multiple_isin[multiple_isin.values>1]

Unnamed: 0_level_0,isin
barrid,Unnamed: 1_level_1
USA1K61,2
USA1RY3,2
USA1SS1,2
USA1X81,2
USA1Y71,2
...,...
USBFW41,2
USBFXO1,2
USBFZ71,2
USBGUH2,2


In [9]:
pv_df[pv_df.barrid=="USA1K61"].groupby("isin").date.max()

isin
US6496045013    2023-03-08
US6496048405    2023-12-29
Name: date, dtype: object

In [10]:
pv_df[pv_df.barrid=="USA1K61"].groupby("isin").date.min()

isin
US6496045013    2019-01-02
US6496048405    2023-03-09
Name: date, dtype: object

In [11]:
pv_df[(pv_df.barrid=="USA1K61")&(pv_df.date>=datetime.date(2023,3, 5))\
&(pv_df.date<=datetime.date(2023,3, 10))]

Unnamed: 0,barrid,date,isin,log_adj_volume,return,is_adj_date
2380809,USA1K61,2023-03-06,US6496045013,13.134276,-0.015971,False
2380810,USA1K61,2023-03-07,US6496045013,12.998518,-0.019975,False
2380811,USA1K61,2023-03-08,US6496045013,13.646385,-0.020382,False
2380812,USA1K61,2023-03-09,US6496048405,14.443053,-0.045514,True
2380813,USA1K61,2023-03-10,US6496048405,14.32361,-0.036785,False


In [12]:
pv_df[(pv_df.barrid=="USA1K61")&(pv_df.is_adj_date)]

Unnamed: 0,barrid,date,isin,log_adj_volume,return,is_adj_date
2379818,USA1K61,2019-03-28,US6496045013,13.724984,0.004644,True
2379877,USA1K61,2019-06-21,US6496045013,13.827114,-0.008746,True
2379938,USA1K61,2019-09-18,US6496045013,14.210334,0.009622,True
2380003,USA1K61,2019-12-19,US6496045013,13.861867,-0.00409,True
2380135,USA1K61,2020-06-30,US6496045013,13.673772,-0.007937,True
2380194,USA1K61,2020-09-23,US6496045013,13.802788,-0.051562,True
2380253,USA1K61,2020-12-16,US6496045013,13.439316,-0.005302,True
2380319,USA1K61,2021-03-24,US6496045013,13.868507,-0.004374,True
2380382,USA1K61,2021-06-23,US6496045013,13.756181,-0.014803,True
2380445,USA1K61,2021-09-22,US6496045013,13.403049,0.020869,True


In [1]:
13.597571/14.291685-1

-0.048567681137668406

In [2]:
14.443053/13.597571	-1

0.062178899451968306

In [3]:
14.930832/13.597571-1

0.0980514093289162

In [13]:
multiple_barrid = pv_df[["barrid", "isin"]].drop_duplicates().groupby("isin").count()
multiple_barrid = multiple_barrid[multiple_barrid.values>1]
multiple_barrid = pv_df[["barrid", "isin"]].drop_duplicates().groupby("isin").agg(list).loc[multiple_barrid.index]
multiple_barrid

Unnamed: 0_level_0,barrid
isin,Unnamed: 1_level_1
IE00BLNN3691,"[USAEOF1, USBE2D1]"
US0235861004,"[USARZ41, USARZ42]"
US03748R7474,"[USARWM1, USBFDP1]"
US1416191062,"[USA8TQ1, GERRJO1]"
US4219461047,"[USAQGH1, GERJEF1]"
VGG1110E1079,"[USBDMI1, USBOFX1]"


In [14]:
for isin_i in multiple_barrid.index:
    start_date = pv_df[pv_df["isin"]==isin_i].groupby("barrid").date.max().iloc[0]
    end_date =pv_df[pv_df["isin"]==isin_i].groupby("barrid").date.min().iloc[1]
    print(isin_i, start_date, end_date)

IE00BLNN3691 2019-05-14 2022-06-27
US0235861004 2022-11-09 2022-11-10
US03748R7474 2020-12-14 2020-12-15
US1416191062 2023-04-28 2019-01-02
US4219461047 2022-07-21 2019-01-02
VGG1110E1079 2022-10-04 2022-10-05


In [15]:
isin_i="US1416191062"
print("Min Date: ", pv_df[pv_df["isin"]==isin_i].groupby("barrid").date.min())
print("Min Date: ", pv_df[pv_df["isin"]==isin_i].groupby("barrid").date.max())
pv_df[pv_df["barrid"]=="GERRJO1"].shape[0]

Min Date:  barrid
GERRJO1    2023-04-28
USA8TQ1    2019-01-02
Name: date, dtype: object
Min Date:  barrid
GERRJO1    2023-04-28
USA8TQ1    2023-04-27
Name: date, dtype: object


1

In [16]:
isin_i="US4219461047"
print("Min Date: ", pv_df[pv_df["isin"]==isin_i].groupby("barrid").date.min())
print("Min Date: ", pv_df[pv_df["isin"]==isin_i].groupby("barrid").date.max())
pv_df[pv_df["barrid"]=="GERJEF1"].shape[0]

Min Date:  barrid
GERJEF1    2022-07-21
USAQGH1    2019-01-02
Name: date, dtype: object
Min Date:  barrid
GERJEF1    2022-07-21
USAQGH1    2022-07-20
Name: date, dtype: object


1

In [17]:
pv_df[(pv_df["isin"]=="IE00BLNN3691")&(pv_df.date>=datetime.date(2019,5, 10))\
&(pv_df.date<=datetime.date(2022,6, 30))]

Unnamed: 0,barrid,date,isin,log_adj_volume,return,is_adj_date
95221,USAEOF1,2019-05-10,IE00BLNN3691,17.83341,-0.026316,False
95222,USAEOF1,2019-05-13,IE00BLNN3691,,0.0,False
95223,USAEOF1,2019-05-14,IE00BLNN3691,20.511644,-0.864865,False
95224,USBE2D1,2022-06-27,IE00BLNN3691,14.330241,,False
95225,USBE2D1,2022-06-28,IE00BLNN3691,13.735524,0.00734,False
95226,USBE2D1,2022-06-29,IE00BLNN3691,13.746185,-0.015431,False
95227,USBE2D1,2022-06-30,IE00BLNN3691,13.924104,-0.083152,False
