In [63]:
import pandas as pd
import numpy as np
import datetime as dt
import math
import matplotlib.pyplot as plt
from pandas.tseries.offsets import BDay
from tqdm import tqdm
from scipy.stats.mstats import winsorize

# Root directory that every input and output file in this notebook is resolved against
# (by plain string concatenation, hence the trailing slash).
# NOTE(review): this is a hard-coded, user-specific absolute path on a C: drive; anyone
# else running the notebook has to edit it first.
PATH = "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/"

In [64]:
# Load the stock-level BOJ purchase checkpoint and keep only the six columns
# carried forward: the merge keys (date, sedol) plus the purchase/holding
# measures and sic_2d.
boj_df = pd.read_pickle(PATH + 'checkpoint_data/boj_purchases_stock_level_v2.pkl').loc[
    :,
    ['date', 'sedol', 'shares_purchased', 'boj_share_shares_no_ff', 'boj_share_shares', 'sic_2d'],
]

In [65]:
# Read nk_df_v2.csv; its 'sedol' column is what the returns panel is filtered on below.
nk_df = pd.read_csv(PATH+'nk_df_v2.csv')

In [66]:
# Load the daily returns checkpoint, keep only firms whose SEDOL appears in nk_df,
# add `size` (cshoc * prccd scaled down by 10**12) and retain the columns used below.
ret_df = pd.read_pickle(PATH + 'checkpoint_data/returns_all_stocks.pkl')
ret_df = (
    ret_df.loc[ret_df['sedol'].isin(nk_df['sedol'])]
    # .assign returns a new frame, so the column is never written into a
    # boolean-filtered slice (which would raise SettingWithCopyWarning).
    .assign(size=lambda df: df['cshoc'] * df['prccd'] / 10**12)
    .loc[:, ['datadate', 'sedol', 'mod_prccd', 'cshoc', 'cshtrd', 'size', 'mod_ret']]
)

In [67]:
# Discard every observation dated after 2020-12-31 (the cutoff itself is kept).
ret_df = ret_df.loc[ret_df['datadate'].le(pd.Timestamp('2020-12-31'))]

In [68]:
# Load the daily controls checkpoint; the as-of merge below reads its
# datadate, sedol, mb and mb_l1 columns.
control_df = pd.read_pickle(PATH+'checkpoint_data/controls_daily.pkl')

## Merge

In [69]:
# Left-join the BOJ purchase data onto the returns panel by (sedol, date).
# Every returns row is kept; rows with no matching purchase record get NaN in
# the BOJ columns. The right-hand key `date` is redundant next to `datadate`
# after the join, so it is dropped.
vol_df = ret_df.merge(
    boj_df,
    how='left',
    left_on=['sedol', 'datadate'],
    right_on=['sedol', 'date'],
).drop(columns='date')

In [70]:
# Attach mb and mb_l1 from the controls panel with a backward as-of join:
# within each sedol, every row receives the control row with the latest
# datadate that is <= its own. merge_asof requires both inputs to be sorted
# on the `on` key, hence the two sort_values calls. No tolerance is given, so
# the matched control row may be arbitrarily old.
vol_df = pd.merge_asof(
    left=vol_df.sort_values(by='datadate'),
    right=control_df.loc[:, ['datadate', 'sedol', 'mb', 'mb_l1']].sort_values(by='datadate'),
    on='datadate',
    by='sedol',
    direction='backward',
)

In [72]:
# Turnover: cshtrd expressed as a percentage of cshoc.
vol_df['turnover'] = vol_df['cshtrd'].div(vol_df['cshoc']).mul(100)

In [74]:
# Winsorize at the 1st/99th percentiles. The bounds are computed per column over
# the whole pooled panel (all firms, all dates); values outside a column's
# bounds are replaced by the bound itself.
cols = ['cshtrd', 'size', 'mb', 'mod_ret', 'turnover', 'mb_l1']
bounds = vol_df[cols].quantile([0.01, 0.99])  # rows: 0.01 and 0.99, columns: cols
vol_df[cols] = vol_df[cols].clip(lower=bounds.loc[0.01], upper=bounds.loc[0.99], axis=1)

In [75]:
# Rows that found no BOJ record in the left merge carry NaN here; treat them as
# zero shares purchased. fillna is vectorised and, unlike math.isnan, does not
# raise on None / pd.NA entries.
vol_df['shares_purchased'] = vol_df['shares_purchased'].fillna(0)
# Purchases expressed as a percentage of cshoc.
vol_df['shares_purchased_pct'] = vol_df['shares_purchased'] / vol_df['cshoc'] * 100

In [76]:
# "Typical day" copies of volume and turnover: the value is blanked to NaN on
# every row where shares_purchased exceeds 1 and kept unchanged otherwise, so
# later averages over these columns skip those rows.
# NOTE(review): the threshold is `> 1`, not `> 0` — confirm that a purchase of
# exactly one share (or less) is meant to count as a typical day.
vol_df['cshtrd_typical_days'] = vol_df['cshtrd'].mask(vol_df['shares_purchased'].gt(1))
vol_df['turnover_typical_days'] = vol_df['turnover'].mask(vol_df['shares_purchased'].gt(1))

In [77]:
# Per-firm feature construction: one-row lags, forward leads of BOJ purchases
# and trailing "typical" volume/turnover averages. Everything here is computed
# within a single sedol so that shifts and rolling windows never cross firms.
window_size = 245            # rolling look-back, counted in rows of the firm's own series
min_obs = window_size // 3   # minimum non-NaN observations before a mean is reported
n_leads = 13                 # builds shares_purchased_pct_f1 ... shares_purchased_pct_f13

# output column -> input column for the trailing means
rolling_sources = {
    'typ_vol': 'cshtrd',
    'typ_vol_no_boj': 'cshtrd_typical_days',
    'typ_turn': 'turnover',
    'typ_turn_no_boj': 'turnover_typical_days',
}

vol_df_list = []
for sedol, sedol_vol_df in tqdm(vol_df.groupby('sedol')):
    # Shifts and rolling windows are positional, so enforce chronological order
    # here instead of relying on the order left behind by an earlier cell. The
    # stable sort is a no-op on already-sorted data and returns a new frame, so
    # the assignments below never write into a groupby slice.
    sedol_vol_df = sedol_vol_df.sort_values('datadate', kind='mergesort')

    # Previous-row values of return and volume.
    sedol_vol_df['mod_ret_l1'] = sedol_vol_df['mod_ret'].shift(1)
    sedol_vol_df['cshtrd_l1'] = sedol_vol_df['cshtrd'].shift(1)
    # sic_2d only exists on rows that matched a BOJ record; spread the firm's
    # code over all of its rows (forward first, then backward for leading gaps).
    sedol_vol_df['sic_2d'] = sedol_vol_df['sic_2d'].ffill().bfill()

    # Purchases k rows ahead, k = 1..n_leads.
    for k in range(1, n_leads + 1):
        sedol_vol_df[f'shares_purchased_pct_f{k}'] = sedol_vol_df['shares_purchased_pct'].shift(-k)

    # Trailing means shifted by one row, so the current row is excluded from its
    # own benchmark.
    for out_col, src_col in rolling_sources.items():
        sedol_vol_df[out_col] = (
            sedol_vol_df[src_col]
            .rolling(window=window_size, min_periods=min_obs)
            .mean()
            .shift(1)
        )
    vol_df_list.append(sedol_vol_df)
vol_df = pd.concat(vol_df_list)

100%|████████████████████████████████████████████████████████████████████████████████| 242/242 [00:07<00:00, 33.08it/s]


In [78]:
#get firm IDs
# Number the distinct SEDOLs 0..N-1 in sorted order. Iterating a plain set of
# strings can yield a different order in every interpreter session (string
# hashes are randomised), which would make firm_id change between runs; sorting
# keeps the mapping reproducible and mirrors how the date IDs are built.
sedol_id = {code: idx for idx, code in enumerate(sorted(vol_df['sedol'].dropna().unique()))}
vol_df['firm_id'] = vol_df['sedol'].map(sedol_id)

In [79]:
#get date IDs
# Number the distinct dates 0, 1, 2, ... in chronological order and attach the
# resulting integer to every row.
datadate_id = {day: idx for idx, day in enumerate(sorted(set(vol_df['datadate'])))}
vol_df['datadate_id'] = vol_df['datadate'].map(datadate_id)

In [86]:
# Restrict the panel to 2010-12-01 through 2020-12-31, both endpoints included.
vol_df = vol_df.loc[
    vol_df['datadate'].between(pd.Timestamp('2010-12-01'), pd.Timestamp('2020-12-31'))
]

In [87]:
# Export the finished panel as a Stata .dta file for the regressions.
# NOTE(review): write_index is left at its default, so the DataFrame index is
# presumably written out as an extra column — confirm that is intended.
vol_df.to_stata(PATH+'regression_files/dta_files/front_running.dta')