In [4]:
import datetime as dt
import math
import os

import numpy as np
import pandas as pd
from pandas.tseries.offsets import BDay
from tqdm import tqdm

# Root folder for this project's data files. The default keeps the original
# machine-specific location; set the JAPAN_QE_PATH environment variable to run
# the notebook on another machine without editing this cell.
PATH = os.environ.get("JAPAN_QE_PATH", "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/")
# Other cells build file names as PATH + 'relative/name', so a trailing
# separator is required.
if not PATH.endswith(("/", "\\")):
    PATH += "/"

In [5]:
def get_truncated_df(merge_df, columns, year_column_name=None, low=0.01, high=0.99):
    """Return a copy of ``merge_df`` with tail values of ``columns`` set to NaN.

    Parameters
    ----------
    merge_df : pandas.DataFrame
        Input frame; it is never modified.
    columns : iterable of str
        Columns whose extreme values are replaced by NaN.
    year_column_name : str or list of str, optional
        Grouping key(s). When given, the quantile cut-offs are computed
        separately inside each group; when ``None`` they are computed over
        the whole frame.
    low, high : float
        Lower and upper quantile levels used as cut-offs.

    Returns
    -------
    pandas.DataFrame
        Without grouping: same rows and order as ``merge_df``.
        With grouping: the per-group frames concatenated in the order
        ``groupby`` yields them (rows whose key ``groupby`` does not return,
        e.g. NaN keys under pandas' defaults, are absent). An input that
        produces no groups yields an empty frame with the same columns.

    Notes
    -----
    The two modes treat the cut-offs differently, as in the original
    implementation: without grouping a value must lie strictly between the
    two quantiles to be kept, whereas with grouping values equal to a
    cut-off are kept. NaN inputs stay NaN in both modes.
    """

    def _mask_tails(frame, keep_bounds):
        # The trimmed column is assigned back explicitly. Calling
        # ``frame[column].where(..., inplace=True)`` mutates a temporary
        # Series and leaves ``frame`` untouched under pandas Copy-on-Write.
        for column in columns:
            values = frame[column]
            lower, upper = values.quantile(q=low), values.quantile(q=high)
            if keep_bounds:
                keep = ~((values > upper) | (values < lower))
            else:
                keep = (values < upper) & (values > lower)
            frame[column] = values.where(keep)
        return frame

    if year_column_name is None:
        return _mask_tails(merge_df.copy(), keep_bounds=False)

    # Copy each group so the assignment above never writes into a slice of
    # the caller's frame.
    groups = [group_df.copy() for _, group_df in merge_df.groupby(year_column_name)]
    if not groups:
        # pd.concat([]) raises; an empty result is the natural answer here.
        return merge_df.iloc[0:0].copy()

    return pd.concat([_mask_tails(group_df, keep_bounds=True) for group_df in tqdm(groups)])

def get_boj_quintiles_conditional(sue_df_boj_merged, suffix=None, quantiles=5):
    """Attach per-quarter quantile labels of ``boj_share_shares`` to the frame.

    The distinct (``sedol``, ``yr_qtr_index``, ``boj_share_shares``) rows are
    split by ``yr_qtr_index`` and, inside each quarter, ``boj_share_shares``
    is binned with ``pd.qcut(..., labels=False)``. The labels are then merged
    back onto ``sue_df_boj_merged`` on those three columns (inner join).

    Parameters
    ----------
    sue_df_boj_merged : pandas.DataFrame
        Must contain ``sedol``, ``yr_qtr_index`` and ``boj_share_shares``.
    suffix : str, optional
        Appended to the new column name as
        ``'boj_share_shares_quantiles_' + suffix``. With ``None`` the column
        is called ``'boj_share_shares_quantiles'`` (the original code raised
        ``TypeError`` in this case).
    quantiles : int or list-like
        Passed straight to ``pd.qcut`` as its ``q`` argument.

    Returns
    -------
    pandas.DataFrame
        ``sue_df_boj_merged`` with the label column added. Quarters for which
        ``pd.qcut`` raises ``ValueError`` (e.g. non-unique bin edges) get NaN
        labels. If no quarter groups exist the result is an empty frame that
        still carries the label column.
    """
    quantile_column = 'boj_share_shares_quantiles'
    if suffix is not None:
        quantile_column = quantile_column + '_' + str(suffix)

    key_columns = ['sedol', 'yr_qtr_index', 'boj_share_shares']
    unique_holdings = sue_df_boj_merged[key_columns].drop_duplicates()

    labelled_quarters = []
    for _, quarter_df in tqdm(unique_holdings.groupby('yr_qtr_index')):
        try:
            labels = pd.qcut(quarter_df['boj_share_shares'], quantiles, labels=False)
        except ValueError:
            # Only binning failures are tolerated; a bare ``except`` would
            # also hide KeyboardInterrupt and genuine programming errors.
            labels = float('nan')
        # ``assign`` builds a new frame instead of writing into the groupby slice.
        labelled_quarters.append(quarter_df.assign(**{quantile_column: labels}))

    if not labelled_quarters:
        # pd.concat([]) raises; an inner merge against nothing is empty anyway.
        return sue_df_boj_merged.iloc[0:0].assign(**{quantile_column: float('nan')})

    return pd.merge(sue_df_boj_merged,
                    pd.concat(labelled_quarters),
                    on=key_columns)

In [6]:
# Load the checkpointed returns frame (file name suggests one row per stock-day
# for all stocks — TODO confirm). Later cells read its 'mod_ret', 'cshtrd',
# 'prccd', 'cshoc', 'sedol', 'yr_qtr_index' and 'datadate' columns.
# NOTE(review): unpickling can execute arbitrary code; only load files you created.
ret_df = pd.read_pickle(PATH+'checkpoint_data/returns_all_stocks.pkl')

In [9]:
# Row-level Amihud ratio: |mod_ret| divided by (cshtrd * prccd).
# Presumably return over traded value (volume x price) — confirm column meanings.
ret_df['amihud_raw'] = ret_df['mod_ret'].abs().div(ret_df['cshtrd'].mul(ret_df['prccd']))
# Row-level turnover: cshtrd divided by cshoc.
ret_df['turn_raw_no_ff'] = ret_df['cshtrd'].div(ret_df['cshoc'])

In [10]:
def _finite_log(value):
    """Return ln(value), or NaN when the log is undefined or infinite.

    A zero mean gives log(0) = -inf (with a divide-by-zero RuntimeWarning) and
    an infinite mean gives +inf. Infinite values can turn the quantile
    cut-offs used when truncating these columns into inf/NaN, so they are
    recorded as missing instead.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        logged = np.log(value)
    return logged if np.isfinite(logged) else np.nan


# One entry per (sedol, yr_qtr_index) group; the parallel lists below are
# turned into the columns of liq_df in the next cell.
amihud_list = []
turn_no_ff_list = []
num_days_total_list = []
num_days_w_data_list = []
coverage_ratio_list = []

datadate_list = []
yr_qtr_index_list = []
sedol_list = []

for (sedol, yr_qtr_index), df in tqdm(ret_df.groupby(['sedol', 'yr_qtr_index'])):
    # 'datadate' of the group's first row. Reading the column directly avoids
    # building a mixed-dtype row Series for every group.
    datadate = df['datadate'].iloc[0]

    # Log of the group mean (NaNs skipped by Series.mean); the Amihud mean is
    # scaled by 10**6 before taking the log.
    amihud_qtr = _finite_log(10**6 * df['amihud_raw'].mean())
    turn_no_ff_qtr = _finite_log(df['turn_raw_no_ff'].mean())

    # Coverage: share of the group's rows with a non-null amihud_raw.
    num_days_total = len(df)
    num_days_w_data = int(df['amihud_raw'].notna().sum())
    coverage_ratio = num_days_w_data / num_days_total

    amihud_list.append(amihud_qtr)
    turn_no_ff_list.append(turn_no_ff_qtr)

    num_days_total_list.append(num_days_total)
    num_days_w_data_list.append(num_days_w_data)
    coverage_ratio_list.append(coverage_ratio)

    datadate_list.append(datadate)
    sedol_list.append(sedol)
    yr_qtr_index_list.append(yr_qtr_index)

  amihud_qtr = np.log(10**6 * df['amihud_raw'].mean())
100%|█████████████████████████████████████████████████████████████████████████| 172472/172472 [08:23<00:00, 342.60it/s]


In [11]:
# Assemble the per-(sedol, quarter) liquidity table from the lists filled by the
# aggregation loop, then derive helper columns and sort.
liq_columns = {
    'sedol': sedol_list,
    'datadate': datadate_list,
    'yr_qtr_index': yr_qtr_index_list,
    'amihud': amihud_list,
    'turn_no_ff': turn_no_ff_list,
    'num_days_total': num_days_total_list,
    'num_days_w_data': num_days_w_data_list,
    'coverage_ratio': coverage_ratio_list,
}

liq_df = (
    pd.DataFrame(liq_columns)
    .assign(
        # 'yr' is the first element of each yr_qtr_index entry.
        yr=lambda frame: frame['yr_qtr_index'].apply(lambda key: key[0]),
        # Recomputed from the two count columns (overwrites the list-based values in place).
        coverage_ratio=lambda frame: frame['num_days_w_data'] / frame['num_days_total'],
    )
    .sort_values(['sedol', 'yr_qtr_index'])
)

In [13]:
# (year, quarter) of datadate shifted three calendar months forward.
datadate_plus_3m = liq_df['datadate'] + pd.DateOffset(months=3)
liq_df['yr_qtr_index_f1'] = datadate_plus_3m.apply(lambda stamp: (stamp.year, stamp.quarter))

In [14]:
# Null out 'amihud' and 'turn_no_ff' values outside the 1%-99% quantile range,
# with cut-offs computed separately for each yr_qtr_index group, then restore
# the (sedol, yr_qtr_index) ordering.
liq_truncated = get_truncated_df(
    liq_df,
    columns=['amihud', 'turn_no_ff'],
    year_column_name=['yr_qtr_index'],
    low=0.01,
    high=0.99,
)
liq_df = liq_truncated.sort_values(['sedol', 'yr_qtr_index'])

100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 87.23it/s]


In [15]:
# Persist the truncated quarterly liquidity table as a checkpoint pickle under PATH.
liq_df.to_pickle(PATH+'checkpoint_data/liq.pkl')