In [2]:
import datetime as dt
import math
import os

import pandas as pd
from pandas.tseries.offsets import BDay
from tqdm import tqdm

# Project data root. Every input/output location below is built as PATH + '<relative path>',
# so the value must end with a path separator.
# Set the JAPAN_QE_PATH environment variable to run the notebook on another machine;
# when it is unset, the original local directory is used unchanged.
# os.path.join(<root>, "") appends a separator only if one is missing.
PATH = os.path.join(
    os.environ.get("JAPAN_QE_PATH", "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/"),
    "",
)

In [8]:
def get_truncated_df(merge_df, columns, year_column_name=None, low=0.01, high=0.99):
    """Return a copy of ``merge_df`` with the tails of ``columns`` replaced by NaN.

    For every column in ``columns`` the ``low`` and ``high`` quantiles are computed,
    either over the whole frame or separately within each group of
    ``year_column_name``, and values outside that band are set to NaN.
    ``merge_df`` itself is never modified.

    Parameters
    ----------
    merge_df : pandas.DataFrame
        Input data.
    columns : iterable of str
        Names of the numeric columns to truncate.
    year_column_name : str or None, default None
        Column to group by before computing the quantiles. ``None`` means the
        quantiles are pooled over all rows.
    low, high : float
        Lower and upper quantile levels passed to ``Series.quantile``.

    Returns
    -------
    pandas.DataFrame
        Pooled mode: same rows and row order as ``merge_df``.
        Grouped mode: the per-group frames concatenated in group-key order; rows
        whose group key is missing are not returned because ``groupby`` skips them.
        If there are no groups at all, an empty frame with the input columns.

    Notes
    -----
    The two modes treat values lying exactly on a cutoff differently, and this is
    kept as in the original implementation: pooled mode uses strict inequalities
    (a value equal to a cutoff becomes NaN), grouped mode keeps such values.
    """
    merge_df_copy = merge_df.copy()

    if year_column_name is None:
        for column in columns:
            values = merge_df_copy[column]
            lower, upper = values.quantile(q=low), values.quantile(q=high)
            # Assign the result back rather than calling .where(..., inplace=True) on
            # the selected column: that is chained assignment, which leaves the frame
            # untouched when pandas Copy-on-Write is active.
            merge_df_copy[column] = values.where((values < upper) & (values > lower), math.nan)
        return merge_df_copy

    truncated_groups = []
    # Materialise the groups first so tqdm knows the total number of iterations.
    groups = list(merge_df_copy.groupby(year_column_name))
    for _, group_df in tqdm(groups):
        # Work on an explicit copy; the frames yielded by groupby are slices of the parent.
        group_df = group_df.copy()
        for column in columns:
            values = group_df[column]
            lower, upper = values.quantile(q=low), values.quantile(q=high)
            group_df[column] = values.where(~((values > upper) | (values < lower)), math.nan)
        truncated_groups.append(group_df)

    if not truncated_groups:
        # pd.concat raises on an empty list; return an empty frame with the same columns.
        return merge_df_copy.iloc[0:0]
    return pd.concat(truncated_groups)

def get_boj_quintiles_conditional(sue_df_boj_merged, suffix=None, quantiles=5):
    """Attach per-quarter quantile buckets of ``boj_share_shares`` to the input frame.

    The distinct ``(sedol, yr_qtr_index, boj_share_shares)`` combinations are
    bucketed with ``pd.qcut`` separately within each ``yr_qtr_index``; the bucket
    label is then merged back onto ``sue_df_boj_merged`` on those three columns.

    Parameters
    ----------
    sue_df_boj_merged : pandas.DataFrame
        Must contain the columns ``sedol``, ``yr_qtr_index`` and ``boj_share_shares``.
    suffix : str or None, default None
        Appended to the new column name as ``boj_share_shares_quantiles_<suffix>``.
        With ``None`` the column is named ``boj_share_shares_quantiles``
        (previously the default raised ``TypeError``).
    quantiles : int, default 5
        Number of buckets passed to ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        Inner merge of the input with the bucket labels (integer codes starting at
        0, since ``labels=False``). Quarters for which ``pd.qcut`` fails get NaN in
        the new column. Rows whose ``yr_qtr_index`` is missing are not returned,
        because ``groupby`` skips them and the merge is an inner join.
    """
    quantile_column = 'boj_share_shares_quantiles'
    if suffix is not None:
        quantile_column = quantile_column + '_' + suffix

    key_columns = ['sedol', 'yr_qtr_index', 'boj_share_shares']
    sue_df_boj_merged_nd = sue_df_boj_merged[key_columns].drop_duplicates()

    per_quarter_frames = []
    for _, quarter_df in tqdm(sue_df_boj_merged_nd.groupby('yr_qtr_index')):
        # Copy before adding a column; the frames yielded by groupby are slices of the parent.
        quarter_df = quarter_df.copy()
        try:
            quarter_df[quantile_column] = pd.qcut(quarter_df['boj_share_shares'], quantiles, labels=False)
        except Exception:
            # Best effort, as before: a quarter that cannot be bucketed (e.g. qcut
            # reports non-unique bin edges) is kept with missing labels.
            # `Exception` instead of a bare `except` so Ctrl-C still interrupts the loop.
            quarter_df[quantile_column] = math.nan
        per_quarter_frames.append(quarter_df)

    if per_quarter_frames:
        sue_df_boj_merged_nd = pd.concat(per_quarter_frames)
    else:
        # pd.concat raises on an empty list; use an empty lookup table so the
        # merge below simply yields no rows.
        sue_df_boj_merged_nd = sue_df_boj_merged_nd.iloc[0:0].assign(**{quantile_column: math.nan})

    return pd.merge(sue_df_boj_merged, sue_df_boj_merged_nd, on=key_columns)

## Returns

In [25]:
# Load the raw stock-level CSV; `datadate` is parsed into datetimes on read.
ret_df = pd.read_csv(
    filepath_or_buffer=PATH+'raw_data/comp_global/all_japanese_stocks.csv',
    parse_dates=['datadate'],
)

In [26]:
# Keep only rows that carry a SEDOL identifier and store that identifier as a string.
ret_df = (
    ret_df
    .dropna(subset=['sedol'])
    .assign(sedol=lambda frame: frame['sedol'].astype(str))
)

In [27]:
# `sedol` is the stock key used from here on; the other two identifier columns are dropped.
ret_df = ret_df.drop(['gvkey', 'iid'], axis=1)

In [28]:
# Modified price: prccd * trfd / ajexdi.
# NOTE(review): presumably close price x total-return factor / split-adjustment factor — confirm.
ret_df['mod_prccd'] = ret_df['prccd'].mul(ret_df['trfd']).div(ret_df['ajexdi'])

# Return = percentage change of the modified price from the previous row of the same stock.
# NOTE(review): relies on rows already being in date order within each sedol — confirm.
ret_df['mod_ret'] = ret_df['mod_prccd'].groupby(ret_df['sedol']).pct_change()

# (year, quarter) tuple identifying the calendar quarter of each observation.
ret_df['yr_qtr_index'] = ret_df['datadate'].apply(lambda stamp: (stamp.year, stamp.quarter))

In [31]:
# Remove observations stamped with one of five specific dates.
# All five fall on a Saturday or Sunday.
# NOTE(review): presumably stray weekend records in the raw file — confirm.
ret_df = ret_df.loc[
    ~ret_df['datadate'].isin(pd.to_datetime([
        '2018-09-30',
        '2020-02-29',
        '2015-03-28',
        '2019-06-30',
        '2019-03-31',
    ]))
]

In [35]:
# Restrict the sample to trading days: a date is dropped when at least 90% of the
# stocks with a non-missing return have a return of exactly zero.
ret_df_without_non_trading_days_list = []
# Group on the column name rather than a one-element list, so that `date` is a plain
# Timestamp on every pandas version (list keys yield 1-tuples from pandas 2.0 on).
for date, df in tqdm(ret_df.groupby('datadate')):
    ret_non_nan = df['mod_ret'].dropna()
    num_stocks = len(ret_non_nan)
    if num_stocks == 0:
        # No usable return on this date, so the zero-return share is undefined.
        # Report the date and leave it out. This replaces a bare `except` around the
        # division, which would also have hidden unrelated errors.
        print(date)
        continue
    num_stocks_w_0_ret = int((ret_non_nan == 0).sum())
    if num_stocks_w_0_ret / num_stocks < 0.9:
        ret_df_without_non_trading_days_list.append(df)

ret_df_without_non_trading_days = pd.concat(ret_df_without_non_trading_days_list)

  1%|▍                                                         | 22/3024 [00:00<00:42, 71.41it/s]

2009-06-01 00:00:00


100%|███████████████████████████████████████████████████████| 3024/3024 [00:13<00:00, 218.70it/s]


In [36]:
# Restrict the sample to stocks that actually trade. A (stock, quarter) block is kept
# only if it has at least 31 non-missing daily returns AND fewer than 80% of those
# returns are exactly zero.
ret_df_without_non_traded_stocks_list = []
stock_quarter_groups = ret_df_without_non_trading_days.groupby(['sedol', 'yr_qtr_index'])
for sedol_qtr, stock_qtr_df in tqdm(stock_quarter_groups):
    valid_returns = stock_qtr_df['mod_ret'].dropna()
    num_days = len(valid_returns)

    # Too few observations in the quarter: skip before computing the zero-return share.
    if num_days < 31:
        continue

    num_days_w_0_ret = (valid_returns == 0).sum()
    if num_days_w_0_ret / num_days < 0.8:
        ret_df_without_non_traded_stocks_list.append(stock_qtr_df)

ret_df_without_non_traded_stocks = pd.concat(ret_df_without_non_traded_stocks_list)

100%|█████████████████████████████████████████████████████| 12257/12257 [01:03<00:00, 194.20it/s]


In [37]:
# Checkpoint the filtered returns so the slow filtering cells need not be re-run.
pd.to_pickle(
    ret_df_without_non_traded_stocks,
    PATH+'checkpoint_data/returns_all_stocks_intermediate.pkl',
)
# To resume from the checkpoint instead of recomputing, uncomment:
# ret_df_without_non_traded_stocks = pd.read_pickle(PATH+'checkpoint_data/returns_all_stocks_intermediate.pkl')

## Merge with TOPIX Index to Get Market Returns

In [38]:
# Load the daily TOPIX series and add `ret`, the row-over-row percentage change of `pi_`.
# NOTE(review): assumes the file is already in ascending `valuedate` order — confirm.
topix_df = (
    pd.read_csv(PATH+'raw_data/datastream/topix_daily_returns.csv', parse_dates=['valuedate'])
    .assign(ret=lambda frame: frame['pi_'].pct_change())
)

In [39]:
# Attach the TOPIX columns to every stock-day by date. It is a left join, so stock-days
# without a matching index row are kept with missing index values. The duplicate date
# key from the right-hand table is dropped afterwards.
ret_df_merged_1 = (
    ret_df_without_non_traded_stocks
    .merge(topix_df, how='left', left_on='datadate', right_on='valuedate')
    .drop(columns='valuedate')
)

In [40]:
# Market-adjusted return: the stock's return minus the TOPIX return on the same date.
ret_df_merged_1['mod_ret_mkt_adj'] = ret_df_merged_1['mod_ret'].sub(ret_df_merged_1['ret'])

In [41]:
# The index columns are no longer needed once the adjusted return has been computed.
ret_df_merged_1 = ret_df_merged_1.drop(['dsindexmnem', 'pi_', 'ret'], axis=1)

In [47]:
# Persist the final returns table, ordered by stock and then by date.
pd.to_pickle(
    ret_df_merged_1.sort_values(by=['sedol', 'datadate']),
    PATH+'checkpoint_data/returns_all_stocks.pkl',
)

<!-- ## Volume -->

In [22]:
# ret_df_list = []
# for sedol, sedol_ret_df in tqdm(ret_df.groupby(['sedol'])): 
#     sedol_ret_df['cshtrd_av_yr'] = sedol_ret_df['cshtrd'].rolling(252, min_periods=210).mean().shift(1)
#     sedol_ret_df['cshtrd_av_qtr'] = sedol_ret_df['cshtrd'].rolling(63, min_periods=52).mean().shift(1)
#     ret_df_list.append(sedol_ret_df)

In [23]:
# ret_df = pd.concat(ret_df_list)