In [1]:
import pandas as pd
import datetime as dt
import math
from tqdm import tqdm
import numpy as np
from pandas.tseries.offsets import BDay
import matplotlib.pyplot as plt

PATH = "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/"

nk_df = pd.read_csv(PATH+'nk_df_v2.csv')
sedol_list = list(nk_df['sedol'])
del nk_df

In [2]:
def get_truncated_df(merge_df, columns, year_column_name=None, low=0.01, high=0.99):
    merge_df_copy = merge_df.copy()
    if year_column_name == None:
        for column in columns: 
            _1pct, _99pct = merge_df_copy[column].quantile(q=low), merge_df_copy[column].quantile(q=high)
            merge_df_copy[column].where((merge_df_copy[column] < _99pct) & (merge_df_copy[column] > _1pct), math.nan, inplace=True)
    else: 
        merge_df_list = []
        merge_year_df_list = [[year, merge_year_df] for year, merge_year_df in merge_df_copy.groupby(year_column_name)]
        for year, merge_year_df in tqdm(merge_year_df_list):
            for column in columns: 
                _1pct, _99pct = merge_year_df[column].quantile(q=low), merge_year_df[column].quantile(q=high)
                merge_year_df[column].where(~((merge_year_df[column] > _99pct) | (merge_year_df[column] < _1pct)), math.nan, inplace=True)
            merge_df_list.append(merge_year_df)
        merge_df_copy = pd.concat(merge_df_list)
    
    return merge_df_copy

def get_boj_quintiles_conditional(sue_df_boj_merged, suffix=None, quantiles=5):
    sue_df_boj_merged_nd = sue_df_boj_merged[['sedol', 'yr_qtr_index', 'boj_share_shares']].drop_duplicates()
    sue_df_boj_merged_nd_list = []
    for index, df in tqdm(sue_df_boj_merged_nd.groupby('yr_qtr_index')): 
        try:
            df['boj_share_shares_quantiles_'+suffix] = pd.qcut(df['boj_share_shares'], quantiles, labels=False)
        except: 
            df['boj_share_shares_quantiles_'+suffix] = [math.nan for i in range(len(df))]

        sue_df_boj_merged_nd_list.append(df)

    sue_df_boj_merged_nd = pd.concat(sue_df_boj_merged_nd_list)

    sue_df_boj_merged = pd.merge(sue_df_boj_merged, 
                                 sue_df_boj_merged_nd, 
                                 on=['sedol', 'yr_qtr_index', 'boj_share_shares'])
    return sue_df_boj_merged

# Daily Returns Setup

In [3]:
ret_df = pd.read_pickle(PATH+'checkpoint_data/returns_all_stocks.pkl')

ret_df = ret_df[['yr_qtr_index', 'datadate', 'conm', 'sedol', 'cshoc', 'cshtrd', 
                 'prccd', 'mod_prccd', 'mod_ret', 'mod_ret_mkt_adj']]

ret_df['mc'] = ret_df['cshoc'] * ret_df['prccd']
ret_df['turnover'] = ret_df['cshtrd']/ret_df['cshoc']

In [4]:
mc_df = ret_df[['datadate', 'sedol', 'mc']]
mc_for_later = mc_df.copy()

# Market-to-Book Setup

In [5]:
mb_df = pd.read_csv(PATH+'raw_data/worldscope/mb.csv', parse_dates=['ITEM5350'])
mb_df.columns = ['code', 'year_', 'freq', 'seq', 'def_taxes', 'prf', 'common_equity', 'datadate', 'sedol']

In [6]:
mb_df = mb_df.dropna(subset=['sedol'])
mb_df['sedol'] = mb_df['sedol'].astype(str)

In [7]:
mb_df = mb_df.drop(columns=['code', 'year_', 'freq', 'seq'])

In [8]:
mb_df['def_taxes'] = mb_df['def_taxes'].apply(lambda x: 0 if math.isnan(x)==True else x)
mb_df['prf'] = mb_df['prf'].apply(lambda x: 0 if math.isnan(x)==True else x)

In [9]:
mb_df['bv'] = mb_df['common_equity'] + mb_df['def_taxes'] - mb_df['prf']
mb_df = mb_df[mb_df['bv'] > 0]

In [10]:
mb_df = mb_df[['datadate','sedol', 'bv']]
mb_df = mb_df.dropna()

In [11]:
mb_df.columns = ['mb_datadate', 'sedol', 'bv']

In [12]:
mb_df['yr_qtr_index'] = mb_df['mb_datadate'].apply(lambda x: (x.year, math.ceil(x.month / 3)))
qtr_yr_index_list = list(mb_df['yr_qtr_index'].sort_values().unique())
qtr_index = [i for i in range(len(qtr_yr_index_list))]
qtr_conv_dict = dict(zip(qtr_yr_index_list, qtr_index))
mb_df['qtr_index'] = mb_df['yr_qtr_index'].apply(lambda x: qtr_conv_dict[x])

In [13]:
mb_df = pd.merge_asof(mb_df.sort_values('mb_datadate'), 
                      mc_df.sort_values('datadate'), 
                      by='sedol', 
                      left_on=['mb_datadate'], 
                      right_on=['datadate'], 
                      direction='backward', 
                      tolerance=pd.Timedelta(days=3))

In [14]:
mb_df['mb'] = mb_df['mc']/mb_df['bv']
mb_df = mb_df.dropna(subset=['mb'])
mb_df = mb_df.drop(columns=['mc', 'datadate'])

# Earnings Setup

In [15]:
# ws_df = pd.read_csv(PATH+'raw_data/worldscope/all_earnings.csv', parse_dates=['ITEM5350'])
# ws_df.columns = ['code', 'year', 'freq', 'seq', 'e', 'date', 'sedol', 'fyr']
# ws_df = ws_df.dropna(subset=['sedol'])
# ws_df['sedol'] = ws_df['sedol'].astype(str)

In [16]:
# ws_df = ws_df.dropna(subset=['sedol', 'date'])
# ws_df['sedol'] = ws_df['sedol'].astype(str)
# ws_df = ws_df.drop(columns=['code', 'fyr', 'seq', 'freq'])

In [17]:
# ws_df['month'] = ws_df['date'].dt.month
# month_list = list(ws_df['month'])
# new_month_list = []
# for i in tqdm(range(len(month_list))):
#     month = month_list[i]
#     if month in [1, 4, 7, 10]: 
#         month += 2
#     elif month in [2, 5, 8, 11]: 
#         month += 1
#     new_month_list.append(month)
# ws_df['month'] = new_month_list
# ws_df['day'] = 30

100%|████████████████████████████████████████████████████████████████████| 2627985/2627985 [00:04<00:00, 654661.32it/s]


In [18]:
# ws_df['datadate'] = pd.to_datetime(ws_df[['year', 'month', 'day']])
# ws_df = ws_df.drop(columns=['year', 'month', 'day', 'date'])

In [20]:
# date_index = [pd.to_datetime('2010-03-30') + pd.DateOffset(months=3 * i) for i in range(44)]
# sedol_list = []
# for _, sedol_df in tqdm(ws_df.groupby('sedol')): 
#     sedol_df = sedol_df.set_index('datadate').reindex(date_index).reset_index()
#     sedol_df['e_rolling'] = sedol_df['e'].rolling(window=4, min_periods=4).mean()
#     sedol_list.append(sedol_df)
# ws_df = pd.concat(sedol_list)

In [21]:
# ws_df.columns = ['earnings_datadate', 'e', 'sedol', 'e_rolling']

In [22]:
# ws_df['yr_qtr_index'] = ws_df['earnings_datadate'].apply(lambda x: (x.year, math.ceil(x.month / 3)))
# ws_df['qtr_index'] = ws_df['yr_qtr_index'].apply(lambda x: qtr_conv_dict[x])

In [23]:
# ws_df = pd.merge_asof(ws_df.sort_values('earnings_datadate'), 
#                       mc_df.sort_values('datadate'), 
#                       by='sedol', 
#                       left_on=['earnings_datadate'], 
#                       right_on=['datadate'], 
#                       direction='backward', 
#                       tolerance=pd.Timedelta(days=3))

In [21]:
# ws_df['pe'] = ws_df['mc']/ws_df['e_rolling']
# ws_df = ws_df.drop(columns=['mc', 'datadate'])

## Beta

In [22]:
beta_df = pd.read_pickle(PATH+'checkpoint_data/beta_qtr_df.pkl').drop(columns=['qtr_yr_index'])

In [23]:
beta_df['yr_qtr_index'] = beta_df['beta_date'].apply(lambda x: (x.year, math.ceil(x.month / 3)))
beta_df['qtr_index'] = beta_df['yr_qtr_index'].apply(lambda x: qtr_conv_dict[x])

## Daily Controls

In [24]:
mc_for_later['index'] = mc_for_later['datadate'].apply(lambda x: (x.year, x.month))
mc_for_later = mc_for_later.drop_duplicates(['index', 'sedol'], keep='first').drop(columns=['index'])
mc_for_later.columns = ['datadate', 'sedol', 'mc_monthly']
ret_df = pd.merge_asof(ret_df.sort_values('datadate'), 
                       mc_for_later.sort_values('datadate'), 
                       by='sedol', 
                       on='datadate', 
                       direction='backward')

In [25]:
ret_df['yr_qtr_index'] = ret_df['datadate'].apply(lambda x: (x.year, math.ceil(x.month / 3)))
ret_df['qtr_index'] = ret_df['yr_qtr_index'].map(qtr_conv_dict)

In [26]:
sedol_df_list = []
for _, sedol_df in tqdm(ret_df.groupby('sedol')):
    sedol_df = sedol_df.sort_values('datadate')
    sedol_df['turnover_3m'] = sedol_df['turnover'].rolling(window=60, min_periods=30).mean().shift(1)
    sedol_df['vlt_12m'] = sedol_df['mod_ret_mkt_adj'].rolling(window=245, min_periods=245//2).std().shift(1)
    sedol_df_list.append(sedol_df)
ret_df = pd.concat(sedol_df_list)

100%|███████████████████████████████████████████████████████████████████████████████| 266/266 [00:02<00:00, 111.18it/s]


In [27]:
ret_df = pd.merge(ret_df, 
                  mb_df, 
                  on=['sedol', 'qtr_index', 'yr_qtr_index'],
                  how='left')

ret_df['qtr_index_l1'] = ret_df['qtr_index'] - 1
ret_df = pd.merge(ret_df, 
                  ret_df[['mb', 'qtr_index', 'sedol']].drop_duplicates(['qtr_index', 'sedol']), 
                  left_on=['sedol', 'qtr_index_l1'], 
                  right_on=['sedol', 'qtr_index'],
                  suffixes=[None, '_l1']).drop(columns=['qtr_index_l1'])

In [28]:
# ret_df = pd.merge(ret_df, 
#                   ws_df, 
#                   on=['sedol', 'qtr_index', 'yr_qtr_index'],
#                   how='left')

# ret_df['qtr_index_l1'] = ret_df['qtr_index'] - 1
# ret_df = pd.merge(ret_df, 
#                   ret_df[['pe', 'qtr_index', 'sedol']].drop_duplicates(['qtr_index', 'sedol']), 
#                   left_on=['sedol', 'qtr_index_l1'], 
#                   right_on=['sedol', 'qtr_index'],
#                   suffixes=[None, '_l1']).drop(columns=['qtr_index_l1'])

In [29]:
#note beta is already lagged
ret_df = pd.merge(ret_df, 
                  beta_df, 
                  on=['sedol', 'qtr_index', 'yr_qtr_index'],
                  how='left')

In [30]:
#past returns
ret_df_subset = ret_df[['datadate', 'mod_prccd', 'sedol']]
ret_df['datadate_l_year'] = ret_df['datadate'] - pd.DateOffset(years = 1)

In [34]:
ret_df = pd.merge_asof(ret_df.sort_values('datadate_l_year'), 
                       ret_df_subset.sort_values('datadate'), 
                       by=['sedol'], 
                       left_on=['datadate_l_year'], 
                       right_on=['datadate'], 
                       suffixes=[None, '_l_year'],
                       direction='backward', 
                       tolerance=pd.Timedelta(days=7)).drop(columns=['datadate_l_year'])

ret_df['datadate_l_day'] = ret_df['datadate'] - pd.DateOffset(days = 1)
ret_df = pd.merge_asof(ret_df.sort_values('datadate_l_day'), 
                       ret_df_subset.sort_values('datadate'), 
                       by=['sedol'], 
                       left_on=['datadate_l_day'], 
                       right_on=['datadate'], 
                       suffixes=[None, '_l_day'],
                       direction='backward', 
                       tolerance=pd.Timedelta(days=7)).drop(columns=['datadate_l_day'])

ret_df['past_yr_returns'] = ret_df['mod_prccd']/ret_df['mod_prccd_l_year'] - 1
ret_df['past_yr_returns_l_day'] = ret_df['mod_prccd_l_day']/ret_df['mod_prccd_l_year'] - 1

In [35]:
ret_df.to_pickle(PATH+'checkpoint_data/controls_daily.pkl')

In [30]:
# ret_df = pd.read_pickle(PATH+'checkpoint_data/controls_daily.pkl')

## Ignore Below

## Quarterly Controls

In [8]:
mc_list = []
vlt_list = []
ret_list = []
prc_list = []
turnover_list = []

num_days_total_list = []
num_days_w_data_list = []
coverage_ratio_list = []

start_date_list = []
yr_qtr_index_list = [] 
sedol_list = []

for index, df in tqdm(ret_df.groupby(['sedol', 'yr_qtr_index'])): 
    #get values
    start_date = df.iloc[0]['datadate']
    
    vlt = df['mod_ret'].std()
    ret = df['mod_ret'].mean()
    prc = np.log(df.iloc[0]['prccd'])
    mc = np.log(df.iloc[0]['log_mc'])
    turnover = df['turnover'].mean()
    
    num_days_total = len(df)
    num_days_w_data = len(df[~(abs(df['mod_ret']) <= 10**(-6))])
    coverage_ratio = num_days_w_data/num_days_total
    
    #append
    mc_list.append(mc)
    vlt_list.append(vlt)
    prc_list.append(prc)
    ret_list.append(ret)
    turnover_list.append(turnover)
    
    num_days_total_list.append(num_days_total)
    num_days_w_data_list.append(num_days_w_data)
    coverage_ratio_list.append(coverage_ratio)
    
    start_date_list.append(start_date)
    sedol_list.append(index[0])
    yr_qtr_index_list.append(index[1])

100%|█████████████████████████████████████████████████████████████████████████| 172472/172472 [14:07<00:00, 203.56it/s]


In [9]:
control_df = pd.DataFrame({'sedol': sedol_list, 
                           'yr_qtr_index': yr_qtr_index_list, 
                           'start_date': start_date_list,
                           'mc': mc_list, 
                           'vlt': vlt_list,
                           'ret': ret_list,
                           'prc': prc_list,
                           'turnover': turnover_list,
                           'num_days_total': num_days_total_list, 
                           'num_days_w_data': num_days_w_data_list, 
                           'coverage_ratio': coverage_ratio_list})

control_df['yr'] = control_df['yr_qtr_index'].apply(lambda x: x[0])
control_df = control_df.sort_values(['sedol', 'yr_qtr_index'])

In [17]:
control_df_1 = pd.merge(control_df, 
                        mb_df, 
                        left_on=['yr', 'sedol'], 
                        right_on=['year_', 'sedol'], 
                        how='left').drop(columns=['year_'])

In [18]:
control_match_df = control_df_1[['start_date', 'sedol', 'vlt', 'ret', 'turnover']]
control_match_df['start_date_f1'] = control_match_df['start_date'] + pd.DateOffset(months=3)
control_match_df['yr_qtr_index_f1'] = control_match_df['start_date_f1'].apply(lambda x: (x.year, x.quarter))
control_match_df = control_match_df.drop(columns=['start_date_f1'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_match_df['start_date_f1'] = control_match_df['start_date'] + pd.DateOffset(months=3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_match_df['yr_qtr_index_f1'] = control_match_df['start_date_f1'].apply(lambda x: (x.year, x.quarter))


In [19]:
control_df_2 = pd.merge(control_df_1, 
                        control_match_df.drop(columns=['start_date']), 
                        left_on=['sedol', 'yr_qtr_index'], 
                        right_on=['sedol', 'yr_qtr_index_f1'], 
                        suffixes=[None, '_l1'], 
                        how='left').drop(columns=['yr_qtr_index_f1', 'yr'])

In [20]:
control_df_2 = get_truncated_df(control_df_2, 
                 columns=['mc', 'vlt', 'ret', 'prc', 'mb', 'turnover', 'vlt_l1','ret_l1', 'turnover_l1'], 
                 year_column_name='yr_qtr_index', 
                 low=0.01, 
                 high=0.99)

100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 19.24it/s]


In [22]:
control_df_2.to_pickle(PATH+'checkpoint_data/control.pkl')

In [6]:
control_df_2 = pd.read_pickle(PATH+'checkpoint_data/control.pkl')

## Yearly Controls

In [11]:
ret_df['yr'] = ret_df['datadate'].dt.year

In [18]:
mc_list = []
vlt_list = []
ret_list = []
prc_list = []
turnover_list = []

num_days_total_list = []
num_days_w_data_list = []
coverage_ratio_list = []

start_date_list = []
yr_qtr_index_list = [] 
sedol_list = []

for index, df in tqdm(ret_df.groupby(['sedol', 'yr'])): 
    #get values
    start_date = df.iloc[0]['datadate']
    
    vlt = df['mod_ret_mkt_adj'].std()
    ret = df['mod_ret_mkt_adj'].mean()
    turnover = df['turnover'].mean()
    prc = np.log(df.iloc[0]['prccd'])
    mc = np.log(df.iloc[0]['log_mc'])
    
    num_days_total = len(df)
    num_days_w_data = len(df[~(abs(df['mod_ret']) <= 10**(-6))])
    coverage_ratio = num_days_w_data/num_days_total
    
    #append
    mc_list.append(mc)
    vlt_list.append(vlt)
    prc_list.append(prc)
    ret_list.append(ret)
    turnover_list.append(turnover)
    
    num_days_total_list.append(num_days_total)
    num_days_w_data_list.append(num_days_w_data)
    coverage_ratio_list.append(coverage_ratio)
    
    start_date_list.append(start_date)
    sedol_list.append(index[0])
    yr_qtr_index_list.append(index[1])

100%|███████████████████████████████████████████████████████████████████████████| 46842/46842 [04:21<00:00, 178.90it/s]


In [75]:
control_df = pd.DataFrame({'sedol': sedol_list, 
                           'yr': yr_qtr_index_list, 
                           'start_date': start_date_list,
                           'mc': mc_list, 
                           'vlt': vlt_list,
                           'ret': ret_list,
                           'prc': prc_list,
                           'turnover': turnover_list,
                           'num_days_total': num_days_total_list, 
                           'num_days_w_data': num_days_w_data_list, 
                           'coverage_ratio': coverage_ratio_list})

control_df = control_df.sort_values(['sedol', 'yr'])

In [78]:
control_df_1 = pd.merge(control_df, 
                        mb_df, 
                        left_on=['yr', 'sedol'], 
                        right_on=['year_', 'sedol'], 
                        how='left').drop(columns=['year_'])

In [84]:
control_match_df = control_df_1[['start_date', 'sedol', 'vlt', 'ret', 'turnover', 'yr']]
control_match_df['yr_f1'] = control_match_df['yr'] + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_match_df['yr_f1'] = control_match_df['yr'] + 1


In [88]:
control_df_2 = pd.merge(control_df_1, 
                        control_match_df.drop(columns=['start_date']), 
                        left_on=['sedol', 'yr'], 
                        right_on=['sedol', 'yr_f1'], 
                        suffixes=[None, '_l1'], 
                        how='left').drop(columns=['yr_f1', 'yr_l1'])

In [90]:
control_df_2 = get_truncated_df(control_df_2, 
                 columns=['mc', 'vlt', 'ret', 'prc', 'mb', 'turnover', 'vlt_l1','ret_l1', 'turnover_l1'], 
                 year_column_name='yr', 
                 low=0.01, 
                 high=0.99)

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 17.90it/s]


In [91]:
control_df_2.to_pickle(PATH+'checkpoint_data/control_yearly.pkl')