Cleans IBES data and adds Novy-Marx style SUE (standardized unexpected earnings) measures.

The final output is a dataframe of companies with their SUE and ΔE/P measures for each quarter.

In [1]:
import pandas as pd
import datetime as dt
import math
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandas.tseries.offsets import BDay

pd.set_option("display.max_columns", None)

PATH = "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/"

## IBES SUE Cleaning

In [2]:
# Character-to-value lookup: a character's position in this string is its
# numeric value. The positions that A, E, I, O and U would occupy hold spaces,
# so looking up one of those letters fails with ValueError.
_alphabet = '0123456789 BCD FGH JKLMN PQRST VWXYZ'


def calc_check_digit(number):
    """Return the weighted mod-10 check digit for ``number`` as a string.

    Each of the first six characters of ``number`` is mapped to its position
    in ``_alphabet`` and multiplied by the matching weight; the check digit is
    whatever brings the weighted sum up to a multiple of ten. Characters past
    the sixth are ignored, and a shorter input simply uses fewer weights.

    Raises:
        ValueError: if a character is not present in ``_alphabet``.
    """
    weights = (1, 3, 1, 7, 3, 9)
    total = 0
    for weight, char in zip(weights, number):
        total += weight * _alphabet.index(char)
    return str((10 - total) % 10)

In [3]:
# Load the raw IBES detail extract, parsing the ANNDATS and PENDS columns as
# datetimes.
ibes_detail_path = PATH + 'raw_data/ibes/detailed_int_v2.csv'
sum_detailed_df = pd.read_csv(ibes_detail_path, parse_dates=['ANNDATS', 'PENDS'])

In [4]:
# Keep only rows whose CURR_ACT is JPY, and only the identifier / timing
# columns used by the rest of the notebook.
is_jpy = sum_detailed_df['CURR_ACT'] == 'JPY'
kept_columns = ['TICKER', 'CUSIP', 'CNAME', 'ANNDATS', 'ANNTIMS', 'PENDS']
sum_detailed_jp_df = sum_detailed_df.loc[is_jpy, kept_columns]

In [5]:
# Back-fill missing CUSIP / CNAME values within each ticker, using the first
# row of that ticker in which both fields are present. The per-ticker frames
# are collected in ticker_df_list.
ticker_df_list = []
for ticker, ticker_df in tqdm(sum_detailed_jp_df.groupby('TICKER')):
    # Work on a copy so the assignments below never write into a slice of the
    # parent frame.
    ticker_df = ticker_df.copy()
    complete_ids = ticker_df[['CUSIP', 'CNAME']].dropna()
    # A ticker with no fully populated row has nothing to fill from and is
    # passed through unchanged. This is checked explicitly rather than with a
    # bare `except`, so genuine errors are no longer silently swallowed.
    if not complete_ids.empty:
        cusip, cname = complete_ids.iloc[0]
        ticker_df['CUSIP'] = ticker_df['CUSIP'].fillna(cusip)
        ticker_df['CNAME'] = ticker_df['CNAME'].fillna(cname)
    ticker_df_list.append(ticker_df)

100%|█████████████████████████████████████████████████████████████████████████████| 5328/5328 [00:33<00:00, 159.74it/s]


In [6]:
# Stack the per-ticker frames back into a single frame (row-wise, keeping
# each frame's original index labels).
sum_detailed_jp_df = pd.concat(ticker_df_list, axis=0, ignore_index=False)

In [7]:
# There are only 135 NAs in total, so it is sufficient to drop every row that
# still has a missing value in any column.
sum_detailed_jp_df = sum_detailed_jp_df.dropna(axis=0, how='any')

In [8]:
def _cusip_to_sedol(cusip):
    """Build a 7-character SEDOL from an 8-character CUSIP value.

    The first two characters are dropped and a check digit is appended to the
    remaining six. Any value whose string form is not exactly eight characters
    long yields NaN.
    """
    text = str(cusip)
    if len(text) != 8:
        return math.nan
    stem = text[2:]
    return stem + calc_check_digit(stem)


# Derive SEDOL from CUSIP.
sum_detailed_jp_df['SEDOL'] = sum_detailed_jp_df['CUSIP'].apply(_cusip_to_sedol)

In [9]:
# Force the identifier columns to plain strings.
for id_col in ('TICKER', 'CUSIP'):
    sum_detailed_jp_df[id_col] = sum_detailed_jp_df[id_col].astype(str)

# Turn the announcement time-of-day into a Timedelta so it can be added to the
# announcement date later on.
announce_time_raw = sum_detailed_jp_df['ANNTIMS']
sum_detailed_jp_df['ANNTIMS'] = pd.to_timedelta(announce_time_raw)

In [10]:
# Discard rows whose announcement time is exactly 00:00:00.
# NOTE(review): presumably midnight is a placeholder for "time unknown" - confirm.
midnight = pd.Timedelta('00:00:00')
has_clock_time = sum_detailed_jp_df['ANNTIMS'] != midnight
sum_detailed_jp_df = sum_detailed_jp_df[has_clock_time]

In [11]:
# Combine announcement date and time-of-day into one naive timestamp. The
# notebook treats IBES timestamps as New York wall-clock time.
naive_ny_time = sum_detailed_jp_df['ANNDATS'] + sum_detailed_jp_df['ANNTIMS']
sum_detailed_jp_df['ANNDATS_FULL_EST'] = naive_ny_time

# Localize with the real New York zone rather than the fixed-offset 'EST'
# zone. 'EST' is UTC-5 all year, so every announcement made during US daylight
# saving time would be converted to a Tokyo time that is one hour too late,
# which can move it across the 15:00 cutoff applied afterwards.
# - Ambiguous wall times (the repeated hour when clocks fall back) are read as
#   standard time, i.e. exactly what the old fixed-offset conversion produced.
# - Nonexistent wall times (the skipped hour when clocks spring forward) are
#   shifted forward to the first valid instant instead of raising.
# NOTE(review): assumes ANNTIMS follows US Eastern daylight saving - confirm
# against the IBES documentation.
ambiguous_is_dst = pd.Series(False, index=naive_ny_time.index).to_numpy()
localized_ny_time = naive_ny_time.dt.tz_localize(
    'America/New_York',
    ambiguous=ambiguous_is_dst,
    nonexistent='shift_forward',
)
sum_detailed_jp_df['ANNDATS_FULL_JST'] = localized_ny_time.dt.tz_convert('Asia/Tokyo')

In [12]:
# Split the Tokyo-time timestamp into a calendar date and a time-of-day
# Timedelta (the time goes through its string form to become a Timedelta).
tokyo_stamp = sum_detailed_jp_df['ANNDATS_FULL_JST']
sum_detailed_jp_df['ANNDATS_JST'] = tokyo_stamp.dt.date
tokyo_clock_text = tokyo_stamp.dt.time.astype(str)
sum_detailed_jp_df['ANNTIMS_JST'] = pd.to_timedelta(tokyo_clock_text)

In [13]:
# Compute the effective announcement date in Japanese local time. This is needed since IBES data
# is quoted in NY time, while price data is Japanese local time.
# An announcement whose Tokyo time-of-day is later than 15:00:00 gets BDay(1) added to its Tokyo
# date; every other announcement gets BDay(0).
# NOTE(review): 15:00 is presumably the Tokyo market close - confirm.
# NOTE(review): BDay is pandas' default business-day offset; it presumably does not account for
# Japanese exchange holidays - confirm whether that matters here.
sum_detailed_jp_df['eff_anndats'] = sum_detailed_jp_df['ANNDATS_JST'] + sum_detailed_jp_df['ANNTIMS_JST'].apply(lambda x: BDay(1) if (x > pd.to_timedelta('15:00:00')) else BDay(0))

In [14]:
# Remove the raw and intermediate date/time columns.
intermediate_columns = ['ANNDATS', 'ANNTIMS', 'ANNDATS_JST', 'ANNTIMS_JST']
sum_detailed_jp_df = sum_detailed_jp_df.drop(columns=intermediate_columns)

In [15]:
# Month and year of the PENDS date, stored as PMON / PYEAR.
period_end = sum_detailed_jp_df['PENDS']
sum_detailed_jp_df['PMON'] = period_end.dt.month
sum_detailed_jp_df['PYEAR'] = period_end.dt.year

In [16]:
# Rename PENDS to FPEDATS. Renaming by name (instead of overwriting the whole
# column list by position) cannot mislabel other columns if their order or
# number changes upstream, and is safe to re-run.
sum_detailed_jp_df = sum_detailed_jp_df.rename(columns={'PENDS': 'FPEDATS'})

In [17]:
# Keep one row per (CUSIP, effective announcement date): after sorting by
# CUSIP and FPEDATS, the last row of each duplicate set is retained.
sorted_by_period = sum_detailed_jp_df.sort_values(['CUSIP', 'FPEDATS'])
sum_detailed_jp_df = sorted_by_period.drop_duplicates(
    subset=['CUSIP', 'eff_anndats'],
    keep='last',
)

In [18]:
# anndats = list(sum_detailed_jp_df['eff_anndats'])
# times = list(sum_detailed_jp_df['ANNTIMS_ACT'])
# ref_anndats = []
# for i in range(len(anndats)): 
#     if times[i] >= pd.to_timedelta('00:00:00'): 
#         ref_anndats.append(anndats[i])
#     else: 
#         ref_anndats.append(math.nan)
        
# sum_detailed_jp_df['eff_anndats'] = ref_anndats

### Novy-Marx SUE

In [1]:
# Load the Worldscope earnings extract, parsing ITEM5350 as a datetime.
worldscope_path = PATH + 'raw_data/worldscope/all_earnings.csv'
ws_df = pd.read_csv(worldscope_path, parse_dates=['ITEM5350'])

# Columns are relabelled purely by position, so this depends on the column
# order of the CSV file.
ws_df.columns = ['code', 'year_', 'freq', 'seq', 'e', 'date', 'sedol', 'fyr']

# The 'code' column is not used afterwards; sedol is forced to string.
ws_df = ws_df.drop(columns=['code'])
ws_df['sedol'] = ws_df['sedol'].astype(str)

NameError: name 'pd' is not defined

In [20]:
# Restrict Worldscope to the SEDOLs that appear in the cleaned IBES frame.
ibes_sedols = sum_detailed_jp_df['SEDOL'].unique()
in_ibes = ws_df['sedol'].isin(ibes_sedols)
ws_df = ws_df[in_ibes].dropna(subset=['sedol'])

# Attach to each row the 'e' value reported for the same sedol and seq one
# year earlier, as column 'e_behind'.
ws_df['year_behind'] = ws_df['year_'] - 1
prior_year_view = ws_df[['sedol', 'year_', 'e' ,'seq']]
ws_df_merged = pd.merge(
    ws_df,
    prior_year_view,
    how='left',
    left_on=['sedol', 'year_behind', 'seq'],
    right_on=['sedol', 'year_', 'seq'],
    suffixes=[None, '_behind'],
)
# Drop the helper key columns left over from the self-merge.
ws_df_merged = ws_df_merged.drop(columns=['year__behind', 'year_behind'])

In [21]:
# Year-over-year change in 'e': current value minus the value one year back.
ws_df_merged['e_diff'] = ws_df_merged['e'].sub(ws_df_merged['e_behind'])

# Order rows by sedol, year and seq, then renumber the index from zero.
ws_df_merged = ws_df_merged.sort_values(by=['sedol', 'year_', 'seq'])
ws_df_merged = ws_df_merged.reset_index(drop=True)

In [22]:
def _add_trailing_std(firm_df):
    """Return ``firm_df`` sorted by date with an ``e_std_dev_past`` column.

    The new column is the standard deviation of ``e_diff`` over a rolling
    window of 8 rows (at least 6 non-missing), shifted down by one row so that
    each row only sees earlier observations.
    """
    firm_df = firm_df.sort_values('date')
    rolling_std = firm_df['e_diff'].rolling(window=8, min_periods=6).std()
    firm_df['e_std_dev_past'] = rolling_std.shift(1)
    return firm_df


# Process each sedol separately and stack the results.
ws_df_merged_list = [
    _add_trailing_std(firm_df)
    for _, firm_df in tqdm(ws_df_merged.groupby(['sedol']))
]

ws_df_merged = pd.concat(ws_df_merged_list)

100%|█████████████████████████████████████████████████████████████████████████████| 5106/5106 [00:18<00:00, 276.73it/s]


In [23]:
# SUE: the change in 'e' scaled by the standard deviation of past changes.
ws_df_merged['sue_nm'] = ws_df_merged['e_diff'].div(ws_df_merged['e_std_dev_past'])

# Year and month of the 'date' column, used as merge keys against IBES.
ws_date = ws_df_merged['date']
ws_df_merged['PYEAR'] = ws_date.dt.year
ws_df_merged['PMON'] = ws_date.dt.month

In [24]:
# Keep rows with a non-missing sedol and make sure the key is a string.
has_sedol = ws_df_merged['sedol'].notna()
ws_df_merged = ws_df_merged.loc[has_sedol].copy()
ws_df_merged['sedol'] = ws_df_merged['sedol'].astype(str)

In [25]:
# Left-join the Worldscope SUE figures onto the IBES announcements, matching
# on SEDOL plus period month and year. The duplicate lower-case key column is
# dropped afterwards.
worldscope_sue = ws_df_merged[['sedol', 'PMON', 'PYEAR', 'e_diff', 'sue_nm']]
jp_sue_df_merged_nm = sum_detailed_jp_df.merge(
    worldscope_sue,
    how='left',
    left_on=['SEDOL', 'PMON', 'PYEAR'],
    right_on=['sedol', 'PMON', 'PYEAR'],
)
jp_sue_df_merged_nm = jp_sue_df_merged_nm.drop(columns='sedol')

In [26]:
# Tag every announcement with the (quarter, year) of its effective
# announcement date; this tuple is the cross-section used for ranking.
announce_dates = jp_sue_df_merged_nm['eff_anndats']
jp_sue_df_merged_nm['qtr_yr_index'] = list(
    zip(announce_dates.dt.quarter, announce_dates.dt.year)
)

# Within each (quarter, year) cross-section, rank sue_nm into quintiles and
# deciles (integer bin labels, starting at 0).
jp_sue_df_merged_nm_list = []
skipped_sue_quarters = []
for index, df in jp_sue_df_merged_nm.groupby('qtr_yr_index'):
    # Copy so the new columns are not written into a slice of the parent frame.
    df = df.copy()
    try:
        df['sue_nm_quintiles'] = pd.qcut(df['sue_nm'], 5, labels=False)
        df['sue_nm_deciles'] = pd.qcut(df['sue_nm'], 10, labels=False)
    except ValueError as err:
        # qcut raises ValueError when the bin edges are not unique (for
        # example too few distinct sue_nm values in the quarter). As before,
        # such a quarter is left out of the result, but it is now reported
        # instead of disappearing silently, and unrelated errors propagate.
        skipped_sue_quarters.append(index)
        print(f'Skipping (quarter, year) {index}: {err}')
        continue
    jp_sue_df_merged_nm_list.append(df)

jp_sue_df_merged_nm = pd.concat(jp_sue_df_merged_nm_list)

## Get $\Delta$E/P

In [27]:
# Load the returns checkpoint for all stocks.
returns_checkpoint = PATH + 'checkpoint_data/returns_all_stocks.pkl'
ret_df = pd.read_pickle(returns_checkpoint)

In [28]:
# Calendar day before the effective announcement date.
one_calendar_day = pd.DateOffset(days=1)
jp_sue_df_merged_nm['eff_anndats_l1'] = jp_sue_df_merged_nm['eff_anndats'] - one_calendar_day

In [29]:
# For each announcement, pick up the most recent prccd / cshoc row for the
# same SEDOL dated on or before eff_anndats_l1, looking back at most 5 days.
# merge_asof needs both inputs sorted by their date key.
announcements_sorted = jp_sue_df_merged_nm.sort_values(['eff_anndats_l1'])
prices_sorted = ret_df[['datadate', 'sedol', 'prccd', 'cshoc']].sort_values(['datadate'])

jp_sue_df_merged_nm_1 = pd.merge_asof(
    announcements_sorted,
    prices_sorted,
    left_on=['eff_anndats_l1'],
    right_on=['datadate'],
    left_by=['SEDOL'],
    right_by=['sedol'],
    tolerance=pd.Timedelta(days=5),
    direction='backward',
)

# Release the returns frame and the sorted temporaries.
del announcements_sorted, prices_sorted
del ret_df

In [30]:
# dep = e_diff divided by (prccd * cshoc).
# NOTE(review): prccd * cshoc is presumably price times shares outstanding,
# i.e. market capitalisation - confirm the units match those of e_diff.
price_times_shares = jp_sue_df_merged_nm_1['prccd'] * jp_sue_df_merged_nm_1['cshoc']
jp_sue_df_merged_nm_1['dep'] = jp_sue_df_merged_nm_1['e_diff'] / price_times_shares

In [31]:
# Within each (quarter, year) cross-section, rank dep into quintiles and
# deciles (integer bin labels, starting at 0).
jp_sue_df_merged_nm_list = []
skipped_dep_quarters = []
for index, df in tqdm(jp_sue_df_merged_nm_1.groupby('qtr_yr_index')):
    # Copy so the new columns are not written into a slice of the parent frame.
    df = df.copy()
    try:
        df['dep_nm_quintiles'] = pd.qcut(df['dep'], 5, labels=False)
        df['dep_nm_deciles'] = pd.qcut(df['dep'], 10, labels=False)
    except ValueError as err:
        # qcut raises ValueError when the bin edges are not unique (for
        # example too few distinct dep values in the quarter). As before, such
        # a quarter is left out of the result, but it is now reported instead
        # of disappearing silently, and unrelated errors propagate.
        skipped_dep_quarters.append(index)
        print(f'Skipping (quarter, year) {index}: {err}')
        continue
    jp_sue_df_merged_nm_list.append(df)

jp_sue_df_merged_nm_1 = pd.concat(jp_sue_df_merged_nm_list)

100%|█████████████████████████████████████████████████████████████████████████████████| 63/63 [00:00<00:00, 111.83it/s]


In [33]:
# Save the final frame as a checkpoint.
sue_checkpoint = PATH + 'checkpoint_data/sue_df_nm.pkl'
jp_sue_df_merged_nm_1.to_pickle(sue_checkpoint)