Cleans IBES data and adds Novy-Marx style SUE measures. 

The final output is a DataFrame of Japanese companies with their standardized unexpected earnings (SUE) for each reporting period.

In [1]:
import pandas as pd
import datetime as dt
import math
from tqdm import tqdm
from pandas.tseries.offsets import BDay

import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Root folder of the project. Every input and output path in this notebook is
# built by appending to this string, so it must end with a slash.
PATH = "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/"

## IBES SUE Cleaning

In [2]:
# Lookup string: the position of a character in this string is its numeric
# value (digits -> 0-9, 'B' -> 11, ..., 'Z' -> 35). The vowel positions are
# blanked out, so a vowel in the input is not found and raises ValueError.
_alphabet = '0123456789 BCD FGH JKLMN PQRST VWXYZ'


def calc_check_digit(number):
    """Return the SEDOL check digit for the first six characters of `number`.

    Each character is converted to its position in `_alphabet`, multiplied by
    the weight for its position (1, 3, 1, 7, 3, 9) and summed; the check digit
    is the value that brings that sum up to a multiple of ten.

    Args:
        number: sequence of characters; only the first six are used.

    Returns:
        The check digit as a one-character string.

    Raises:
        ValueError: if a character does not occur in `_alphabet`.
    """
    position_weights = (1, 3, 1, 7, 3, 9)
    weighted_total = 0
    for weight, char in zip(position_weights, number):
        weighted_total += weight * _alphabet.index(char)
    return str((10 - weighted_total) % 10)


# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [3]:
# Load the three raw IBES extracts.

# Surprise file: 'anndats' is parsed as a date while reading.
su_df = pd.read_csv(
    PATH+'raw_data/ibes/surprise_int.csv',
    parse_dates=['anndats'],
)

# Summary file: 'STATPERS' is parsed as a date while reading.
sum_df = pd.read_csv(
    PATH+'raw_data/ibes/summary_int.csv',
    parse_dates=['STATPERS'],
)

# Detail file: 'ANNDATS_ACT' is parsed as a date, and the 'ANNTIMS_ACT'
# column is converted to a timedelta so it can be added to the date later.
# NOTE(review): the full detail file is kept here; an earlier version reduced
# it to the distinct, non-missing (TICKER, ANNDATS_ACT, ANNTIMS_ACT) rows.
sum_detailed_df = pd.read_csv(
    PATH+'raw_data/ibes/detailed_int.csv',
    parse_dates=['ANNDATS_ACT'],
).assign(ANNTIMS_ACT=lambda frame: pd.to_timedelta(frame['ANNTIMS_ACT']))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Keep only the surprise-file columns that are used downstream.
# The explicit copy makes su_df an independent frame, so the column
# assignments made on it afterwards do not raise pandas'
# SettingWithCopyWarning.
su_df = su_df[
    ['OFTIC', 'TICKER', 'PYEAR', 'PMON', 'anndats', 'actual', 'surpmean', 'surpstdev', 'suescore']
].copy()

In [5]:
# Give the merge keys the same dtype in every frame.

# TICKER is compared as a string in all three IBES frames.
for _frame in (sum_detailed_df, su_df, sum_df):
    _frame['TICKER'] = _frame['TICKER'].astype(str)

# The announcement-date keys are forced to datetime on both sides of the merge.
for _frame, _date_col in ((su_df, 'anndats'), (sum_detailed_df, 'ANNDATS_ACT')):
    _frame[_date_col] = pd.to_datetime(_frame[_date_col])

In [101]:
# Merge to get precise release times: attach the announcement time from the
# detail file to each surprise record, matching on ticker and announcement date.
#
# Only the two merge keys and the time column are taken from the detail file,
# and rows with a missing value or repeated (TICKER, ANNDATS_ACT, ANNTIMS_ACT)
# combinations are removed first. Merging against the raw detail frame would
# repeat each surprise row once for every matching detail row and drag all
# detail columns into the result.
# NOTE(review): a ticker/date pair that carries several distinct times still
# yields one row per time -- confirm whether a single time should be chosen.
sue_df = pd.merge(su_df,
                  sum_detailed_df[['TICKER', 'ANNDATS_ACT', 'ANNTIMS_ACT']]
                  .dropna()
                  .drop_duplicates(),
                  left_on=['TICKER', 'anndats'],
                  right_on=['TICKER', 'ANNDATS_ACT'],
                  how='left')

In [7]:
# Restrict the sample to Japanese firms, identified as the summary-file rows
# whose CURCODE is 'JPY'.
jp_sum_df = sum_df.loc[sum_df['CURCODE'] == 'JPY']
jp_tickers = jp_sum_df['OFTIC'].unique()

# Surprise records of those firms (matched on OFTIC), ordered by announcement date.
jp_sue_df = sue_df.loc[sue_df['OFTIC'].isin(jp_tickers)].sort_values('anndats')

# Add the CUSIP recorded for each TICKER in the summary file. The right join
# keeps every surprise record even when no CUSIP is found.
jp_sue_df = (
    jp_sum_df[['TICKER', 'CUSIP']]
    .drop_duplicates()
    .merge(jp_sue_df, on='TICKER', how='right')
)

In [8]:
# Build the 7-character SEDOL from the IBES CUSIP column.
def _cusip_to_sedol(cusip):
    """Return the SEDOL derived from an 8-character code, or NaN otherwise.

    The first two characters are dropped and the check digit of the remaining
    six characters is appended.
    NOTE(review): presumably the two leading characters are a country or
    exchange prefix -- confirm against the IBES documentation.
    """
    text = str(cusip)
    if len(text) != 8:
        return math.nan
    stem = text[2:]
    return stem + calc_check_digit(stem)


jp_sue_df['SEDOL'] = jp_sue_df['CUSIP'].apply(_cusip_to_sedol)

In [9]:
# Convert the IBES announcement timestamp to Tokyo local time.
#
# IBES times are quoted in New York time, which observes daylight saving.
# The zone name 'EST' is a fixed UTC-5 offset, so using it shifts every
# announcement made during daylight-saving months by one hour. The regional
# zone 'America/New_York' applies the correct offset for each date.
# Wall-clock times that occur twice when the clocks go back are set to NaT,
# and times skipped when the clocks go forward are moved to the next valid
# instant, so localisation cannot raise on those rows.
jp_sue_df['ANNDATS_FULL_CONVERT'] = (
    (jp_sue_df['ANNDATS_ACT'] + jp_sue_df['ANNTIMS_ACT'])
    .dt.tz_localize('America/New_York', ambiguous='NaT', nonexistent='shift_forward')
    .dt.tz_convert('Asia/Tokyo')
)

# Split the Tokyo timestamp back into a calendar date and a time-of-day offset.
jp_sue_df['ANNDATS_ACT_CONVERT'] = jp_sue_df['ANNDATS_FULL_CONVERT'].dt.date
jp_sue_df['ANNTIMS_ACT_CONVERT'] = pd.to_timedelta(jp_sue_df['ANNDATS_FULL_CONVERT'].dt.time.astype(str))

In [10]:
# Effective announcement date in Tokyo time. IBES data is quoted in NY time
# while price data is in Japanese local time, so the converted date and time
# are used here.
def _business_day_shift(local_time):
    """Return the business-day offset to add for an announcement at `local_time`.

    Announcements later than 15:00 are pushed to the next business day; all
    others (including missing times) get a zero-business-day offset.
    NOTE(review): 15:00 is presumably the Tokyo market close -- confirm.
    """
    after_cutoff = local_time > pd.to_timedelta('15:00:00')
    return BDay(1) if after_cutoff else BDay(0)


jp_sue_df['eff_anndats'] = (
    jp_sue_df['ANNDATS_ACT_CONVERT']
    + jp_sue_df['ANNTIMS_ACT_CONVERT'].apply(_business_day_shift)
)

In [11]:
# Keep the effective date only where the original IBES announcement time is
# present; a missing time fails the comparison below and the date is replaced
# by NaN.
anndats = jp_sue_df['eff_anndats'].tolist()
times = jp_sue_df['ANNTIMS_ACT'].tolist()
_zero_time = pd.to_timedelta('00:00:00')

ref_anndats = [
    eff_date if raw_time >= _zero_time else math.nan
    for eff_date, raw_time in zip(anndats, times)
]

# Positional assignment: the list is in the same row order as the frame.
jp_sue_df['eff_anndats'] = ref_anndats

In [12]:
# Remove the intermediate date/time columns, then keep the output columns in
# a fixed order.
jp_sue_df = (
    jp_sue_df
    .drop(columns=[
        'ANNDATS_ACT',
        'ANNTIMS_ACT',
        'ANNDATS_FULL_CONVERT',
        'ANNDATS_ACT_CONVERT',
        'ANNTIMS_ACT_CONVERT',
    ])
    .loc[:, [
        'TICKER', 'CUSIP', 'SEDOL', 'OFTIC', 'PYEAR', 'PMON',
        'anndats', 'eff_anndats', 'actual', 'surpmean', 'surpstdev', 'suescore',
    ]]
)

In [13]:
# Checkpoint: store the cleaned Japanese IBES surprise table on disk.
pd.to_pickle(jp_sue_df, PATH+'checkpoint_data/japan_sue.pkl')

### Novy-Marx SUE

In [4]:
# Reload the cleaned IBES surprise table from the checkpoint file, presumably
# so this section can be run without repeating the IBES cleaning.
jp_sue_df = pd.read_pickle(PATH+'checkpoint_data/japan_sue.pkl')

In [86]:
# Load the Worldscope earnings extract; 'ITEM5350' is parsed as a date.
ws_df = pd.read_csv(
    PATH+'raw_data/worldscope/all_earnings.csv',
    parse_dates=['ITEM5350'],
)

# Rename the columns by position, so the file must contain exactly these eight
# columns in this order.
# NOTE(review): presumably 'ITEM5350' is the column that becomes 'date' -- confirm.
ws_df.columns = ['code', 'year_', 'freq', 'seq', 'e', 'date', 'sedol', 'fyr']

# SEDOL is compared as a string (missing values become the text 'nan');
# the 'code' column is not used.
ws_df = ws_df.assign(sedol=ws_df['sedol'].astype(str)).drop(columns='code')

In [62]:
# Keep the Worldscope rows whose SEDOL appears in the IBES sample, and record
# the previous year for each row as the key of the year-ago lookup.
ws_df = (
    ws_df[ws_df['sedol'].isin(jp_sue_df['SEDOL'].unique())]
    .dropna(subset=['sedol'])
    .assign(year_behind=lambda frame: frame['year_'] - 1)
)

# Self-merge: for each row, look up the earnings 'e' of the same firm and the
# same 'seq' one year earlier. The matched value arrives as 'e_behind'; the
# helper year columns are removed afterwards.
ws_df_merged = ws_df.merge(
    ws_df[['sedol', 'year_', 'e', 'seq']],
    left_on=['sedol', 'year_behind', 'seq'],
    right_on=['sedol', 'year_', 'seq'],
    how='left',
    suffixes=[None, '_behind'],
)
ws_df_merged = ws_df_merged.drop(columns=['year__behind', 'year_behind'])

In [63]:
# Year-over-year change in earnings: current 'e' minus the value matched from
# one year earlier (NaN when no year-ago row was found).
ws_df_merged['e_diff'] = ws_df_merged['e'].sub(ws_df_merged['e_behind'])

In [64]:
# Order the rows by firm, then year, then 'seq', and renumber the index from
# zero, so that row-based rolling windows within a firm follow this order.
ws_df_merged = (
    ws_df_merged
    .sort_values(by=['sedol', 'year_', 'seq'])
    .reset_index(drop=True)
)

In [67]:
# For every firm, compute the standard deviation of the earnings changes over
# the preceding rows: a rolling window of 8 rows (at least 6 non-missing
# values), shifted down by one row so the current row's own change is excluded.
#
# The grouper is passed as a plain column name rather than a one-element list,
# which avoids the FutureWarning / tuple group keys that newer pandas versions
# produce for list groupers. The new column is added with .assign(), which
# returns a new frame instead of writing into the group frame handed out by
# groupby (the cause of SettingWithCopyWarning).
ws_df_merged_list = [
    firm_df.assign(
        e_std_dev_past=firm_df['e_diff'].rolling(window=8, min_periods=6).std().shift(1)
    )
    for _, firm_df in tqdm(ws_df_merged.groupby('sedol'))
]
# Groups come back in sorted key order, so the concatenated frame is ordered by sedol.
ws_df_merged = pd.concat(ws_df_merged_list)

100%|█████████████████████████████████████████████████████████████████████████████| 1031/1031 [00:02<00:00, 378.58it/s]


In [68]:
# SUE measure: the year-over-year earnings change scaled by the standard
# deviation of past changes. The value is NaN wherever either input is missing.
# NOTE(review): a past standard deviation of exactly zero would give an
# infinite value -- confirm this cannot occur in the data.
ws_df_merged['sue_nm'] = ws_df_merged['e_diff'].div(ws_df_merged['e_std_dev_past'])

In [74]:
# Derive the year and month of 'date'; they are named PYEAR and PMON to match
# the period columns of the IBES table.
ws_df_merged = ws_df_merged.assign(
    PYEAR=ws_df_merged['date'].dt.year,
    PMON=ws_df_merged['date'].dt.month,
)

In [79]:
# Attach 'sue_nm' to the IBES surprise table, matching on SEDOL and on the
# period month and year. The left join keeps every IBES row; the duplicate
# lower-case 'sedol' key from the right side is removed afterwards.
jp_sue_df_merged_nm = jp_sue_df.merge(
    ws_df_merged[['sedol', 'PMON', 'PYEAR', 'sue_nm']],
    left_on=['SEDOL', 'PMON', 'PYEAR'],
    right_on=['sedol', 'PMON', 'PYEAR'],
    how='left',
)
jp_sue_df_merged_nm = jp_sue_df_merged_nm.drop(columns='sedol')

In [83]:
# Checkpoint: store the IBES table with the added 'sue_nm' column on disk.
pd.to_pickle(jp_sue_df_merged_nm, PATH+'checkpoint_data/sue_df_nm.pkl')

In [91]:
# Also export the merged table as a Stata .dta file in the project root.
# NOTE(review): the file name 'test_sue.dta' looks like a scratch output --
# confirm whether this export is still needed.
jp_sue_df_merged_nm.to_stata(PATH+'test_sue.dta')