Cleans IBES data and adds Novy-Marx style SUE measures. 

Final output is a dataframe containing companies with their SUE at different quarters. 

In [5]:
import pandas as pd
import datetime as dt
import math
import matplotlib.pyplot as plt

from tqdm import tqdm
from pandas.tseries.offsets import BDay

pd.set_option("display.max_columns", None)

PATH = "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/"

## IBES SUE Cleaning

In [6]:
_alphabet = '0123456789 BCD FGH JKLMN PQRST VWXYZ'
def calc_check_digit(number):
    weights = (1, 3, 1, 7, 3, 9)
    s = sum(w * _alphabet.index(n) for w, n in zip(weights, number))
    return str((10 - s) % 10)

In [7]:
sum_detailed_df = pd.read_csv(PATH+'raw_data/ibes/detailed_int.csv', parse_dates=['ANNDATS_ACT', 'FPEDATS'])
sum_detailed_df = sum_detailed_df.dropna(subset=['CUSIP']).drop_duplicates(['CUSIP', 'FPEDATS'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
cols = ['TICKER', 'CUSIP', 'CNAME', 'ANNDATS_ACT', 'ANNTIMS_ACT', 'FPEDATS']
sum_detailed_jp_df = sum_detailed_df[sum_detailed_df['CURR']=='JPY'][cols]
sum_detailed_jp_df[['TICKER', 'CUSIP']] = sum_detailed_jp_df[['TICKER', 'CUSIP']].astype(str)
sum_detailed_jp_df['ANNTIMS_ACT'] = pd.to_timedelta(sum_detailed_jp_df['ANNTIMS_ACT'])

In [9]:
#get SEDOL from CUSIP
sum_detailed_jp_df['SEDOL'] = sum_detailed_jp_df['CUSIP'].apply(lambda x: str(x)[2:] if len(str(x)) == 8 else math.nan)
sum_detailed_jp_df['SEDOL'] = sum_detailed_jp_df['SEDOL'].apply(lambda x: str(x) + calc_check_digit(x) if len(str(x)) == 6 else math.nan)

In [10]:
sum_detailed_jp_df['ANNDATS_FULL_CONVERT'] = (sum_detailed_jp_df['ANNDATS_ACT'] + sum_detailed_jp_df['ANNTIMS_ACT'])
sum_detailed_jp_df['ANNDATS_FULL_CONVERT'] = sum_detailed_jp_df['ANNDATS_FULL_CONVERT'].dt.tz_localize('EST').dt.tz_convert('Asia/Tokyo')

In [11]:
#convert times
sum_detailed_jp_df['ANNDATS_FULL_CONVERT'] = (sum_detailed_jp_df['ANNDATS_ACT'] + sum_detailed_jp_df['ANNTIMS_ACT']).dt.tz_localize('EST').dt.tz_convert('Asia/Tokyo')
sum_detailed_jp_df['ANNDATS_ACT_CONVERT'] = sum_detailed_jp_df['ANNDATS_FULL_CONVERT'].dt.date
sum_detailed_jp_df['ANNTIMS_ACT_CONVERT'] = pd.to_timedelta(sum_detailed_jp_df['ANNDATS_FULL_CONVERT'].dt.time.astype(str))

In [12]:
#adjust time zones. This is needed since IBES data is quoted in NY time, while price data is Japanese local time
sum_detailed_jp_df['eff_anndats'] = sum_detailed_jp_df['ANNDATS_ACT_CONVERT'] + sum_detailed_jp_df['ANNTIMS_ACT_CONVERT'].apply(lambda x: BDay(1) if (x > pd.to_timedelta('15:00:00')) else BDay(0))

In [13]:
sum_detailed_jp_df = sum_detailed_jp_df.drop(columns=['ANNDATS_ACT', 
                                                      'ANNDATS_FULL_CONVERT', 
                                                      'ANNDATS_ACT_CONVERT', 
                                                      'ANNTIMS_ACT_CONVERT'])

In [14]:
sum_detailed_jp_df['PMON'] = sum_detailed_jp_df['FPEDATS'].dt.month
sum_detailed_jp_df['PYEAR'] = sum_detailed_jp_df['FPEDATS'].dt.year

In [15]:
anndats = list(sum_detailed_jp_df['eff_anndats'])
times = list(sum_detailed_jp_df['ANNTIMS_ACT'])
ref_anndats = []
for i in range(len(anndats)): 
    if times[i] >= pd.to_timedelta('00:00:00'): 
        ref_anndats.append(anndats[i])
    else: 
        ref_anndats.append(math.nan)
        
sum_detailed_jp_df['eff_anndats'] = ref_anndats

### Novy-Marx SUE

In [16]:
ws_df = pd.read_csv(PATH+'raw_data/worldscope/all_earnings.csv', parse_dates=['ITEM5350'])
ws_df.columns = ['code', 'year_', 'freq', 'seq', 'e', 'date', 'sedol', 'fyr']
ws_df['sedol'] = ws_df['sedol'].astype(str)
ws_df = ws_df.drop(columns=['code'])

In [17]:
ws_df = ws_df[ws_df['sedol'].isin(sum_detailed_jp_df['SEDOL'].unique())].dropna(subset=['sedol'])
ws_df['year_behind'] = ws_df['year_'] - 1

ws_df_merged = pd.merge(ws_df, 
                        ws_df[['sedol', 'year_', 'e' ,'seq']], 
                        left_on=['sedol', 'year_behind', 'seq'], 
                        right_on=['sedol', 'year_', 'seq'], 
                        how='left', 
                        suffixes=[None, '_behind']).drop(columns=['year__behind', 'year_behind'])

In [18]:
ws_df_merged['e_diff'] = ws_df_merged['e'] - ws_df_merged['e_behind']
ws_df_merged = ws_df_merged.sort_values(['sedol', 'year_', 'seq']).reset_index(drop=True)

In [20]:
ws_df_merged_list = []
for sedol, df in tqdm(ws_df_merged.groupby(['sedol'])): 
    df['e_std_dev_past'] = df['e_diff'].rolling(window=8, min_periods=6).std().shift(1)
    ws_df_merged_list.append(df)
ws_df_merged = pd.concat(ws_df_merged_list)

100%|█████████████████████████████████████████████████████████████████████████████| 1341/1341 [00:03<00:00, 435.31it/s]


In [21]:
ws_df_merged['sue_nm'] = ws_df_merged['e_diff']/ws_df_merged['e_std_dev_past']
ws_df_merged['PYEAR'] = ws_df_merged['date'].dt.year
ws_df_merged['PMON'] = ws_df_merged['date'].dt.month

In [22]:
ws_df_merged = ws_df_merged.dropna(subset=['sedol'])
ws_df_merged['sedol'] = ws_df_merged['sedol'].astype(str)

In [23]:
jp_sue_df_merged_nm = pd.merge(sum_detailed_jp_df, 
                               ws_df_merged[['sedol', 'PMON', 'PYEAR', 'sue_nm']], 
                               left_on=['SEDOL', 'PMON', 'PYEAR'],
                               right_on=['sedol', 'PMON', 'PYEAR'], 
                               how='left').drop(columns='sedol')

In [115]:
jp_sue_df_merged_nm.to_pickle(PATH+'checkpoint_data/sue_df_nm.pkl')

## Get $\Delta$E/P

In [25]:
jp_sue_df_merged_nm['eff_anndats_l1'] = jp_sue_df_merged_nm['eff_anndats'] - pd.DateOffset(days=1)