In [2]:
import pandas as pd
import datetime as dt
import math
from tqdm import tqdm
from pandas.tseries.offsets import BDay

# IBES Cleaning

In [19]:
# Lookup string for character values: a character's value is its position in
# this string (digits map to 0-9, consonants to 11-35; the vowel slots are blank).
_alphabet = '0123456789 BCD FGH JKLMN PQRST VWXYZ'


def calc_check_digit(number):
    """Return the weighted-sum check digit for ``number`` as a one-character string.

    Each character is valued by its position in ``_alphabet`` and multiplied by
    the matching weight from (1, 3, 1, 7, 3, 9). The check digit is the amount
    that brings the weighted sum up to a multiple of ten.

    Parameters
    ----------
    number : str
        The identifier without its check digit. Only the first six characters
        are used, because pairing with the weights stops at the shorter input.

    Returns
    -------
    str
        A single decimal digit, '0' through '9'.

    Raises
    ------
    ValueError
        If a character does not occur in ``_alphabet`` (for example a vowel or
        a lowercase letter).
    """
    total = 0
    for weight, char in zip((1, 3, 1, 7, 3, 9), number):
        total += weight * _alphabet.index(char)
    # (10 - total) % 10 and -total % 10 are the same residue modulo ten.
    return str(-total % 10)

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [30]:
# Load the IBES surprise file; `anndats` is parsed to datetimes while reading.
sue_df = pd.read_csv(
    'raw_data/ibes/surprise_int.csv',
    parse_dates=['anndats'],
)

In [31]:
# Load the IBES summary file; `STATPERS` is parsed to datetimes while reading.
sum_df = pd.read_csv(
    'raw_data/ibes/summary_int.csv',
    parse_dates=['STATPERS'],
)

In [32]:
# Load the announcement date/time of actuals from the IBES detail file.
#
# Only the three columns used below are read, and TICKER is pinned to `str`
# at parse time. Letting pandas infer the type chunk by chunk can yield a
# mixed int/str column (the cause of a DtypeWarning), after which a later
# `astype(str)` cannot recover leading zeros or undo float formatting.
sum_detailed_df = pd.read_csv(
    'raw_data/ibes/detailed_int.csv',
    usecols=['TICKER', 'ANNDATS_ACT', 'ANNTIMS_ACT'],
    dtype={'TICKER': str},
    parse_dates=['ANNDATS_ACT'],
)
# `usecols` keeps the file's column order, so select explicitly to fix the
# order, then keep one row per distinct (ticker, date, time) with no gaps.
sum_detailed_df = sum_detailed_df[['TICKER', 'ANNDATS_ACT', 'ANNTIMS_ACT']].drop_duplicates().dropna()
# Time of day as a timedelta so it can be added to the announcement date.
sum_detailed_df['ANNTIMS_ACT'] = pd.to_timedelta(sum_detailed_df['ANNTIMS_ACT'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [21]:
# Make the join keys comparable across the three IBES frames:
# TICKER as strings everywhere, announcement dates as datetimes.
for _frame in (sum_detailed_df, sue_df, sum_df):
    _frame['TICKER'] = _frame['TICKER'].astype(str)

sum_detailed_df['ANNDATS_ACT'] = pd.to_datetime(sum_detailed_df['ANNDATS_ACT'])
sue_df['anndats'] = pd.to_datetime(sue_df['anndats'])

In [22]:
# Attach the announcement time to each surprise row by ticker and date.
# Left join: surprise rows with no match are kept and get missing
# ANNDATS_ACT / ANNTIMS_ACT values.
sue_df = sue_df.merge(
    sum_detailed_df,
    how='left',
    left_on=['TICKER', 'anndats'],
    right_on=['TICKER', 'ANNDATS_ACT'],
)

In [23]:
# Summary rows reported in yen, and the official tickers they cover.
_is_jpy = sum_df['CURCODE'] == 'JPY'
jp_sum_df = sum_df[_is_jpy]
jp_tickers = jp_sum_df['OFTIC'].unique()

# Surprise rows for those tickers, ordered by announcement date.
_in_japan = sue_df['OFTIC'].isin(jp_tickers)
jp_sue_df = sue_df[_in_japan].sort_values('anndats')

# Bring in CUSIP from the summary rows. Right join: every surprise row is
# kept, and a ticker with more than one distinct CUSIP yields one row per CUSIP.
_jp_cusip_lookup = jp_sum_df[['TICKER', 'CUSIP']].drop_duplicates()
jp_sue_df = pd.merge(_jp_cusip_lookup, jp_sue_df, how='right', on='TICKER')

In [24]:
def _cusip_to_sedol(cusip):
    """Build a 7-character SEDOL from an 8-character CUSIP field, else NaN.

    The first two characters are dropped, and the remaining six get their
    check digit appended. Any value whose string form is not exactly eight
    characters long (including missing values) maps to NaN.
    """
    text = str(cusip)
    if len(text) != 8:
        return math.nan
    stem = text[2:]
    return stem + calc_check_digit(stem)


jp_sue_df['SEDOL'] = jp_sue_df['CUSIP'].apply(_cusip_to_sedol)

In [25]:
# Combine announcement date and time of day into one timestamp, label it as
# 'EST', and express it in Tokyo local time.
# NOTE(review): 'EST' is a fixed UTC-05:00 zone with no daylight saving. If the
# source times are US Eastern wall-clock times, a DST-aware zone such as
# 'America/New_York' would be needed — confirm against the IBES documentation.
_announced_at = jp_sue_df['ANNDATS_ACT'] + jp_sue_df['ANNTIMS_ACT']
jp_sue_df['ANNDATS_FULL_CONVERT'] = (
    _announced_at
    .dt.tz_localize('EST')
    .dt.tz_convert('Asia/Tokyo')
)

# Split the Tokyo timestamp back into a calendar date and a time of day
# (the latter as a timedelta since midnight).
_tokyo_stamp = jp_sue_df['ANNDATS_FULL_CONVERT']
jp_sue_df['ANNDATS_ACT_CONVERT'] = _tokyo_stamp.dt.date
jp_sue_df['ANNTIMS_ACT_CONVERT'] = pd.to_timedelta(_tokyo_stamp.dt.time.astype(str))

In [26]:
# Announcements strictly after 15:00 Tokyo time move to the next business day.
# (15:00 is presumably the Tokyo market close — TODO confirm.)
_cutoff = pd.to_timedelta('15:00:00')


def _bday_shift(local_time):
    """Return BDay(1) for times after the cutoff, BDay(0) otherwise.

    A missing time compares False against the cutoff and so gets BDay(0).
    """
    if local_time > _cutoff:
        return BDay(1)
    return BDay(0)


_day_offsets = jp_sue_df['ANNTIMS_ACT_CONVERT'].apply(_bday_shift)
jp_sue_df['eff_anndats'] = jp_sue_df['ANNDATS_ACT_CONVERT'] + _day_offsets

In [27]:
# Keep the effective date only where the original announcement time is
# present and non-negative. A missing time fails the comparison, so those
# rows are set to NaN.
anndats = list(jp_sue_df['eff_anndats'])
times = list(jp_sue_df['ANNTIMS_ACT'])
_midnight = pd.to_timedelta('00:00:00')
ref_anndats = [
    eff_date if raw_time >= _midnight else math.nan
    for eff_date, raw_time in zip(anndats, times)
]

jp_sue_df['eff_anndats'] = ref_anndats

In [28]:
# Cast SEDOL to strings. NOTE(review): on an object/float column astype(str)
# also stringifies missing values, so rows whose SEDOL could not be derived
# now hold the text 'nan' rather than NaN — confirm downstream code expects that.
jp_sue_df['SEDOL'] = jp_sue_df['SEDOL'].astype(str)

In [29]:
# Checkpoint the Japan surprise frame to disk as a pickle.
# Pickle files should only ever be loaded back from a trusted location.
jp_sue_df.to_pickle('checkpoint_data/japan_sue.pkl')

In [67]:
# Write the distinct SEDOLs, one per line.
# Rows with no derivable SEDOL carry either NaN or, once the column has been
# cast with astype(str), the literal text 'nan'. Neither is an identifier,
# so both are excluded from the list.
_sedol_values = jp_sue_df['SEDOL'].dropna().astype(str).unique()
with open("raw_data/txt_files/sedol_list.txt", "w") as outfile:
    outfile.write("\n".join(sedol for sedol in _sedol_values if sedol != 'nan'))

# Global Compustat Cleaning

In [3]:
# Load the TOPIX index series and add its row-over-row percentage change.
# Assumes the file's rows are already in chronological order — TODO confirm.
topix_df = (
    pd.read_csv(
        'raw_data/datastream/topix_daily_returns.csv',
        parse_dates=['valuedate'],
    )
    .assign(ret=lambda frame: frame['pi_'].pct_change())
)

In [21]:
# Load the Compustat Global daily price/volume file.
# `sedol` is pinned to `str` at parse time: SEDOLs mix all-digit and
# alphanumeric codes, so per-chunk type inference can produce a mixed
# int/float/str column (DtypeWarning). A later `astype(str)` would then give
# values such as '6900643.0' or drop leading zeros, which breaks matching on SEDOL.
ret_df = pd.read_csv(
    'raw_data/comp_global/prices_and_vol.csv',
    parse_dates=['datadate'],
    dtype={'sedol': str},
)
# Price scaled by the total-return factor and divided by the adjustment factor.
ret_df['mod_prccd'] = ret_df['prccd'] * ret_df['trfd'] / ret_df['ajexdi']
# Row-over-row return within each gvkey.
# Assumes rows are in date order within a gvkey, and that a gvkey maps to a
# single price series — TODO confirm.
ret_df['mod_ret'] = ret_df.groupby('gvkey')['mod_prccd'].pct_change()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [22]:
# Attach the TOPIX columns to each price row by calendar date (left join keeps
# every price row), then drop the duplicate date key brought in from the index frame.
ret_df = (
    ret_df
    .merge(topix_df, how='left', left_on='datadate', right_on='valuedate')
    .drop(columns='valuedate')
)

In [28]:
# Stock return minus the index return on the same day.
_excess_ret = ret_df['mod_ret'].sub(ret_df['ret'])
ret_df['mod_ret_mkt_adj'] = _excess_ret
# Shares traded as a fraction of shares outstanding.
ret_df['turnover'] = ret_df['cshtrd'].div(ret_df['cshoc'])
# Squared market-adjusted return.
ret_df['mod_ret_mkt_adj_sq'] = _excess_ret.mul(_excess_ret)

In [31]:
# Per-gvkey rolling features, computed group by group and stitched back together.
#   av_turnover             mean turnover over the trailing 252 rows (at least 200 present)
#   ab_turnover             turnover relative to that trailing mean
#   mod_ret_mkt_adj_sq_back sum of squared adjusted returns over the trailing 23 rows
#                           (current row included; NaN unless all 23 are present)
#   qvs_day                 1 minus the current row's share of that 23-row sum
df_list = []
for gvkey, df in tqdm(ret_df.groupby('gvkey')):
    # Later keyword arguments may refer to columns created by earlier ones.
    df = df.assign(
        av_turnover=lambda g: g['turnover'].rolling(252, min_periods=200).mean(),
        ab_turnover=lambda g: g['turnover'] / g['av_turnover'],
        mod_ret_mkt_adj_sq_back=lambda g: g['mod_ret_mkt_adj_sq'].rolling(23).sum(),
        qvs_day=lambda g: 1 - g['mod_ret_mkt_adj_sq'] / g['mod_ret_mkt_adj_sq_back'],
    )
    df_list.append(df)

ret_df = pd.concat(df_list)

100%|███████████████████████████████████████████████████████| 1017/1017 [00:09<00:00, 102.40it/s]


In [35]:
# Pull the per-gvkey frame at list position 23 out of df_list, presumably for
# ad-hoc inspection. This rebinds `df`, the name also used as the loop
# variable when df_list was built.
df = df_list[23]

In [8]:
# Cast sedol to strings. NOTE(review): values that were parsed as numbers are
# formatted as numbers here (e.g. a float keeps its '.0'), and missing values
# may become the text 'nan' — verify the result against the SEDOLs it is matched to.
ret_df['sedol'] = ret_df['sedol'].astype(str)

In [9]:
# Checkpoint the price/volume frame to disk as a pickle.
# Pickle files should only ever be loaded back from a trusted location.
ret_df.to_pickle('checkpoint_data/japan_prices_and_volume.pkl')

In [13]:
# Display the trailing 23-row sum of squared adjusted returns.
# With rolling(23) and no min_periods, a window yields NaN unless all 23 of
# its values are present, so any gap in the inputs propagates as NaN here.
ret_df['mod_ret_mkt_adj_sq_back']

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
           ..
4075627   NaN
4075628   NaN
4075629   NaN
4075630   NaN
4075631   NaN
Name: mod_ret_mkt_adj_sq_back, Length: 4075632, dtype: float64