This file links raw IBES surprise data (SUE) with PERMNOs for further matching. The file 'sue_df_linked.pkl' is the final dataset exported. 

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import math

def get_match_permno(date, match_df): 
    try: 
        return match_df.iloc[match_df.index.get_loc(date)]['PERMNO']
    except: 
        math.nan

In [2]:
sue_df = pd.read_csv('data/sue.csv', parse_dates=['anndats'])
sue_df = sue_df[sue_df['USFIRM']==1]

numest_df = pd.read_csv('data/numest.csv', parse_dates=['STATPERS', 'FPEDATS', 'ANNDATS_ACT'])
numest_df = numest_df.sort_values('STATPERS').drop_duplicates(['TICKER', 'ANNDATS_ACT'], keep='last')
numest_df = numest_df[['TICKER', 'NUMEST', 'MEANEST', 'ANNDATS_ACT']]

In [3]:
link_df = pd.read_csv('data/ibes_crsp_link.csv', parse_dates=['sdate', 'edate'])
link_df['PERMNO'] = link_df.groupby('TICKER')['PERMNO'].ffill()
link_df['edate'] = link_df['edate'].apply(lambda x: pd.to_datetime('2021-12-31') if pd.isnull(x) else x)
link_df = link_df.dropna(subset=['PERMNO'])
link_df.index = pd.IntervalIndex.from_arrays(link_df['sdate'], link_df['edate'], closed='both')

In [4]:
sics_sc_df = pd.read_csv('data/sic_and_share_code.csv', parse_dates=['date'])
sics_sc_df['PERMNO'] = sics_sc_df['PERMNO'].astype('Int64')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
sue_dfs_cond_ticker = {ticker: ticker_df for ticker, ticker_df in sue_df.groupby('TICKER')}
link_dfs_cond_ticker = {ticker: ticker_df for ticker, ticker_df in link_df.groupby('TICKER')}
tickers_list = list(sue_dfs_cond_ticker.keys())

In [6]:
sue_df_linked_list = []
missed_list = []
for i in tqdm(range(len(tickers_list))):
    ticker = tickers_list[i]
    ticker_df_sue = sue_dfs_cond_ticker[ticker].sort_values('anndats')
    try:
        ticker_df_link = link_dfs_cond_ticker[ticker]
        ticker_df_sue['PERMNO'] = ticker_df_sue['anndats'].apply(lambda date: get_match_permno(date, ticker_df_link))
        sue_df_linked_list.append(ticker_df_sue)
    except: 
        ticker_df_sue['PERMNO'] = [math.nan for i in range(len(ticker_df_sue))]
        missed_list.append(ticker_df_sue)

100%|████████████████████████████████████████████████████████████████████████████| 15740/15740 [08:48<00:00, 29.78it/s]


In [7]:
sue_df_linked = pd.concat(sue_df_linked_list + missed_list)

In [8]:
sue_df_linked = sue_df_linked.dropna(subset=['PERMNO'])

In [9]:
def get_correct(x): 
    try: 
        if len(x) == 2: 
            return 15617
    except: 
        return x
    
sue_df_linked['PERMNO'] = sue_df_linked['PERMNO'].apply(lambda x: get_correct(x))
sue_df_linked['PERMNO'] = sue_df_linked['PERMNO'].astype('Int64')

In [10]:
year_list, month_list = list(sue_df_linked['PYEAR']), list(sue_df_linked['PMON'])
date_list = [pd.to_datetime(str(year_list[i]) + '-' + str(month_list[i]) + '-' + '1') for i in tqdm(range(len(year_list)))]
sue_df_linked['pyear_month_date'] = date_list

100%|████████████████████████████████████████████████████████████████████████| 440383/440383 [01:41<00:00, 4349.44it/s]


In [11]:
sue_df_linked['bias'] = sue_df_linked['actual'] - sue_df_linked['surpmean'] 

In [12]:
sue_df_linked_merged = pd.merge_asof(sue_df_linked.sort_values('anndats'), 
                                     sics_sc_df.sort_values('date'), 
                                     by='PERMNO', 
                                     left_on='anndats', 
                                     right_on='date', 
                                     tolerance=pd.Timedelta(days=28), 
                                     direction='nearest')

In [13]:
sue_df_linked_merged = sue_df_linked_merged[(sue_df_linked_merged['SHRCD']==10) | (sue_df_linked_merged['SHRCD']==11)]

In [14]:
sue_df_linked_merged['SICCD'] = sue_df_linked_merged['SICCD'].apply(lambda x: math.nan if x == 'Z' else x)
sue_df_linked_merged['SICCD'] = sue_df_linked_merged['SICCD'].astype(float)

In [15]:
sue_df_linked_merged = sue_df_linked_merged[~((sue_df_linked_merged['SICCD']<=4949) & (sue_df_linked_merged['SICCD']>=4900))]
sue_df_linked_merged = sue_df_linked_merged[~((sue_df_linked_merged['SICCD']<=6999) & (sue_df_linked_merged['SICCD']>=6000))]

In [80]:
sue_df_linked_merged = pd.merge(sue_df_linked_merged, 
         numest_df, 
         left_on=['TICKER', 'anndats'],
         right_on=['TICKER', 'ANNDATS_ACT'],
         how='left')

In [85]:
sue_df_linked_merged.to_pickle('data/checkpoint_data/sue_df_linked.pkl')
# sue_df_linked_merged.to_pickle('data/checkpoint_data/sue_df_linked_qtr.pkl')