In [220]:
import sqlite3
import numpy as np
import pandas as pd

In [345]:
compustat_db = sqlite3.connect('../data/compustat.db.sqlite')
compustatq_data = pd.read_sql_query('SELECT * FROM compustatq', compustat_db)

In [269]:
compustatq_data

Unnamed: 0,index,gvkey,datadate,saleq,cshoq,ceqq,ibq,prccq,rdq,tic,dvy,fyr,fqtr,fyearq,year
0,0,001410,2019-01-31 00:00:00,1607.900,66.210,1461.100,13.000,34.19,2019-03-06,ABM,11.900,10,1,2019,2019
1,346,144517,2019-01-31 00:00:00,,6.750,,,172.69,,SOXX,,7,2,2019,2019
2,345,144516,2019-01-31 00:00:00,,11.150,,,192.22,,IGV,,7,2,2019,2019
3,344,142789,2019-01-31 00:00:00,,7.750,,,189.53,,IGM,,7,2,2019,2019
4,343,142188,2019-01-31 00:00:00,,19.700,,,105.86,,ICF,,4,3,2018,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158678,152354,032878,2022-12-31 00:00:00,7.777,82.356,80.883,-68.689,0.60,2023-04-28,BTBT,0.000,12,4,2022,2022
158679,152355,032881,2022-12-31 00:00:00,,0.760,,,,,SIMS,,12,4,2022,2022
158680,152356,032882,2022-12-31 00:00:00,,0.630,,,,,FITE,,12,4,2022,2022
158681,152358,032884,2022-12-31 00:00:00,464.836,324.277,2240.035,81.927,8.74,2023-03-02,PAGS,0.000,12,4,2022,2022


We first get a list of earning dates with the corresponding companies.

In [270]:
compustatq_cleaned_data = (compustatq_data
    .drop_duplicates(subset=['tic', 'rdq'])
    .get(['tic', 'rdq'])
    .assign(rdq=lambda x: pd.to_datetime(x['rdq'], format='%Y-%m-%d', errors='coerce'))
    .dropna()
    .reset_index(drop=True)
)

In [272]:
crsp_db = sqlite3.connect('../data/crsp.db.sqlite')
crsp_data = pd.read_sql_query('SELECT ticker, date FROM crsp_daily', crsp_db)

We then add earning announcements rates to the right of the CRSP data.

In [273]:
crsp_cleaned_data = (crsp_data
    .drop_duplicates(subset=['ticker', 'date'])
    .dropna()
    .assign(date=lambda x: pd.to_datetime(x['date']))
    .reset_index(drop=True)
)

In [274]:
rdq_merged_data = (crsp_cleaned_data
    .merge(compustatq_cleaned_data, left_on=['ticker', 'date'], right_on=['tic', 'rdq'], how='left')
    .drop(columns=['tic'])
    .sort_values(by=['ticker', 'date'])
)

We make columns for the dates days after the earning announcements.

In [275]:
for i in range(4):
    rdq_merged_data[f'rdq_{i}'] = rdq_merged_data.groupby('ticker')['rdq'].shift(i)
rdq_merged_data['rdq_exists'] = sum([rdq_merged_data[f'rdq_{i}'].notna() for i in range(4)]) > 0
rdq_merged_data = (
    rdq_merged_data
    .replace({False: np.nan})
    .dropna(subset=['rdq_exists'])
    .drop(columns=['rdq_exists'])
)

Now we add the returns for the company on the days after the earnings announcement and combine them based on two cases.
1. Earning announcement is during the day
2. Earnings announcement is after hours.

In [276]:
crsp_ret_data = (pd.read_sql_query('SELECT ticker, ret, date FROM crsp_daily', crsp_db)
                 .drop_duplicates(subset=['ticker', 'date']).dropna().assign(date=lambda x: pd.to_datetime(x['date']))
                 .reset_index(drop=True)
)

In [277]:
compustat_all_data = (pd.read_sql_query('SELECT * FROM compustatq', compustat_db)
                      .drop_duplicates(subset=['tic', 'rdq'])
                      .assign(rdq=lambda x: pd.to_datetime(x['rdq'], format='%Y-%m-%d', errors='coerce'))
                      .reset_index(drop=True)
)

In [278]:
rdq_ret = (rdq_merged_data
    .merge(crsp_ret_data, left_on=['date', 'ticker'], right_on=['date', 'ticker'], how='left')
    .merge(compustat_all_data, left_on=['rdq', 'ticker'], right_on=['rdq', 'tic'], how='left')
)

In [279]:
for i in range(4):
    rdq_ret[f'rdq_{i}_ret'] = rdq_ret.groupby('ticker')['ret'].shift(-i)

In [280]:
rdq_ret['ret_after_hours'] = (1+rdq_ret['rdq_1_ret']) * (1+rdq_ret['rdq_2_ret']) * (1+rdq_ret['rdq_3_ret']) - 1
rdq_ret['ret_during_day'] = (1+rdq_ret['rdq_1_ret']) * (1+rdq_ret['rdq_2_ret']) * (1+rdq_ret['rdq_0_ret']) - 1

Drop intermittent columns.

In [281]:
rdq_final_ret = (
    rdq_ret
    .dropna(subset=['ret_after_hours', 'ret_during_day', 'rdq'])
    .drop(columns=['ret', 'rdq_0_ret', 'rdq_1_ret', 'rdq_2_ret', 'rdq_3_ret', 'rdq_0', 'rdq_1', 'rdq_2', 'rdq_3', 'ticker'])
    .query('rdq > "2020-01-01"')
    .reset_index(drop=True)
)

In [282]:
rdq_final_ret.head(5)

Unnamed: 0,date,rdq,index,gvkey,datadate,saleq,cshoq,ceqq,ibq,prccq,tic,dvy,fyr,fqtr,fyearq,year,ret_after_hours,ret_during_day
0,2020-02-18,2020-02-18,38318.0,126554,2020-01-31 00:00:00,1357.0,310.048,4848.0,197.0,82.56,A,56.0,10.0,1.0,2020.0,2020.0,0.003302,-0.017245
1,2020-05-21,2020-05-21,48150.0,126554,2020-04-30 00:00:00,1238.0,308.443,4768.0,101.0,76.66,A,111.0,10.0,2.0,2020.0,2020.0,0.067246,0.054094
2,2020-08-18,2020-08-18,57945.0,126554,2020-07-31 00:00:00,1261.0,308.578,4981.0,199.0,96.33,A,167.0,10.0,3.0,2020.0,2020.0,0.004497,0.002769
3,2020-11-23,2020-11-23,67698.0,126554,2020-10-31 00:00:00,1483.0,306.198,4873.0,222.0,102.09,A,222.0,10.0,4.0,2020.0,2020.0,0.016753,0.031202
4,2021-02-16,2021-02-16,77415.0,126554,2021-01-31 00:00:00,1548.0,304.905,4804.0,288.0,120.17,A,59.0,10.0,1.0,2021.0,2021.0,-0.008441,-0.002344


We now get 2019 data so that we can calculate seasonal and random walk.

In [358]:
compustat2019_data = (pd.read_sql_query('SELECT * FROM compustatq', compustat_db)
                      .drop_duplicates(subset=['tic', 'rdq'])
                      .assign(rdq=lambda x: pd.to_datetime(x['rdq'], format='%Y-%m-%d', errors='coerce'))
                      .query('year == 2019')
                      .reset_index(drop=True)
)

In [359]:
complete_data = (pd.concat([compustat2019_data,rdq_final_ret], ignore_index=True)
                 .sort_values(by=['tic', 'datadate'])
                 .drop_duplicates(subset=['tic', 'datadate'])
                 .dropna(subset=['tic', 'datadate', 'cshoq', 'prccq'])
                 .reset_index(drop=True)
)

In [360]:
complete_data.describe()

Unnamed: 0,index,saleq,cshoq,ceqq,ibq,prccq,rdq,dvy,fyr,fqtr,fyearq,year,date,ret_after_hours,ret_during_day
count,61288.0,60665.0,61288.0,60761.0,60816.0,61288.0,60883,59559.0,61288.0,61288.0,61288.0,61288.0,38155,38155.0,38155.0
mean,65655.112404,1225.737313,182.641523,2827.114514,95.894613,50.830739,2020-12-06 04:58:19.301939968,105.545046,10.68493,2.436872,2020.226847,2020.220924,2021-08-24 02:40:35.224740864,0.004625,0.005085
min,0.0,-4423.855,0.0,-134605.0,-20070.0,1e-06,2019-02-13 00:00:00,0.0,1.0,1.0,2018.0,2019.0,2020-02-12 00:00:00,-0.748718,-0.693247
25%,25311.75,10.307,20.575,56.413,-6.20375,5.9975,2019-11-08 00:00:00,0.0,12.0,1.0,2019.0,2019.0,2020-12-01 00:00:00,-0.044183,-0.049923
50%,60172.0,98.116,48.406,304.966,1.835,18.7,2020-11-02 00:00:00,0.0,12.0,2.0,2020.0,2020.0,2021-08-12 00:00:00,0.001418,0.001517
75%,105777.25,526.472,120.43175,1267.722,35.00775,47.7425,2021-11-15 00:00:00,13.5465,12.0,3.0,2021.0,2021.0,2022-05-05 00:00:00,0.047729,0.053875
max,149088.0,152871.0,67698.494,424791.0,34630.0,31700.0,2023-06-09 00:00:00,18135.0,12.0,4.0,2023.0,2022.0,2022-12-27 00:00:00,2.377484,2.315053
std,44998.268156,5539.300085,874.512497,12956.841075,724.620054,250.98917,,597.20719,2.938278,1.096988,1.158757,1.138912,,0.108297,0.118136


We add ME and BM and get rid of 0 ME firms.

In [361]:
complete_data['me'] = complete_data['cshoq'] * complete_data['prccq']
complete_data['bm'] = complete_data['ceqq'] / complete_data['me']
complete_data = (
    complete_data
    .assign(bm=lambda x: x['bm'].apply(lambda x: 0.5 if x < 0 else x))
    .assign(cshoq=lambda x: x['cshoq'].replace(0,np.nan))
    .dropna(subset=['tic','datadate', 'cshoq'])
    .reset_index(drop=True)
)

In [362]:
complete_data.head(2)

Unnamed: 0,index,gvkey,datadate,saleq,cshoq,ceqq,ibq,prccq,rdq,tic,dvy,fyr,fqtr,fyearq,year,date,ret_after_hours,ret_during_day,me,bm
0,325.0,126554,2019-01-31 00:00:00,1284.0,318.0,5036.0,504.0,76.05,2019-02-20,A,52.0,10.0,1.0,2019.0,2019.0,NaT,,,24183.9,0.208238
1,9902.0,126554,2019-04-30 00:00:00,1238.0,316.991,5125.0,182.0,78.5,2019-05-14,A,104.0,10.0,2.0,2019.0,2019.0,NaT,,,24883.7935,0.205957


We identify the top and bottom trecile in terms of BM.

In [363]:
bminfo = (
    complete_data.get(['tic', 'datadate', 'bm'])
    .groupby('datadate')
    .apply(lambda x: x
           .assign(bm_percentile = x['bm'].rank(pct=True))
           .assign(bm_bucket = lambda x: x
                   .apply(lambda x: 0 if x['bm_percentile'] < 0.333 else 1 if x['bm_percentile'] < 0.6666 else 2, axis = 1))
           )
    .reset_index(drop=True)
    .sort_values(by=['tic', 'datadate'])
    .drop(columns=['bm', 'bm_percentile'])
    .reset_index(drop=True)
)
bminfo.head(2)


  .apply(lambda x: x


Unnamed: 0,tic,datadate,bm_bucket
0,A,2019-01-31 00:00:00,0
1,A,2019-04-30 00:00:00,0


We identify the top and bottom quartiles for dividends from year to date.

In [364]:
dvyinfo = (
    complete_data
    .get(['tic', 'datadate', 'dvy', 'me'])
    .assign(dvy = lambda x: x['dvy']/x['me'])
    .drop(columns=['me'])
    .groupby('datadate')
    .apply(lambda x: x
           .assign(dvy_percentile = x['dvy'].rank(pct=True))
           .assign(dvy_bucket = lambda x: x['dvy_percentile']
                   .apply(lambda x: 0 if x < 0.25 else 1 if x < 0.50 else 2 if x < 0.75 else 3))
           )
    .reset_index(drop=True)
    .sort_values(by=['tic', 'datadate'])
    .drop(columns=['dvy', 'dvy_percentile'])
    .reset_index(drop=True)
)
dvyinfo.head(2)

  .apply(lambda x: x


Unnamed: 0,tic,datadate,dvy_bucket
0,A,2019-01-31 00:00:00,2
1,A,2019-04-30 00:00:00,2


We then merge the two and add it to the daily stock data we have so far.

In [365]:
complete_data = (
    complete_data
    .merge(bminfo, left_on=['tic', 'datadate'], right_on=['tic', 'datadate'], how='left')
    .merge(dvyinfo, left_on=['tic', 'datadate'], right_on=['tic', 'datadate'], how='left')
    .reset_index(drop=True)
)
complete_data

Unnamed: 0,index,gvkey,datadate,saleq,cshoq,ceqq,ibq,prccq,rdq,tic,...,fqtr,fyearq,year,date,ret_after_hours,ret_during_day,me,bm,bm_bucket,dvy_bucket
0,325.0,126554,2019-01-31 00:00:00,1284.000,318.000,5036.000,504.000,76.05,2019-02-20,A,...,1.0,2019.0,2019.0,NaT,,,24183.90000,0.208238,0,2
1,9902.0,126554,2019-04-30 00:00:00,1238.000,316.991,5125.000,182.000,78.50,2019-05-14,A,...,2.0,2019.0,2019.0,NaT,,,24883.79350,0.205957,0,2
2,19457.0,126554,2019-07-31 00:00:00,1274.000,309.597,4747.000,191.000,69.41,2019-08-14,A,...,3.0,2019.0,2019.0,NaT,,,21489.12777,0.220902,0,2
3,28950.0,126554,2019-10-31 00:00:00,1367.000,309.071,4748.000,194.000,75.75,2019-11-25,A,...,4.0,2019.0,2019.0,NaT,,,23412.12825,0.202801,0,2
4,38318.0,126554,2020-01-31 00:00:00,1357.000,310.048,4848.000,197.000,82.56,2020-02-18,A,...,1.0,2020.0,2020.0,2020-02-18,0.003302,-0.017245,25597.56288,0.189393,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61282,104994.0,129441,2021-09-30 00:00:00,34.786,34.775,63.383,6.107,11.39,2021-11-02,ZYXI,...,3.0,2021.0,2021.0,2021-11-02,0.122905,0.128673,396.08725,0.160023,0,1
61283,115153.0,129441,2021-12-31 00:00:00,40.366,36.131,73.925,8.894,9.97,2022-02-24,ZYXI,...,4.0,2021.0,2021.0,2022-02-24,-0.094225,-0.055556,360.22607,0.205218,0,2
61284,125488.0,129441,2022-03-31 00:00:00,31.083,39.777,75.818,1.377,6.23,2022-04-28,ZYXI,...,1.0,2022.0,2022.0,2022-04-28,-0.086834,-0.113343,247.81071,0.305951,0,3
61285,135839.0,129441,2022-06-30 00:00:00,36.759,38.404,69.009,3.346,7.98,2022-07-28,ZYXI,...,2.0,2022.0,2022.0,2022-07-28,0.082051,0.146631,306.46392,0.225178,0,3


In [477]:
complete_data = (
    complete_data
    .assign(saleq = lambda x: x['saleq'].replace(0, np.nan))
    .assign(ibq = lambda x: x['ibq'].replace(0, np.nan))
    .dropna(subset=['saleq', 'ibq'])
)

We now want to calculate the quartely and yearly walks to get our surprise variables. We will get our data from a year and a quarter ago.

In [478]:
shift_year = (
    complete_data
    .sort_values(by=['tic', 'datadate'])
    .assign(saleq_4 = lambda x: x.groupby('tic')['saleq'].shift(4))
    .assign(ibq_4 = lambda x: x.groupby('tic')['ibq'].shift(4))
    .assign(fqtr_4 = lambda x: x.groupby('tic')['fqtr'].shift(4))
    .assign(fyearq_4 = lambda x: x.groupby('tic')['fyearq'].shift(4))
)
for row in shift_year.iterrows():
    try:
        assert row[1]['fyearq'] == row[1]['fyearq_4'] + 1
        assert row[1]['fqtr'] == row[1]['fqtr_4']
    except:
        row[1]['saleq'] = np.nan
        row[1]['ibq'] = np.nan
shift_year.dropna(subset=['saleq', 'ibq'], inplace=True)

In [479]:
shift_quarter = (
    complete_data
    .sort_values(by=['tic', 'datadate'])
    .assign(saleq_1 = lambda x: x.groupby('tic')['saleq'].shift(1))
    .assign(ibq_1 = lambda x: x.groupby('tic')['ibq'].shift(1))
    .assign(fqtr_1 = lambda x: x.groupby('tic')['fqtr'].shift(1))
    .assign(fyearq_1 = lambda x: x.groupby('tic')['fyearq'].shift(1))
)
for row in shift_quarter.iterrows():
    try:
        assert ((row[1]['fyearq']) == (row[1]['fyearq_1'] + 1) and row[1]['fqtr'] == 1) or ((row[1]['fyearq']) == (row[1]['fyearq_1']))
        assert row[1]['fqtr']%4 == (row[1]['fqtr_1']+1)%4
    except:
        row[1]['saleq'] = np.nan
        row[1]['ibq'] = np.nan
shift_quarter = (shift_quarter
              .dropna(subset=['saleq', 'ibq'])
)

In [480]:
shift_merge = (
    shift_year
    .get(['tic', 'datadate', 'saleq_4', 'ibq_4'])
    .merge(shift_quarter, on=['tic', 'datadate'], how='inner')
    .dropna(subset=['saleq_1', 'ibq_1', 'saleq_4', 'ibq_4'])
    .reset_index(drop=True)
)
shift_merge.head(2)

Unnamed: 0,tic,datadate,saleq_4,ibq_4,index,gvkey,saleq,cshoq,ceqq,ibq,...,ret_after_hours,ret_during_day,me,bm,bm_bucket,dvy_bucket,saleq_1,ibq_1,fqtr_1,fyearq_1
0,A,2020-01-31 00:00:00,1284.0,504.0,38318.0,126554,1357.0,310.048,4848.0,197.0,...,0.003302,-0.017245,25597.56288,0.189393,0,2,1367.0,194.0,4.0,2019.0
1,A,2020-04-30 00:00:00,1238.0,182.0,48150.0,126554,1238.0,308.443,4768.0,101.0,...,0.067246,0.054094,23645.24038,0.201647,0,2,1357.0,197.0,1.0,2020.0


I clean up the columns to only get data we need now.

In [512]:
needed_data = (
    shift_merge
    .get(['tic', 'datadate','me', 'fqtr', 'saleq', 'ibq','saleq_1', 'ibq_1', 'saleq_4', 'ibq_4','dvy','bm' ,'bm_bucket', 'dvy_bucket', 'ret_after_hours', 'ret_during_day'])
)
needed_data.head(2)

Unnamed: 0,tic,datadate,me,fqtr,saleq,ibq,saleq_1,ibq_1,saleq_4,ibq_4,dvy,bm,bm_bucket,dvy_bucket,ret_after_hours,ret_during_day
0,A,2020-01-31 00:00:00,25597.56288,1.0,1357.0,197.0,1367.0,194.0,1284.0,504.0,56.0,0.189393,0,2,0.003302,-0.017245
1,A,2020-04-30 00:00:00,23645.24038,2.0,1238.0,101.0,1357.0,197.0,1238.0,182.0,111.0,0.201647,0,2,0.067246,0.054094


We say that a firm is growth if it has low BM and not high Dividend yields. We say that a firm is value if the dividend yields are high OR the BM is high.

In [513]:
needed_data['growth'] = np.select([(needed_data['bm_bucket'] == 0) & (needed_data['dvy_bucket'] != 3)], [True], default=False)
needed_data['value'] = np.select([ needed_data['dvy_bucket'] == 3, needed_data['bm_bucket'] == 2], [True, True], default=False)  
needed_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  needed_data['growth'] = np.select([(needed_data['bm_bucket'] == 0) & (needed_data['dvy_bucket'] != 3)], [True], default=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  needed_data['value'] = np.select([ needed_data['dvy_bucket'] == 3, needed_data['bm_bucket'] == 2], [True, True], default=False)


Unnamed: 0,tic,datadate,me,fqtr,saleq,ibq,saleq_1,ibq_1,saleq_4,ibq_4,dvy,bm,bm_bucket,dvy_bucket,ret_after_hours,ret_during_day,growth,value
0,A,2020-01-31 00:00:00,25597.56288,1.0,1357.0,197.0,1367.0,194.0,1284.0,504.0,56.0,0.189393,0,2,0.003302,-0.017245,True,False
1,A,2020-04-30 00:00:00,23645.24038,2.0,1238.0,101.0,1357.0,197.0,1238.0,182.0,111.0,0.201647,0,2,0.067246,0.054094,True,False


We now calculate the earning/sales surprises.

In [514]:
needed_data = (
    needed_data
    .assign(srwSale = lambda x: (x['saleq']-x['saleq_4'])/x['saleq_4'])
    .assign(srwEarning = lambda x: (x['ibq']-x['ibq_4'])/x['ibq_4'])
    .assign(rwSale = lambda x: (x['saleq']-x['saleq_1'])/x['saleq_1'])
    .assign(rwEarning = lambda x: (x['ibq']-x['ibq_1'])/x['ibq_1'])
    .assign(datadate = lambda x: pd.to_datetime(x['datadate']))
)
needed_data.head(2)

Unnamed: 0,tic,datadate,me,fqtr,saleq,ibq,saleq_1,ibq_1,saleq_4,ibq_4,...,bm_bucket,dvy_bucket,ret_after_hours,ret_during_day,growth,value,srwSale,srwEarning,rwSale,rwEarning
0,A,2020-01-31,25597.56288,1.0,1357.0,197.0,1367.0,194.0,1284.0,504.0,...,0,2,0.003302,-0.017245,True,False,0.056854,-0.609127,-0.007315,0.015464
1,A,2020-04-30,23645.24038,2.0,1238.0,101.0,1357.0,197.0,1238.0,182.0,...,0,2,0.067246,0.054094,True,False,0.0,-0.445055,-0.087693,-0.48731


IBES holds forecasts. We want to link IBES data with what we have. Measure EPS is earnings predictions and SAL is revenue/sale predictions.

In [515]:
ibes_db = sqlite3.connect('../data/ibes.db.sqlite')

In [588]:
ibes_history = pd.read_sql_query('SELECT * from ibes', ibes_db)
ibes_history.head(2)

Unnamed: 0,ticker,pyear,pmon,actual,anndats,suescore,surpmean,surpstdev,measure,fiscalp,oftic
0,000V,2019.0,12.0,-92.4,2020-03-13 00:00:00,-0.33978,-81.185,33.00618,EPS,ANN,EIGR
1,000V,2019.0,12.0,-20.4,2020-03-13 00:00:00,-0.26602,-12.91,28.15593,EPS,QTR,EIGR


In [589]:
import calendar, datetime
ibes_cleaned_history = (
    ibes_history
    .drop_duplicates(subset=['ticker', 'measure', 'anndats', 'fiscalp', 'pyear', 'pmon', 'suescore'])
    .dropna(subset=['measure', 'fiscalp', 'pyear', 'pmon', 'suescore'])
    .query('measure == "EPS" or measure == "SAL" or measure == "NET"')
    .query('fiscalp == "QTR"')
    .assign(datadate = lambda x: x[['pyear', 'pmon']]
            .apply(lambda x: datetime.datetime(int(x[0]), int(x[1]), calendar.monthrange(int(x[0]), int(x[1]))[1]), axis = 1))
    .query('datadate > "2020-01-01"')
    .sort_values(by=['ticker', 'datadate'])
    .reset_index(drop=True)
)
ibes_cleaned_history

  .apply(lambda x: datetime.datetime(int(x[0]), int(x[1]), calendar.monthrange(int(x[0]), int(x[1]))[1]), axis = 1))


Unnamed: 0,ticker,pyear,pmon,actual,anndats,suescore,surpmean,surpstdev,measure,fiscalp,oftic,datadate
0,000V,2020.0,3.0,-18.600,2020-05-07 00:00:00,1.23778,-24.08000,4.42728,EPS,QTR,EIGR,2020-03-31
1,000V,2020.0,3.0,-15.239,2020-05-07 00:00:00,3.67705,-17.96400,0.74108,NET,QTR,EIGR,2020-03-31
2,000V,2020.0,3.0,0.000,2020-05-07 00:00:00,-0.50000,0.62500,1.25000,SAL,QTR,EIGR,2020-03-31
3,000V,2020.0,6.0,-18.000,2020-08-06 00:00:00,1.04215,-21.40000,3.26250,EPS,QTR,EIGR,2020-06-30
4,000V,2020.0,6.0,-15.326,2020-08-06 00:00:00,0.96555,-16.52375,1.24049,NET,QTR,EIGR,2020-06-30
...,...,...,...,...,...,...,...,...,...,...,...,...
120547,ZYNX,2022.0,6.0,3.346,2022-07-28 00:00:00,1.24415,2.80775,0.43262,NET,QTR,ZYXI,2022-06-30
120548,ZYNX,2022.0,6.0,36.759,2022-07-28 00:00:00,0.70373,36.29650,0.65721,SAL,QTR,ZYXI,2022-06-30
120549,ZYNX,2022.0,9.0,0.130,2022-10-27 00:00:00,0.85551,0.12100,0.01052,EPS,QTR,ZYXI,2022-09-30
120550,ZYNX,2022.0,9.0,4.873,2022-10-27 00:00:00,0.36022,4.69475,0.49483,NET,QTR,ZYXI,2022-09-30


In [599]:
expense_table = (
    ibes_cleaned_history
    .query('measure == "NET"')
    .get(['oftic', 'datadate', 'suescore', 'actual', 'surpstdev'])
    .merge(
        (
            ibes_cleaned_history
            .query('measure == "SAL"')
            .get(['oftic', 'datadate', 'suescore', 'actual', 'surpstdev'])
        ),
        on=['oftic', 'datadate'],
        suffixes=('_net', '_sal')
        )
    )
expense_table.head(2)

Unnamed: 0,oftic,datadate,suescore_net,actual_net,surpstdev_net,suescore_sal,actual_sal,surpstdev_sal
0,EIGR,2020-03-31,3.67705,-15.239,0.74108,-0.5,0.0,1.25
1,EIGR,2020-09-30,-0.26353,-15.68,1.47519,-0.61357,0.0,2.60768


In [625]:
expense_cleaned_table = (
    expense_table
    .assign(sal_pred = lambda x: x['actual_sal'] + x['surpstdev_sal']*x['suescore_sal'])
    .assign(net_pred = lambda x: x['actual_net'] + x['surpstdev_net']*x['suescore_net'])
    .assign(expense_pred = lambda x: x['sal_pred'] - x['net_pred'])
    .assign(expense_actual = lambda x: x['actual_sal'] - x['actual_net'])
    .assign(expense_std = lambda x: np.sqrt(x['surpstdev_sal']**2 + x['surpstdev_net']**2))
    .assign(expense_surp = lambda x: -(x['expense_actual'] - x['expense_pred'])/x['expense_std'])
    .rename(columns={'expense_surp': 'suescore_exp'})
    .get(['oftic', 'datadate', 'suescore_exp'])
)

In [626]:
sales_earnings_surprise = (
    ibes_cleaned_history
    .query('measure == "EPS"')
    .get(['oftic', 'datadate', 'suescore'])
    .merge(
        (
            ibes_cleaned_history
            .query('measure == "SAL"')
            .get(['oftic', 'datadate', 'suescore'])
        ),
        left_on=['oftic', 'datadate'],
        right_on=['oftic', 'datadate'],
        suffixes=('_eps', '_sal')
    )
    .merge(
        expense_cleaned_table,
        on=['oftic', 'datadate'],
        how='left'
    )
)
sales_earnings_surprise.head(2)

Unnamed: 0,oftic,datadate,suescore_eps,suescore_sal,suescore_exp
0,EIGR,2020-03-31,1.23778,-0.5,-2.305299
1,EIGR,2020-09-30,1.13809,-0.61357,-0.404281


In [627]:
total_data = (
    needed_data
    .drop(columns=['saleq', 'ibq', 'saleq_1', 'ibq_1', 'saleq_4', 'ibq_4', 'bm_bucket', 'dvy', 'dvy_bucket'])
    .merge(sales_earnings_surprise, left_on=['tic', 'datadate'], right_on=['oftic', 'datadate'], how='left')
    .dropna(subset=['suescore_eps', 'suescore_sal', 'suescore_exp'])
    .reset_index(drop=True)
)

Now we can windsorize the data on the earnings. The following output is the same as Panel D table 1 in the paper.

In [628]:
quart = [0.10, 0.25, 0.5,0.75, 0.90]

In [659]:
#Windsorize the data
def windsorize(data):
    #Delete the data 1% and 99%
    return data.clip(data.quantile(0.01), data.quantile(0.99)).replace(data.quantile(0.01), np.nan).replace(data.quantile(0.99), np.nan)
clipped_data = (
    total_data
    .assign(ret_after_hours = lambda    x: windsorize(x['ret_after_hours']))
    .assign(ret_during_day = lambda x: windsorize(x['ret_during_day']))
    .assign(suescore_eps = lambda x: windsorize(x['suescore_eps']))
    .assign(suescore_sal = lambda x: windsorize(x['suescore_sal']))
    .assign(suescore_exp = lambda x: windsorize(x['suescore_exp']))
    .dropna()
)
def if_num_round(x):
    try:
        return round(float(x), 3)
    except:
        return x
clipped_data.describe(percentiles=quart).T.map(if_num_round).to_clipboard()


Panel A Table 1

In [630]:
clipped_data.assign(year = lambda x: x['datadate'].dt.year).groupby('year').size()

year
2020    7564
2021    7848
2022    6209
dtype: int64

Panel B Table 1

In [631]:
clipped_data.groupby('fqtr').size()

fqtr
1.0    5626
2.0    5852
3.0    5894
4.0    4249
dtype: int64

Panel C Table 1

In [632]:
clipped_data.groupby('oftic').size().reset_index(name='count').describe(percentiles=quart)

Unnamed: 0,count
count,2789.0
mean,7.752241
std,3.570955
min,1.0
10%,2.0
25%,5.0
50%,9.0
75%,11.0
90%,11.0
max,12.0


The following are my growth and value statistics for Panel E Table 1.

In [660]:
clipped_data.query('growth == True').describe(percentiles=quart).T.map(if_num_round).to_clipboard()

In [661]:
clipped_data.query('value == True').describe(percentiles=quart).T.map(if_num_round).to_clipboard()

Table 2 Panel A

In [635]:
# Regression on just SAR ~ SUE

import statsmodels.formula.api as smf
from regtabletotext import prettify_result
t1pa = smf.ols('ret_after_hours ~ suescore_eps', data=clipped_data).fit()
prettify_result(t1pa)

OLS Model:
ret_after_hours ~ suescore_eps

Coefficients:
              Estimate  Std. Error  t-Statistic  p-Value
Intercept       -0.001       0.001       -1.278    0.201
suescore_eps     0.003       0.000       18.060    0.000

Summary statistics:
- Number of observations: 21,621
- R-squared: 0.015, Adjusted R-squared: 0.015
- F-statistic: 326.182 on 1 and 21619 DF, p-value: 0.000



In [636]:
at_least_two_forecasts = ((clipped_data.groupby('oftic').size() > 1)
                          .replace({False: np.nan})
                          .dropna()
                          .reset_index()
                          .merge(clipped_data, left_on='oftic', right_on='oftic')
                          .drop(columns=[0])                       
)

At least two forecasts around the same number of observations

In [637]:
# Regression on just SAR ~ SUE

import statsmodels.formula.api as smf
from regtabletotext import prettify_result
t1pa = smf.ols('ret_after_hours ~ suescore_eps', data=at_least_two_forecasts).fit()
prettify_result(t1pa)

OLS Model:
ret_after_hours ~ suescore_eps

Coefficients:
              Estimate  Std. Error  t-Statistic  p-Value
Intercept       -0.001       0.001       -1.249    0.212
suescore_eps     0.003       0.000       17.830    0.000

Summary statistics:
- Number of observations: 21,405
- R-squared: 0.015, Adjusted R-squared: 0.015
- F-statistic: 317.900 on 1 and 21403 DF, p-value: 0.000



In [638]:
import statsmodels.formula.api as smf
from regtabletotext import prettify_result
t1pa = smf.ols('ret_during_day ~ suescore_sal + suescore_exp', data=clipped_data).fit()
prettify_result(t1pa)

OLS Model:
ret_during_day ~ suescore_sal + suescore_exp

Coefficients:
              Estimate  Std. Error  t-Statistic  p-Value
Intercept       -0.002       0.001       -3.240    0.001
suescore_sal     0.004       0.000       20.144    0.000
suescore_exp    -0.001       0.000       -2.295    0.022

Summary statistics:
- Number of observations: 21,621
- R-squared: 0.023, Adjusted R-squared: 0.023
- F-statistic: 250.564 on 2 and 21618 DF, p-value: 0.000



In [662]:
import statsmodels.formula.api as smf
from regtabletotext import prettify_result
t1pa = smf.ols('ret_during_day ~ suescore_sal + suescore_exp', data=at_least_two_forecasts).fit()
prettify_result(t1pa)

OLS Model:
ret_during_day ~ suescore_sal + suescore_exp

Coefficients:
              Estimate  Std. Error  t-Statistic  p-Value
Intercept       -0.002       0.001       -3.268    0.001
suescore_sal     0.004       0.000       20.121    0.000
suescore_exp    -0.001       0.000       -2.286    0.022

Summary statistics:
- Number of observations: 21,405
- R-squared: 0.023, Adjusted R-squared: 0.023
- F-statistic: 250.833 on 2 and 21402 DF, p-value: 0.000



In [642]:
clipped_data_with_dummy = (
    clipped_data
    .assign(value = lambda x: x['value'].replace({True: 1, False: 0}))
)
t3m1 = smf.ols('ret_after_hours ~ suescore_eps*value + suescore_eps + value', data=clipped_data_with_dummy).fit()
prettify_result(t3m1)

OLS Model:
ret_after_hours ~ suescore_eps*value + suescore_eps + value

Coefficients:
                    Estimate  Std. Error  t-Statistic  p-Value
Intercept             -0.003       0.001       -3.686    0.000
suescore_eps           0.003       0.000       14.177    0.000
value                  0.005       0.001        4.091    0.000
suescore_eps:value    -0.000       0.000       -0.407    0.684

Summary statistics:
- Number of observations: 21,621
- R-squared: 0.016, Adjusted R-squared: 0.016
- F-statistic: 115.134 on 3 and 21617 DF, p-value: 0.000



  .assign(value = lambda x: x['value'].replace({True: 1, False: 0}))


In [643]:
t3m2 = smf.ols('ret_during_day ~ value + suescore_sal + suescore_exp + suescore_sal*value + suescore_exp*value', data=clipped_data_with_dummy).fit()
prettify_result(t3m2)

OLS Model:
ret_during_day ~ value + suescore_sal + suescore_exp + suescore_sal*value 
 + suescore_exp*value

Coefficients:
                    Estimate  Std. Error  t-Statistic  p-Value
Intercept             -0.006       0.001       -6.155    0.000
value                  0.007       0.001        5.401    0.000
suescore_sal           0.004       0.000       16.149    0.000
suescore_exp          -0.000       0.000       -0.917    0.359
suescore_sal:value     0.001       0.000        2.190    0.029
suescore_exp:value    -0.001       0.001       -1.639    0.101

Summary statistics:
- Number of observations: 21,621
- R-squared: 0.025, Adjusted R-squared: 0.025
- F-statistic: 110.295 on 5 and 21615 DF, p-value: 0.000

