In [3]:
import pandas as pd
import polars as pl
import numpy as np

from factorlib.utils.system import get_data_dir

In [4]:
# Training universe: the first column of tickers_to_train.csv as a plain Python list.
raw_data_dir = get_data_dir() / 'raw'
tickers = (
    pl.scan_csv(raw_data_dir / 'tickers_to_train.csv')
    .collect(streaming=True)
    .to_series()  # default index 0 -> first column of the frame
    .to_list()
)
tickers

['AAME',
 'AAON',
 'AATC',
 'AB',
 'ABC',
 'ABCB',
 'ABM',
 'ABMD',
 'ABT',
 'ACFN',
 'ACGL',
 'ACIW',
 'ACU',
 'ACUR',
 'ADBE',
 'ADC',
 'ADM',
 'ADP',
 'ADSK',
 'ADTN',
 'AE',
 'AEE',
 'AEGN',
 'AEIS',
 'AEM',
 'AEO',
 'AEP',
 'AES',
 'AET',
 'AEY',
 'AFAM',
 'AFG',
 'AFL',
 'AGCO',
 'AGM',
 'AGX',
 'AGYS',
 'AIG',
 'AIN',
 'AIR',
 'AIRT',
 'AIT',
 'AIV',
 'AJG',
 'AJRD',
 'AKR',
 'AKRXQ',
 'ALB',
 'ALCO',
 'ALE',
 'ALG',
 'ALJJ',
 'ALK',
 'ALKS',
 'ALL',
 'ALOG',
 'ALOT',
 'ALX',
 'AMAG',
 'AME',
 'AMED',
 'AMGN',
 'AMOT',
 'AMRN',
 'AMS',
 'AMSC',
 'AMSWA',
 'AMWD',
 'AN',
 'ANDE',
 'ANIK',
 'ANIX',
 'AON',
 'AOS',
 'AP',
 'APA',
 'APD',
 'APH',
 'ARCB',
 'ARGO',
 'AROW',
 'ARTNA',
 'ARTW',
 'ARW',
 'ASB',
 'ASGN',
 'ASH',
 'ASML',
 'ASRV',
 'ASTC',
 'ASTE',
 'ASUR',
 'ASYS',
 'ATGE',
 'ATI',
 'ATNI',
 'ATO',
 'ATR',
 'ATRI',
 'ATRO',
 'AUB',
 'AUBN',
 'AVA',
 'AVB',
 'AVD',
 'AVDL',
 'AVID',
 'AWR',
 'AXAS',
 'AXE',
 'AXP',
 'AXR',
 'AZN',
 'AZO',
 'AZPN',
 'AZZ',
 'B',
 'BA',
 'B

In [5]:
# Build the per-announcement earnings-surprise table, restricted to the
# training tickers. Everything runs as a single lazy query so the CSV is
# scanned, filtered and projected once instead of being collected three times.
#
# Output columns:
#   date_index        - 'date' cast to pl.Datetime
#   ticker            - 'symbol' renamed
#   earnings_surprise - (actual - estimate) / |estimate|
reported_eps = pl.col('actualEarningResult')
expected_eps = pl.col('estimatedEarning')

earnings_surprises = (
    pl.scan_csv(raw_data_dir / 'earnings_surprises.csv', try_parse_dates=True)
    .filter(pl.col('symbol').is_in(tickers))
    .select(
        pl.col('date').cast(pl.Datetime).alias('date_index'),
        pl.col('symbol').alias('ticker'),
        # Divide by the magnitude of the estimate: with a signed denominator a
        # beat on a negative estimate (e.g. -0.5 actual vs -1.0 expected) would
        # be reported as a negative surprise.
        # A zero estimate has no defined relative surprise, so emit null rather
        # than letting inf/NaN flow into downstream steps.
        pl.when(expected_eps != 0)
        .then((reported_eps - expected_eps) / expected_eps.abs())
        .otherwise(None)
        .alias('earnings_surprise'),
    )
    .collect(streaming=True)
)
earnings_surprises

date_index,ticker,earnings_surprise
datetime[μs],str,f64
2023-05-04 00:00:00,"""AAON""",0.264151
2023-02-27 00:00:00,"""AAON""",0.267857
2022-11-07 00:00:00,"""AAON""",0.214286
2022-08-08 00:00:00,"""AAON""",-0.268293
2022-05-05 00:00:00,"""AAON""",0.222222
2022-02-28 00:00:00,"""AAON""",-0.357143
2021-11-04 00:00:00,"""AAON""",-0.236842
2021-08-05 00:00:00,"""AAON""",0.1875
2021-05-06 00:00:00,"""AAON""",0.111111
2021-02-25 00:00:00,"""AAON""",-0.068966


In [6]:
# Coverage check: number of distinct tickers that have at least one earnings-surprise row.
earnings_surprises.get_column('ticker').n_unique()

1097

In [7]:
# Keep only the first row for each (date_index, ticker) pair, then index the
# frame by date_index so that 'ticker' and 'earnings_surprise' remain columns.
no_duplicates = (
    earnings_surprises.to_pandas()
    .drop_duplicates(subset=['date_index', 'ticker'], keep='first')
    .set_index('date_index')
)
# Display chronologically within each ticker.
no_duplicates.reset_index().sort_values(['ticker', 'date_index'])

Unnamed: 0,date_index,ticker,earnings_surprise
106,1994-07-12,AAON,0.000000
105,1994-10-11,AAON,0.041667
104,1995-02-15,AAON,0.100110
103,1995-04-18,AAON,-0.266825
102,1995-07-20,AAON,-0.096845
...,...,...,...
99734,2020-11-09,ZIXI,0.133333
99733,2021-02-25,ZIXI,0.000000
99732,2021-05-05,ZIXI,0.000000
99731,2021-08-05,ZIXI,0.000000


In [9]:
# Upsample each ticker's announcements to a daily grid, carrying the most
# recent value forward, then drop the group level that groupby adds to the index
# so the result is indexed by date_index alone.
earnings_surprises_resampled = (
    no_duplicates
    .reset_index()
    .set_index('date_index')
    .groupby('ticker')
    .resample('1d', convention='start')
    .ffill()
    .droplevel(0)
)
earnings_surprises_resampled

Unnamed: 0_level_0,ticker,earnings_surprise
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1
1994-07-12,AAON,0.0
1994-07-13,AAON,0.0
1994-07-14,AAON,0.0
1994-07-15,AAON,0.0
1994-07-16,AAON,0.0
...,...,...
2021-11-04,ZIXI,0.0
2021-11-05,ZIXI,0.0
2021-11-06,ZIXI,0.0
2021-11-07,ZIXI,0.0


In [23]:
# Persist the daily earnings-surprise table under <data_dir>/fundamental.
fundamental_data_dir = get_data_dir() / 'fundamental'
# to_csv does not create missing parent directories, so make sure the target
# exists first (no-op when it is already there).
# NOTE(review): assumes get_data_dir() returns a pathlib.Path — confirm.
fundamental_data_dir.mkdir(parents=True, exist_ok=True)
earnings_surprises_resampled.to_csv(fundamental_data_dir / 'earnings_surprises.csv')