In [96]:
import pandas as pd
import polars as pl
import numpy as np
from scipy.stats import zscore

from factorlib.utils.system import get_data_dir

In [97]:
raw_data_dir = get_data_dir() / 'raw'
tickers = pl.scan_csv(raw_data_dir / 'tickers_to_train.csv').collect(streaming=True).to_series().to_list()
tickers

['AAME',
 'AAON',
 'AATC',
 'AB',
 'ABC',
 'ABCB',
 'ABM',
 'ABMD',
 'ABT',
 'ACFN',
 'ACGL',
 'ACIW',
 'ACU',
 'ACUR',
 'ADBE',
 'ADC',
 'ADM',
 'ADP',
 'ADSK',
 'ADTN',
 'AE',
 'AEE',
 'AEGN',
 'AEIS',
 'AEM',
 'AEO',
 'AEP',
 'AES',
 'AET',
 'AEY',
 'AFAM',
 'AFG',
 'AFL',
 'AGCO',
 'AGM',
 'AGX',
 'AGYS',
 'AIG',
 'AIN',
 'AIR',
 'AIRT',
 'AIT',
 'AIV',
 'AJG',
 'AJRD',
 'AKR',
 'AKRXQ',
 'ALB',
 'ALCO',
 'ALE',
 'ALG',
 'ALJJ',
 'ALK',
 'ALKS',
 'ALL',
 'ALOG',
 'ALOT',
 'ALX',
 'AMAG',
 'AME',
 'AMED',
 'AMGN',
 'AMOT',
 'AMRN',
 'AMS',
 'AMSC',
 'AMSWA',
 'AMWD',
 'AN',
 'ANDE',
 'ANIK',
 'ANIX',
 'AON',
 'AOS',
 'AP',
 'APA',
 'APD',
 'APH',
 'ARCB',
 'ARGO',
 'AROW',
 'ARTNA',
 'ARTW',
 'ARW',
 'ASB',
 'ASGN',
 'ASH',
 'ASML',
 'ASRV',
 'ASTC',
 'ASTE',
 'ASUR',
 'ASYS',
 'ATGE',
 'ATI',
 'ATNI',
 'ATO',
 'ATR',
 'ATRI',
 'ATRO',
 'AUB',
 'AUBN',
 'AVA',
 'AVB',
 'AVD',
 'AVDL',
 'AVID',
 'AWR',
 'AXAS',
 'AXE',
 'AXP',
 'AXR',
 'AZN',
 'AZO',
 'AZPN',
 'AZZ',
 'B',
 'BA',
 'B

In [98]:
relevant_balance_sheets = pl.scan_csv(raw_data_dir / 'balance_sheet_statements.csv', try_parse_dates=True).collect(
    streaming=True)
current_ratio = (
    relevant_balance_sheets.lazy()
    .select(
        pl.col('date').cast(pl.Datetime).alias('date_index'),
        pl.col('symbol').alias('ticker'),
        (pl.col('totalCurrentAssets').cast(pl.Float64) / pl.col('totalCurrentLiabilities').cast(pl.Float64)).alias('currentRatio')
    )
    .collect(streaming=True)
)
current_ratio = current_ratio.lazy().filter(pl.col('ticker').is_in(tickers)).collect(streaming=True).to_pandas()
current_ratio = current_ratio.sort_values(['ticker', 'date_index']).dropna()
current_ratio

Unnamed: 0,date_index,ticker,currentRatio
125,1991-03-31,AAME,0.000000
124,1991-06-30,AAME,9.731844
123,1991-09-30,AAME,7.927273
122,1991-12-31,AAME,12.992126
121,1992-03-31,AAME,4.223650
...,...,...,...
141573,2020-09-30,ZIXI,0.571908
141572,2020-12-31,ZIXI,0.553648
141571,2021-03-31,ZIXI,0.581397
141570,2021-06-30,ZIXI,0.679757


In [99]:
industry_codes = pl.scan_csv(raw_data_dir / 'ohclv_daily.csv', try_parse_dates=True).collect(
    streaming=True)
industry_codes = industry_codes.lazy().select(
    pl.col('datadate').cast(pl.Datetime).alias('date_index'),
    pl.col('tic').alias('ticker'),
    pl.col('gind').alias('industry')
)
industry_codes = industry_codes.lazy().filter(pl.col('ticker').is_in(tickers)).collect(streaming=True)
industry_codes = industry_codes.to_pandas()

In [100]:
merged_data = current_ratio.merge(industry_codes, how='inner', on=['date_index', 'ticker'])
merged_data

Unnamed: 0,date_index,ticker,currentRatio,industry
0,1991-09-30,AAME,7.927273,403010
1,1991-12-31,AAME,12.992126,403010
2,1992-03-31,AAME,4.223650,403010
3,1992-06-30,AAME,4.095960,403010
4,1992-09-30,AAME,4.300275,403010
...,...,...,...,...
90256,2020-09-30,ZIXI,0.571908,451030
90257,2020-12-31,ZIXI,0.553648,451030
90258,2021-03-31,ZIXI,0.581397,451030
90259,2021-06-30,ZIXI,0.679757,451030


In [101]:
no_duplicates = merged_data.set_index(['date_index', 'ticker'])
no_duplicates = no_duplicates.loc[~merged_data.index.duplicated(keep='first')].reset_index(level=1)
no_duplicates

Unnamed: 0_level_0,ticker,currentRatio,industry
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1991-09-30,AAME,7.927273,403010
1991-12-31,AAME,12.992126,403010
1992-03-31,AAME,4.223650,403010
1992-06-30,AAME,4.095960,403010
1992-09-30,AAME,4.300275,403010
...,...,...,...
2020-09-30,ZIXI,0.571908,451030
2020-12-31,ZIXI,0.553648,451030
2021-03-31,ZIXI,0.581397,451030
2021-06-30,ZIXI,0.679757,451030


In [102]:
industry_curr_ratio = no_duplicates
industry_curr_ratio['ind_curr_ratio_avg'] = industry_curr_ratio.groupby(['industry'])['currentRatio'].transform(lambda x: x.rolling(window=4).mean())
industry_curr_ratio['ind_curr_ratio_stdv'] = industry_curr_ratio.groupby(['industry'])['currentRatio'].transform(lambda x: x.rolling(window=4).std())
industry_curr_ratio

Unnamed: 0_level_0,ticker,currentRatio,industry,ind_curr_ratio_avg,ind_curr_ratio_stdv
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1991-09-30,AAME,7.927273,403010,,
1991-12-31,AAME,12.992126,403010,,
1992-03-31,AAME,4.223650,403010,,
1992-06-30,AAME,4.095960,403010,7.309752,4.184224
1992-09-30,AAME,4.300275,403010,6.403003,4.393557
...,...,...,...,...,...
2020-09-30,ZIXI,0.571908,451030,0.469328,0.079217
2020-12-31,ZIXI,0.553648,451030,0.513057,0.057963
2021-03-31,ZIXI,0.581397,451030,0.543434,0.052383
2021-06-30,ZIXI,0.679757,451030,0.596678,0.056571


In [103]:
ir_curr_ratio = industry_curr_ratio
ir_curr_ratio['ir_curr_ratio'] = (ir_curr_ratio['currentRatio'] - ir_curr_ratio['ind_curr_ratio_avg']) / ir_curr_ratio['ind_curr_ratio_stdv']
ir_curr_ratio = ir_curr_ratio.reset_index().sort_values(['ticker', 'date_index']).set_index('date_index')
ir_curr_ratio.drop(columns=['industry', 'ind_curr_ratio_avg', 'ind_curr_ratio_stdv'], inplace=True)
ir_curr_ratio.dropna(inplace=True)
ir_curr_ratio

Unnamed: 0_level_0,ticker,currentRatio,ir_curr_ratio
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1992-06-30,AAME,4.095960,-0.768074
1992-09-30,AAME,4.300275,-0.478593
1992-12-31,AAME,3.110357,-1.482579
1993-03-31,AAME,5.047091,1.138630
1993-06-30,AAME,4.627551,0.428089
...,...,...,...
2020-09-30,ZIXI,0.571908,1.294928
2020-12-31,ZIXI,0.553648,0.700286
2021-03-31,ZIXI,0.581397,0.724739
2021-06-30,ZIXI,0.679757,1.468592


In [104]:
def mask_risk(x):
    if abs(x) < 1:
        return -1
    else:
        return 1

In [105]:
risk_analysis = ir_curr_ratio
risk_analysis['curr_ratio_risk'] = risk_analysis['currentRatio'].apply(mask_risk)
risk_analysis

Unnamed: 0_level_0,ticker,currentRatio,ir_curr_ratio,curr_ratio_risk
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-06-30,AAME,4.095960,-0.768074,1
1992-09-30,AAME,4.300275,-0.478593,1
1992-12-31,AAME,3.110357,-1.482579,1
1993-03-31,AAME,5.047091,1.138630,1
1993-06-30,AAME,4.627551,0.428089,1
...,...,...,...,...
2020-09-30,ZIXI,0.571908,1.294928,-1
2020-12-31,ZIXI,0.553648,0.700286,-1
2021-03-31,ZIXI,0.581397,0.724739,-1
2021-06-30,ZIXI,0.679757,1.468592,-1


In [106]:
no_duplicates = risk_analysis.reset_index().set_index(['date_index', 'ticker'])
no_duplicates = no_duplicates.loc[~risk_analysis.index.duplicated(keep='first')].reset_index(level=1)
resampled_analysis = no_duplicates.groupby('ticker').resample('MS', convention='start').ffill().dropna().reset_index(level=0, drop=True)
resampled_analysis

Unnamed: 0_level_0,ticker,currentRatio,ir_curr_ratio,curr_ratio_risk
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-07-01,AAME,4.095960,-0.768074,1.0
1992-08-01,AAME,4.095960,-0.768074,1.0
1992-09-01,AAME,4.095960,-0.768074,1.0
1992-10-01,AAME,4.300275,-0.478593,1.0
1992-11-01,AAME,4.300275,-0.478593,1.0
...,...,...,...,...
2015-05-01,WLYB,0.669398,-0.997983,-1.0
2015-06-01,WLYB,0.669398,-0.997983,-1.0
2015-07-01,WLYB,0.669398,-0.997983,-1.0
2015-08-01,WLYB,0.669398,-0.997983,-1.0


In [109]:
fundamental_data_dir = get_data_dir() / 'fundamental'
current_ratio_factor = risk_analysis[['ticker', 'ir_curr_ratio', 'curr_ratio_risk']]
current_ratio_factor['curr_ratio_risk'] = current_ratio_factor['curr_ratio_risk'].astype(float)
current_ratio_factor.to_csv(fundamental_data_dir / 'curr_ratio_analysis.csv')
current_ratio_factor


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,ticker,ir_curr_ratio,curr_ratio_risk
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1992-06-30,AAME,-0.768074,1.0
1992-09-30,AAME,-0.478593,1.0
1992-12-31,AAME,-1.482579,1.0
1993-03-31,AAME,1.138630,1.0
1993-06-30,AAME,0.428089,1.0
...,...,...,...
2020-09-30,ZIXI,1.294928,-1.0
2020-12-31,ZIXI,0.700286,-1.0
2021-03-31,ZIXI,0.724739,-1.0
2021-06-30,ZIXI,1.468592,-1.0
