In [68]:
import pandas as pd
import numpy as np
import polars as pl

from factorlib.utils.system import get_data_dir

In [69]:
raw_data_dir = get_data_dir() / 'raw'
raw_data = pd.read_csv(raw_data_dir / 'balance_sheet_statements.csv', index_col=0)
raw_data

Unnamed: 0,date,symbol,reportedCurrency,cik,fillingDate,acceptedDate,calendarYear,period,cashAndCashEquivalents,shortTermInvestments,...,totalStockholdersEquity,totalEquity,totalLiabilitiesAndStockholdersEquity,minorityInterest,totalLiabilitiesAndTotalEquity,totalInvestments,totalDebt,netDebt,link,finalLink
0,2023-04-30,A,USD,1090872.0,2023-05-26,2023-05-26 17:28:42,2023,Q2,1.175000e+09,0.0,...,5.781000e+09,5.781000e+09,1.079200e+10,0.0,1.079200e+10,186000000.0,2.733000e+09,1.558000e+09,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
1,2023-01-31,A,USD,1090872.0,2023-03-03,2023-03-02 18:52:33,2023,Q1,1.250000e+09,0.0,...,5.609000e+09,5.609000e+09,1.091900e+10,0.0,1.091900e+10,195000000.0,2.971000e+09,1.721000e+09,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
2,2022-10-31,A,USD,1090872.0,2022-12-21,2022-12-20 18:42:30,2022,Q4,1.053000e+09,0.0,...,5.289000e+09,5.289000e+09,1.051600e+10,0.0,1.051600e+10,195000000.0,2.769000e+09,1.716000e+09,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
3,2022-07-31,A,USD,1090872.0,2022-09-01,2022-08-31 18:53:26,2022,Q3,1.071000e+09,6000000.0,...,5.091000e+09,5.091000e+09,1.048400e+10,0.0,1.048400e+10,200000000.0,2.912000e+09,1.841000e+09,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
4,2022-04-30,A,USD,1090872.0,2022-05-31,2022-05-27 19:33:57,2022,Q2,1.186000e+09,21000000.0,...,5.122000e+09,5.122000e+09,1.045500e+10,0.0,1.045500e+10,211000000.0,2.905000e+09,1.719000e+09,https://www.sec.gov/Archives/edgar/data/109087...,https://www.sec.gov/Archives/edgar/data/109087...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,2007-03-31,ZZLL,USD,1365357.0,2007-06-29,2007-06-28 18:08:07,2007,Q1,7.704310e+05,0.0,...,4.101440e+05,4.101440e+05,3.685415e+06,0.0,3.685415e+06,0.0,0.000000e+00,-7.704310e+05,https://www.sec.gov/Archives/edgar/data/136535...,https://www.sec.gov/Archives/edgar/data/136535...
62,2006-12-31,ZZLL,USD,1365357.0,2006-12-31,2006-12-31 00:00:00,2006,Q4,1.099092e+06,0.0,...,4.168860e+05,4.168860e+05,3.306484e+06,0.0,3.306484e+06,0.0,5.615390e+05,-5.375530e+05,,
63,2006-09-30,ZZLL,USD,1365357.0,2006-09-30,2006-09-30 00:00:00,2006,Q3,6.326700e+05,0.0,...,5.204820e+05,5.204820e+05,3.170866e+06,0.0,3.170866e+06,0.0,5.615380e+05,-7.113200e+04,,
64,2006-06-30,ZZLL,USD,1365357.0,2006-06-30,2006-06-30 00:00:00,2006,Q2,1.710810e+05,0.0,...,6.653510e+05,6.653510e+05,2.559134e+06,0.0,2.559134e+06,0.0,0.000000e+00,-1.710810e+05,,


In [70]:
raw_data_dir = get_data_dir() / 'raw'
raw_data_tickers_to_gvkey = pd.read_csv(raw_data_dir / 'tickers.csv')
raw_data_tickers_to_gvkey

Unnamed: 0,gvkey,co_tic
0,1078,ABT
1,1300,HON
2,1440,AEP
3,2285,BA
4,2403,BMY
...,...,...
13844,14282,APH
13845,14282,APH
13846,3532,GLW
13847,14282,APH


In [71]:
# Create column in balance sheet statement that has gvkey
df1 = raw_data
df2 = raw_data_tickers_to_gvkey
df2.rename(columns={'co_tic': 'symbol'}, inplace=True)
df1 = df1.merge(df2[['symbol', 'gvkey']], on='symbol', how='left')

# Ensure 'date' is in datetime format
df1['date'] = pd.to_datetime(df1['date'])

#rename date index to date_index standard
df1.rename(columns={'date': 'date_index'}, inplace=True)

# Ensure 'gvkey' is of type str for correct panel data operations
df1['gvkey'] = df1['gvkey'].astype(str)

#make gvkey and date_index into index to remove nulls
df1.set_index(['date_index', 'gvkey'], inplace=True)

df1.sort_index(level=0, inplace=True)
df1


Unnamed: 0_level_0,Unnamed: 1_level_0,symbol,reportedCurrency,cik,fillingDate,acceptedDate,calendarYear,period,cashAndCashEquivalents,shortTermInvestments,cashAndShortTermInvestments,...,totalStockholdersEquity,totalEquity,totalLiabilitiesAndStockholdersEquity,minorityInterest,totalLiabilitiesAndTotalEquity,totalInvestments,totalDebt,netDebt,link,finalLink
date_index,gvkey,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1978-03-24,,LAZY,USD,1721741.0,1978-03-24,1978-03-23 19:00:00,1978,Q1,1.027141e+07,0.000000e+00,1.027141e+07,...,9.256057e+06,9.256057e+06,2.836074e+07,0.0,2.836074e+07,0.000000e+00,1.392539e+07,3.653977e+06,,
1985-08-31,2710.0,STZ,USD,16918.0,1985-08-31,1985-08-31 00:00:00,1986,Q2,9.100000e+06,0.000000e+00,9.100000e+06,...,4.790000e+07,4.790000e+07,7.910000e+07,0.0,7.910000e+07,0.000000e+00,7.700000e+06,-1.400000e+06,,
1985-08-31,2710.0,STZ,USD,16918.0,1985-08-31,1985-08-31 00:00:00,1986,Q2,9.100000e+06,0.000000e+00,9.100000e+06,...,4.790000e+07,4.790000e+07,7.910000e+07,0.0,7.910000e+07,0.000000e+00,7.700000e+06,-1.400000e+06,,
1985-08-31,2710.0,STZ,USD,16918.0,1985-08-31,1985-08-31 00:00:00,1986,Q2,9.100000e+06,0.000000e+00,9.100000e+06,...,4.790000e+07,4.790000e+07,7.910000e+07,0.0,7.910000e+07,0.000000e+00,7.700000e+06,-1.400000e+06,,
1985-08-31,2710.0,STZ,USD,16918.0,1985-08-31,1985-08-31 00:00:00,1986,Q2,9.100000e+06,0.000000e+00,9.100000e+06,...,4.790000e+07,4.790000e+07,7.910000e+07,0.0,7.910000e+07,0.000000e+00,7.700000e+06,-1.400000e+06,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-31,29028.0,COST,USD,909832.0,2023-05-31,2023-05-31 00:00:00,2023,Q3,1.249300e+10,1.215000e+09,1.370800e+10,...,2.356800e+10,2.356800e+10,6.675200e+10,5000000.0,6.675200e+10,1.215000e+09,9.004000e+09,-3.489000e+09,,
2023-05-31,29028.0,COST,USD,909832.0,2023-05-31,2023-05-31 00:00:00,2023,Q3,1.249300e+10,1.215000e+09,1.370800e+10,...,2.356800e+10,2.356800e+10,6.675200e+10,5000000.0,6.675200e+10,1.215000e+09,9.004000e+09,-3.489000e+09,,
2023-05-31,29028.0,COST,USD,909832.0,2023-05-31,2023-05-31 00:00:00,2023,Q3,1.249300e+10,1.215000e+09,1.370800e+10,...,2.356800e+10,2.356800e+10,6.675200e+10,5000000.0,6.675200e+10,1.215000e+09,9.004000e+09,-3.489000e+09,,
2023-05-31,29028.0,COST,USD,909832.0,2023-05-31,2023-05-31 00:00:00,2023,Q3,1.249300e+10,1.215000e+09,1.370800e+10,...,2.356800e+10,2.356800e+10,6.675200e+10,5000000.0,6.675200e+10,1.215000e+09,9.004000e+09,-3.489000e+09,,


In [72]:
#removing unnecesary columns
df1 = df1[['totalAssets', 'symbol']]
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,totalAssets,symbol
date_index,gvkey,Unnamed: 2_level_1,Unnamed: 3_level_1
1978-03-24,,2.836074e+07,LAZY
1985-08-31,2710.0,7.910000e+07,STZ
1985-08-31,2710.0,7.910000e+07,STZ
1985-08-31,2710.0,7.910000e+07,STZ
1985-08-31,2710.0,7.910000e+07,STZ
...,...,...,...
2023-05-31,29028.0,6.675200e+10,COST
2023-05-31,29028.0,6.675200e+10,COST
2023-05-31,29028.0,6.675200e+10,COST
2023-05-31,29028.0,6.675200e+10,COST


In [73]:
#renaming totalAssets and symbols so they're in standard 'at' and 'ticker' variable names
df1.rename(columns={'totalAssets' : 'at',
                    'symbol' : 'ticker'}, inplace=True)

#creating column with factor
df1['AssetGrowth'] = (df1['at'] - df1['at'].groupby(level=1).shift(12)) / df1['at'].groupby(level=1).shift(12)

df1



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Unnamed: 1_level_0,at,ticker,AssetGrowth
date_index,gvkey,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1978-03-24,,2.836074e+07,LAZY,
1985-08-31,2710.0,7.910000e+07,STZ,
1985-08-31,2710.0,7.910000e+07,STZ,
1985-08-31,2710.0,7.910000e+07,STZ,
1985-08-31,2710.0,7.910000e+07,STZ,
...,...,...,...,...
2023-05-31,29028.0,6.675200e+10,COST,0.0
2023-05-31,29028.0,6.675200e+10,COST,0.0
2023-05-31,29028.0,6.675200e+10,COST,0.0
2023-05-31,29028.0,6.675200e+10,COST,0.0
