In [1]:
# Load the python-dotenv IPython extension and read the .env file, so that the
# variables it defines can be read with os.getenv in the following cells.
%load_ext dotenv
%dotenv

In [2]:
import os

# Path of the CSV file to transform, taken from the process environment.
INPUT_CSV_TRANSFORM = os.getenv("INPUT_CSV_TRANSFORM")

# Fail early with a clear message when the variable is not defined.
# `is not None` is the identity check for the None singleton; `!= None` goes
# through `__ne__` and is discouraged by PEP 8.
assert INPUT_CSV_TRANSFORM is not None, "Missing INPUT_CSV_TRANSFORM"

## Reading data

In [3]:
import numpy as np
import pandas as pd

import transform_investing_pandas as ti

from common.constants import BALANCE_ARRAY_COLS, INCOME_ARRAY_COLS

# Enable autoreload in mode 1: only the modules registered with %aimport are
# reloaded before each cell execution, so edits to the project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport common, transform_investing_pandas

In [4]:
# Load the raw extraction results from the CSV path configured in
# INPUT_CSV_TRANSFORM. As the preview shows, the data is in long format:
# one row per stock_id / source_id / page_name / extraction_time / metric,
# with the scraped text in `value`.
raw_df = pd.read_csv(INPUT_CSV_TRANSFORM)
raw_df.head()

Unnamed: 0,stock_id,source_id,page_name,extraction_time,metric,value
0,CIB:NYSE,investing,general,2022-10-14T18:52:58.033317,currency,USD
1,CIB:NYSE,investing,general,2022-10-14T18:52:58.033317,price,25.09
2,CIB:NYSE,investing,general,2022-10-14T18:52:58.033317,shares,961827000
3,AAPL:NASDAQ,investing,balance-sheet,2022-10-14T18:52:58.374804,date_years,2022202220212021
4,AAPL:NASDAQ,investing,general,2022-10-14T18:52:58.633432,currency,USD


In [5]:
# Inspect the rows coming from the 'google' source. In the displayed output
# they all belong to the `quote` page and carry `price` and `pe_ratio` metrics.
raw_df.query('source_id == "google"')

Unnamed: 0,stock_id,source_id,page_name,extraction_time,metric,value
84,CIB:NYSE,google,quote,2022-10-14T18:53:01.145837,price,25.0
85,CIB:NYSE,google,quote,2022-10-14T18:53:01.145837,pe_ratio,4.66
86,AAPL:NASDAQ,google,quote,2022-10-14T18:53:01.299287,price,138.69
87,AAPL:NASDAQ,google,quote,2022-10-14T18:53:01.299287,pe_ratio,22.91
88,EC:NYSE,google,quote,2022-10-14T18:53:01.808976,price,9.2
89,EC:NYSE,google,quote,2022-10-14T18:53:01.808976,pe_ratio,3.21


In [6]:
# Keep only the rows whose source is 'investing', drop the now-constant
# `source_id` column, and key the frame by (stock_id, page_name,
# extraction_time), sorted by that index.
investing_df = (
    raw_df
    .query("source_id == 'investing'")
    .drop("source_id", axis="columns")
    .set_index(keys=["stock_id", "page_name", "extraction_time"])
    .sort_index()
)
investing_df.head(n=15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,metric,value
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,date_years,2022202220212021
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,date_days_months,"25/06,26/03,25/12,25/09"
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,equity,58107673997193263090
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,money_units,* In Millions of USD (except for per share items)
AAPL:NASDAQ,general,2022-10-14T18:52:58.633432,currency,USD
AAPL:NASDAQ,general,2022-10-14T18:52:58.633432,price,138.52
AAPL:NASDAQ,general,2022-10-14T18:52:58.633432,shares,16070752000
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,date_years,2022202220212021
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,date_days_months,"25/06,26/03,25/12,25/09"
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,reveneus,829599727812394583360


In [7]:
# List the distinct page names present for the 'investing' source; each page
# is wrangled separately in the sections that follow.
investing_df.reset_index()['page_name'].unique()

array(['balance-sheet', 'general', 'income-statement'], dtype=object)

## Wrangling pages

### Page: `general`

In [8]:
# Select the `general` page and pivot it so that each metric becomes a column
# (currency, price and shares in the displayed output), with one row per
# stock_id / page_name / extraction_time.
general_df = ti.get_pivoted_page_df(investing_df, "general")
general_df

Unnamed: 0_level_0,Unnamed: 1_level_0,metric,currency,price,shares
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAPL:NASDAQ,general,2022-10-14T18:52:58.633432,USD,138.52,16070752000
BCOLOMBIA:BVC,general,2022-10-14T18:52:59.830715,COP,34200.0,961827000
CIB:NYSE,general,2022-10-14T18:52:58.033317,USD,25.09,961827000
EC:NYSE,general,2022-10-14T18:52:58.861408,USD,9.22,41116694690
ECOPETROL:BVC,general,2022-10-14T18:52:59.144838,COP,2163.0,41116694690
PFBCOLOM:BVC,general,2022-10-14T18:53:00.307557,COP,28560.0,961827000


In [9]:
# Apply the module's transformation for the general page. The displayed values
# are the same as before this step, so the change is presumably to the column
# dtypes — TODO confirm in transform_investing_pandas.
# NOTE(review): the result is assigned back to `general_df`, so re-running this
# cell passes the already-transformed frame to the function again.
general_df = ti.transform_general_page(general_df)
general_df

Unnamed: 0_level_0,Unnamed: 1_level_0,metric,currency,price,shares
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAPL:NASDAQ,general,2022-10-14T18:52:58.633432,USD,138.52,16070752000
BCOLOMBIA:BVC,general,2022-10-14T18:52:59.830715,COP,34200.0,961827000
CIB:NYSE,general,2022-10-14T18:52:58.033317,USD,25.09,961827000
EC:NYSE,general,2022-10-14T18:52:58.861408,USD,9.22,41116694690
ECOPETROL:BVC,general,2022-10-14T18:52:59.144838,COP,2163.0,41116694690
PFBCOLOM:BVC,general,2022-10-14T18:53:00.307557,COP,28560.0,961827000


### Page: `balance-sheet`

In [10]:
# Select the `balance-sheet` page and pivot its metrics into columns. At this
# stage each cell still holds the raw scraped text for all reporting periods
# at once (e.g. date_days_months is "25/06,26/03,25/12,25/09").
balance_df = ti.get_pivoted_page_df(investing_df, "balance-sheet")
balance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,metric,date_days_months,date_years,equity,money_units
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,"25/06,26/03,25/12,25/09",2022202220212021,58107673997193263090,* In Millions of USD (except for per share items)
BCOLOMBIA:BVC,balance-sheet,2022-10-14T18:53:00.532916,"30/06,31/03,31/12,30/09",2022202220212021,35099020302002623223434730261139,* In Millions of COP (except for per share items)
CIB:NYSE,balance-sheet,2022-10-14T18:52:58.783524,"30/06,31/03,31/12,30/09",2022202220212021,35099020302002623223434730261139,* In Millions of COP (except for per share items)
EC:NYSE,balance-sheet,2022-10-14T18:52:59.422797,"30/06,31/03,31/12,30/09",2022202220212021,"-,89144201,71733024,62994784",* In Millions of (except for per share items)
ECOPETROL:BVC,balance-sheet,2022-10-14T18:52:59.614770,"30/06,31/03,31/12,30/09",2022202220212021,"-,89144201,71733024,62994784",* In Millions of COP (except for per share items)
PFBCOLOM:BVC,balance-sheet,2022-10-14T18:53:00.623502,"30/06,31/03,31/12,30/09",2022202220212021,35099020302002623223434730261139,* In Millions of COP (except for per share items)


In [11]:
# Parse the free-text `money_units` column: in the displayed output it is
# replaced by separate `factor` (e.g. Millions) and `currency` (e.g. USD) columns.
# EC:NYSE ends up with both values empty; its money_units text
# ("* In Millions of (except ...") carries no currency code.
# NOTE(review): the result is assigned back to `balance_df`, so re-running this
# cell feeds the already-transformed frame to the function again.
balance_df = ti.transform_money_units(balance_df)
balance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_days_months,date_years,equity,factor,currency
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,"25/06,26/03,25/12,25/09",2022202220212021,58107673997193263090,Millions,USD
BCOLOMBIA:BVC,balance-sheet,2022-10-14T18:53:00.532916,"30/06,31/03,31/12,30/09",2022202220212021,35099020302002623223434730261139,Millions,COP
CIB:NYSE,balance-sheet,2022-10-14T18:52:58.783524,"30/06,31/03,31/12,30/09",2022202220212021,35099020302002623223434730261139,Millions,COP
EC:NYSE,balance-sheet,2022-10-14T18:52:59.422797,"30/06,31/03,31/12,30/09",2022202220212021,"-,89144201,71733024,62994784",,
ECOPETROL:BVC,balance-sheet,2022-10-14T18:52:59.614770,"30/06,31/03,31/12,30/09",2022202220212021,"-,89144201,71733024,62994784",Millions,COP
PFBCOLOM:BVC,balance-sheet,2022-10-14T18:53:00.623502,"30/06,31/03,31/12,30/09",2022202220212021,35099020302002623223434730261139,Millions,COP


In [12]:
# Expand the multi-period columns into one row per reporting period: in the
# displayed output each stock now has one row per date with a single numeric
# `equity` value. BALANCE_ARRAY_COLS presumably names the columns to expand —
# TODO confirm in common.constants.
# NOTE(review): the result is assigned back to `balance_df`, so re-running this
# cell feeds the already-expanded frame to the function again.
balance_df = ti.transform_from_arrays(balance_df, BALANCE_ARRAY_COLS)
balance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_years,date_days_months,equity,factor,currency
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,2022,25/06,58107.0,Millions,USD
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,2022,26/03,67399.0,Millions,USD
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,2021,25/12,71932.0,Millions,USD
AAPL:NASDAQ,balance-sheet,2022-10-14T18:52:58.374804,2021,25/09,63090.0,Millions,USD
BCOLOMBIA:BVC,balance-sheet,2022-10-14T18:53:00.532916,2022,30/06,35099020.0,Millions,COP
BCOLOMBIA:BVC,balance-sheet,2022-10-14T18:53:00.532916,2022,31/03,30200262.0,Millions,COP
BCOLOMBIA:BVC,balance-sheet,2022-10-14T18:53:00.532916,2021,31/12,32234347.0,Millions,COP
BCOLOMBIA:BVC,balance-sheet,2022-10-14T18:53:00.532916,2021,30/09,30261139.0,Millions,COP
CIB:NYSE,balance-sheet,2022-10-14T18:52:58.783524,2022,30/06,35099020.0,Millions,COP
CIB:NYSE,balance-sheet,2022-10-14T18:52:58.783524,2022,31/03,30200262.0,Millions,COP


### Page: `income-statement`

In [13]:
# Select the `income-statement` page and pivot its metrics into columns
# (gross_profits, net_incomes, operating_incomes, reveneus, plus the date and
# money_units columns). Some stocks have no value at all for some metrics,
# e.g. BCOLOMBIA:BVC only reports net_incomes in the displayed output.
income_df = ti.get_pivoted_page_df(investing_df, "income-statement")
income_df

Unnamed: 0_level_0,Unnamed: 1_level_0,metric,date_days_months,date_years,gross_profits,money_units,net_incomes,operating_incomes,reveneus
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,"25/06,26/03,25/12,25/09",2022202220212021,35885425595424335174,* In Millions of USD (except for per share items),19442250103463020551,23076299794148823786,829599727812394583360
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,"30/06,31/03,31/12,30/09",2022202220212021,,* In Millions of COP (except for per share items),177969517318581444744942620,,
CIB:NYSE,income-statement,2022-10-14T18:52:59.489989,"30/06,31/03,31/12,30/09",2022202220212021,"4911324,-,-,-",* In Millions of COP (except for per share items),177969517318581444744942620,"2571281,-,-,-","5088064,-,-,-"
EC:NYSE,income-statement,2022-10-14T18:52:59.554673,"30/06,31/03,31/12,30/09",2022202220212021,"-,14736268,12628781,8826762",* In Millions of (except for per share items),"-,6572640,6077214,3807248","-,12664307,10388494,7102309","-,32472744,31761115,23332762"
ECOPETROL:BVC,income-statement,2022-10-14T18:53:00.068412,"30/06,31/03,31/12,30/09",2022202220212021,"-,14736268,12628781,8826762",* In Millions of COP (except for per share items),"-,6572640,6077214,3807248","-,12664307,10388494,7102309","-,32472744,31761115,23332762"
PFBCOLOM:BVC,income-statement,2022-10-14T18:53:00.462319,"30/06,31/03,31/12,30/09",2022202220212021,,* In Millions of COP (except for per share items),177969517318581444744942620,,


In [14]:
# Parse the free-text `money_units` column: in the displayed output it is
# replaced by separate `factor` and `currency` columns. As with the balance
# sheet, EC:NYSE gets empty values because its text has no currency code.
# NOTE(review): the result is assigned back to `income_df`, so re-running this
# cell feeds the already-transformed frame to the function again.
income_df = ti.transform_money_units(income_df)
income_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_days_months,date_years,gross_profits,net_incomes,operating_incomes,reveneus,factor,currency
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,"25/06,26/03,25/12,25/09",2022202220212021,35885425595424335174,19442250103463020551,23076299794148823786,829599727812394583360,Millions,USD
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,"30/06,31/03,31/12,30/09",2022202220212021,,177969517318581444744942620,,,Millions,COP
CIB:NYSE,income-statement,2022-10-14T18:52:59.489989,"30/06,31/03,31/12,30/09",2022202220212021,"4911324,-,-,-",177969517318581444744942620,"2571281,-,-,-","5088064,-,-,-",Millions,COP
EC:NYSE,income-statement,2022-10-14T18:52:59.554673,"30/06,31/03,31/12,30/09",2022202220212021,"-,14736268,12628781,8826762","-,6572640,6077214,3807248","-,12664307,10388494,7102309","-,32472744,31761115,23332762",,
ECOPETROL:BVC,income-statement,2022-10-14T18:53:00.068412,"30/06,31/03,31/12,30/09",2022202220212021,"-,14736268,12628781,8826762","-,6572640,6077214,3807248","-,12664307,10388494,7102309","-,32472744,31761115,23332762",Millions,COP
PFBCOLOM:BVC,income-statement,2022-10-14T18:53:00.462319,"30/06,31/03,31/12,30/09",2022202220212021,,177969517318581444744942620,,,Millions,COP


In [15]:
# Expand the multi-period columns into one row per reporting period; entries
# shown as "-" in the raw text appear as missing values in the displayed output.
# INCOME_ARRAY_COLS presumably names the columns to expand — TODO confirm in
# common.constants.
# NOTE(review): the result is assigned back to `income_df`, so re-running this
# cell feeds the already-expanded frame to the function again.
income_df = ti.transform_from_arrays(income_df, INCOME_ARRAY_COLS)
income_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_years,date_days_months,gross_profits,net_incomes,operating_incomes,reveneus,factor,currency
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,2022,25/06,35885.0,19442.0,23076.0,82959.0,Millions,USD
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,2022,26/03,42559.0,25010.0,29979.0,97278.0,Millions,USD
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,2021,25/12,54243.0,34630.0,41488.0,123945.0,Millions,USD
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,2021,25/09,35174.0,20551.0,23786.0,83360.0,Millions,USD
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,2022,30/06,,1779695.0,,,Millions,COP
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,2022,31/03,,1731858.0,,,Millions,COP
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,2021,31/12,,1444744.0,,,Millions,COP
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,2021,30/09,,942620.0,,,Millions,COP
CIB:NYSE,income-statement,2022-10-14T18:52:59.489989,2022,30/06,4911324.0,1779695.0,2571281.0,5088064.0,Millions,COP
CIB:NYSE,income-statement,2022-10-14T18:52:59.489989,2022,31/03,,1731858.0,,,Millions,COP


In [16]:
# Combine the date parts: in the displayed output `date_years` and
# `date_days_months` are replaced by a single `date` column whose values look
# like "2022/25/06" (year/day/month order).
# NOTE(review): in the cells visible here, `balance_df` is not passed through
# transform_date — confirm whether that is intentional.
income_df = ti.transform_date(income_df)
income_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gross_profits,net_incomes,operating_incomes,reveneus,factor,currency,date
stock_id,page_name,extraction_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,35885.0,19442.0,23076.0,82959.0,Millions,USD,2022/25/06
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,42559.0,25010.0,29979.0,97278.0,Millions,USD,2022/26/03
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,54243.0,34630.0,41488.0,123945.0,Millions,USD,2021/25/12
AAPL:NASDAQ,income-statement,2022-10-14T18:52:59.347448,35174.0,20551.0,23786.0,83360.0,Millions,USD,2021/25/09
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,,1779695.0,,,Millions,COP,2022/30/06
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,,1731858.0,,,Millions,COP,2022/31/03
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,,1444744.0,,,Millions,COP,2021/31/12
BCOLOMBIA:BVC,income-statement,2022-10-14T18:53:00.691773,,942620.0,,,Millions,COP,2021/30/09
CIB:NYSE,income-statement,2022-10-14T18:52:59.489989,4911324.0,1779695.0,2571281.0,5088064.0,Millions,COP,2022/30/06
CIB:NYSE,income-statement,2022-10-14T18:52:59.489989,,1731858.0,,,Millions,COP,2022/31/03


In [17]:
# Quick size check of the final income frame: 6 stocks x 4 reporting periods
# gives the 24 rows; the 7 columns are the ones displayed above (index levels
# are not counted).
income_df.shape

(24, 7)