In [None]:
# O objetivo aqui vai ser arrumar os dados para backtest

In [5]:
import pandas as pd
import numpy as np
# https://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Working%20With%20Markdown%20Cells.html

In [None]:
# Algumas questões sobre markdown
# https://tex.stackexchange.com/questions/130510/write-text-correctly-in-equations #\mathrm and its friends
# https://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Working%20With%20Markdown%20Cells.html
# https://www.ibm.com/support/knowledgecenter/en/SSGNPV_1.1.3/dsx/markd-jupyter.html

The magic formula ranks companies based on two factors: return on capital and earnings yield.

#### Return on Capital
$\textrm{ROC} = \textrm{EBIT}/\textrm{(Net Working Capital + Net Fixed Assets)}$

$\begin{split}
\textrm{Net Fixed Assets} & = \textrm{Total Assets} \\
 & - \textrm{Total Current Assets} \\
 & - \textrm{Total Intangibles & Goodwill}
\end{split}$

#### Earnings yield
$\textrm{Earnings Yield} = \textrm{EBIT} / \textrm{Enterprise Value}$

$\begin{split}
\textrm{Enterprise Value} & = \textrm{Market Value of Equity} \\
 & + \textrm{Net Interest-Bearing Debt}
\end{split}$

In [6]:
def org_columns(df):
    '''
    Organiza os dataframes com fundamentos:
        Renomeia columns
        Seleciona apenas tickers de interesse
    '''
    tickers = df.columns.str.extract('(.*)$')[0]
    df.columns = tickers
    on = tickers[tickers.str.contains('.\D3$')]
    pn = tickers[tickers.str.contains('.\D4$')]
    sto_tick = pd.concat([on, pn], ignore_index=True)
    df = df[sto_tick]
    df = df[df.index.str.contains("Mar|Jun|Sep|Dec")]
    df.columns.name = "ticker"
    return df

In [11]:
# EBIT
ebit_ati = pd.read_excel('../../data/economatica_1/ebit_mensal__ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
ebit_can = pd.read_excel('../../data/economatica_1/ebit_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
ebit_can.drop(["Jan-86", "Feb-86"], inplace=True)

ebit = pd.concat([ebit_ati,ebit_can],axis=1,sort=False)
ebit = org_columns(ebit)

In [12]:
# Net Working Capital
nwc_ati = pd.read_excel('../../data/economatica_1/working_capital_mensal_ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
nwc_can = pd.read_excel('../../data/economatica_1/working_capital_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
nwc_can.drop(["Jan-86", "Feb-86"], inplace=True)

nwc = pd.concat([ebit_ati,ebit_can],axis=1,sort=False)
nwc = org_columns(nwc)

In [26]:
# Total Assets
ta_ati = pd.read_excel('../../data/economatica_1/total_assets_mensal_ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
ta_can = pd.read_excel('../../data/economatica_1/total_assets_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
ta_can.drop(["Jan-86", "Feb-86"], inplace=True)

ta = pd.concat([ta_ati,ta_can],axis=1,sort=False)
ta = org_columns(ta)

In [47]:
# Ativo Circulante
ac_ati = pd.read_excel('../../data/economatica_1/ativo_circulante_mensal_ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
ac_can = pd.read_excel('../../data/economatica_1/ativo_circulante_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
ac_can.drop(["Jan-86", "Feb-86"], inplace=True)

ac = pd.concat([ac_ati,ac_can],axis=1,sort=False)
ac = org_columns(ac)

In [54]:
# Passivo Circulante
pc_ati = pd.read_excel('../../data/economatica_1/passivo_circulante_mensal_ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
pc_can = pd.read_excel('../../data/economatica_1/passivo_circulante_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
pc_can.drop(["Jan-86", "Feb-86"], inplace=True)

pc = pd.concat([pc_ati,pc_can],axis=1,sort=False)
pc = org_columns(pc)

In [68]:
# Total Current Assets
tca = ac - pc

In [59]:
# Goodwill
gw_ati = pd.read_excel('../../data/economatica_1/goodwill_mensal_ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
gw_can = pd.read_excel('../../data/economatica_1/goodwill_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
# pc_can.drop(["Jan-86", "Feb-86"], inplace=True)

gw = pd.concat([gw_ati,gw_can],axis=1,sort=False)
gw = org_columns(gw)

In [70]:
# Net Fixed Assets
nfa = ta - tca - gw

In [72]:
# Return on Capital (ROC)
roc = ebit / (nwc + nfa)

In [76]:
# Market Value of Equity
mve_ati = pd.read_excel('../../data/economatica_1/valor_mercado_mensal_ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
mve_can = pd.read_excel('../../data/economatica_1/valor_mercado_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
mve_can.drop(["Jan-86", "Feb-86"], inplace=True)

mve = pd.concat([mve_ati,mve_can],axis=1,sort=False)
mve = org_columns(mve)

In [79]:
# Net Interest-Bearing Debt
nibd_ati = pd.read_excel('../../data/economatica_1/net_debt_mensal_ativas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
nibd_can = pd.read_excel('../../data/economatica_1/net_debt_mensal_canceladas.xlsx',
                            skiprows = [1,2],
                            header=1,
                            index_col = 0,
                            na_values='-')
nibd_can.drop(["Jan-86", "Feb-86"], inplace=True)

nibd = pd.concat([nibd_ati,nibd_can],axis=1,sort=False)
nibd = org_columns(nibd)

In [86]:
# Enterprise Level
ev = mve + nibd

In [88]:
# Earnings Yield
ey = ebit/ev

In [None]:
# Fazer com que todas linhas cuspam output
# https://stackoverflow.com/questions/31764006/ipython-notebook-display-every-line-output-without-print
# ebit.shape # número de linhas e colunas
# ebit.isna().sum().sum() número de NAs em um dataframe

In [50]:
ebit.shape
ebit.isna().sum().sum()

89129

In [51]:
nwc.shape
nwc.isna().sum().sum()

89129

In [52]:
ta.shape
ta.isna().sum().sum()

84743

In [53]:
ac.shape
ac.isna().sum().sum()

84884

In [56]:
pc.shape
pc.isna().sum().sum()

84883

In [61]:
gw.shape
gw.isna().sum().sum()

105986

In [74]:
roc.shape
roc.isna().sum().sum()

106108

In [78]:
mve.shape
mve.isna().sum().sum()

78722

In [81]:
nibd.shape
nibd.isna().sum().sum()

89304

In [84]:
ev.shape
ev.isna().sum().sum()

(135, 999)

In [62]:
x = ebit+nwc+ta+gw
x.isna().sum().sum()

106107

In [67]:
nrow, ncol = gw.shape
nrow*ncol

134865

In [89]:
ey

ticker,AALR3,ABCB3,ABCB4,ABEV3,ABYA3,ACES3,ACES4,ACGU3,ADHM3,AEDU3,...,WISA3,WISA4,WIZS3,WLMM3,WLMM4,WMBY3,WMBY4,WWOW3,YDUQ3,ZIVI4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mar-86,,,,,,,,,,,...,,,,,,,,,,
Jun-86,,,,,,,,,,,...,,,,,,,,,,
Sep-86,,,,,,,,,,,...,,,,,,,,,,
Dec-86,,,,0.346678,,,,,,,...,,,,0.253990,0.253990,,,,,
Mar-87,,,,,,,,,,,...,,,,,,,,,,
Jun-87,,,,,,,,,,,...,,,,,,,,,,
Sep-87,,,,,,,,,,,...,,,,,,,,,,
Dec-87,,,,-0.224333,,,,,,,...,,,,1.279382,1.279382,-0.003377,-0.003377,,,
Mar-88,,,,,,,,,,,...,,,,,,,,,,
Jun-88,,,,,,,,,,,...,,,,,,,,,,


In [3]:
tickers = tot_ass.columns.str.extract('(.*)$')[0]
tot_ass.columns = tickers
# https://www.debuggex.com/cheatsheet/regex/python
# https://www.dataquest.io/blog/regex-cheatsheet/
on = tickers[tickers.str.contains('.\D3$')]
pn = tickers[tickers.str.contains('.\D4$')]
sto_tick = pd.concat([on, pn], ignore_index=True)
tot_ass = tot_ass[sto_tick]
tot_ass = tot_ass[tot_ass.index.str.contains("Mar|Jun|Sep|Dec")]
tot_ass.columns.name = "ticker"

In [32]:
# remover colums e row with NA de um dataframe é bem fácil
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
stocks = tot_ass.loc["Dec-98":"Sep-19"]
stocks = stocks.dropna(axis="columns")

In [69]:
tca

ticker,EALT3,ADHM3,TIET3,AFLT3,BRGE3,CRIV3,RPAD3,BRIV3,ALSO3,APTI3,...,VIVO4,VTEC4,VULC4,WEGE4,WMBY4,WET4,WISA4,ILMD4,ESTC4,ZIVI4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mar-86,,,,,,,,,,,...,,,,,,,,,,
Jun-86,,,,,,,,,,,...,,,,,,,,,,
Sep-86,,,,,,,,,,,...,,,,,,,,,,
Dec-86,,,,,,,0.000050,,,,...,,-0.000048,,7.967200e-05,,,,,,0.000071
Mar-87,,,,,,,,,,,...,,,,,,,,,,
Jun-87,,,,,,,,,,,...,,,,2.256742e-04,,,,,,
Sep-87,,,,,,,,,,,...,,,,,,,,,,
Dec-87,,,,,,,0.001263,-1.352632e-01,,,...,,-0.000593,,4.804855e-04,0.000626,,,,,0.000473
Mar-88,,,,,,,,,,,...,,,,,,,,,,
Jun-88,,,,,,,,,,,...,,,,1.477176e-03,,,,,,


In [16]:
stocks

ticker,WEGE3,WEGE4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
Dec-91,5.522262e+01,5.522262e+01
Mar-92,1.030606e+02,1.030606e+02
Jun-92,1.844668e+02,1.844668e+02
Sep-92,3.521265e+02,3.521265e+02
Dec-92,6.858575e+02,6.858575e+02
Mar-93,1.488662e+03,1.488662e+03
Jun-93,3.234225e+03,3.234225e+03
Sep-93,7.472003e+03,7.472003e+03
Dec-93,1.942920e+04,1.942920e+04
Mar-94,5.741367e+04,5.741367e+04


In [218]:
# queremos remover os vizinhos de NAs coluna a coluna
# estou com o problema de contar o numero máximo de consecutive non NA
# https://stackoverflow.com/questions/41968892/counting-consecutive-numbers-in-a-list
# https://stackoverflow.com/questions/44337512/delete-non-consecutive-values-from-a-dataframe-column
# https://stackoverflow.com/questions/52671308/counting-the-number-of-consecutive-values-that-meets-a-condition-pandas-datafra
# http://blog.adeel.io/2016/10/30/removing-neighboring-consecutive-only-duplicates-in-a-pandas-dataframe/
# talvez a melhor maneira de lidar com um shift seja direto no df, sem passar por series
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.shift.html
# True & False # dá False
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.shift.html
sh_plus = tot_ass.shift(periods=1)
sh_minus = tot_ass.shift(periods=-1)
tot_ass[sh_plus.isnull()&sh_minus.isnull()] = np.NAN

In [222]:
tot_ass.to_excel("output.xlsx")

In [195]:
True & False

NameError: name 'T' is not defined

In [178]:
serie
s_plus = serie.shift()
s_minus = serie.shift(periods=-1)

# periods=1

In [183]:
s_plus.values == 2

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [45]:
# criar df com dictionary e um index e usar expressões lógicas
df1 = pd.DataFrame({'A': [12, 44, 17, 1],
                    'B': [4, 8, 100, -3],
                    'C': [47, 68, 52, 1],
                    'D': [0, 0, 0, 0]},
                    index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': [1, 2, 3, 4],
                    'B': [-10, 4, 5, 13],
                    'C': [12, 23, 10, 100],
                    'D': [1, 1, 1, 1]},
                    index=[0, 1, 2, 3])
df3 = pd.DataFrame({'A': [1, 2, 3, 4],
                    'B': [-10, np.nan, 5, 13],
                    'C': [np.nan, 23, 10, 100],
                    'D': [1, 1, 1, 1]},
                    index=[0, 1, 2, 3])

In [46]:
x = df1+df3
x.isna().sum().sum()

2

In [212]:
df3.isnull()

Unnamed: 0,A,B,C,D
0,True,False,False,False
1,False,False,False,False
2,False,False,True,False
3,False,False,False,False


In [203]:
df1&df2

Unnamed: 0,A,B,C,D
0,True,False,False,False
1,False,False,False,False
2,False,False,True,False
3,False,False,False,False


In [200]:
df1|df2

Unnamed: 0,A,B,C,D
0,True,True,True,False
1,False,True,True,False
2,True,False,True,True
3,False,True,True,False


In [208]:
ttt = np.NAN