In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import time

In [2]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
import fundamentos as ftos
from pandas_datareader import data
import plotly.express as px
import yfinance as yf

In [5]:
# read

In [6]:
multiple= pd.read_csv("dateset_fundamentalist.csv", sep=";", encoding = 'latin_1')
multiple_14= pd.read_csv("dateset_fundamentalist_2014.csv", sep=";", encoding = 'latin_1')
prediction= pd.read_csv("prediction.csv", sep=";", encoding = 'latin_1')

In [7]:
date_quote= pd.read_csv("date_quote_complete.csv", sep=";", encoding = 'latin_1')

In [8]:
multiple =  pd.concat([multiple, multiple_14])

# EPS

In [9]:
date_quote['date_off']=date_quote['date']

In [10]:
date_quote['date_month']=date_quote['date'].str[3:5]
date_quote['date_year']=date_quote['date'].str[6:10]
date_quote['date']=date_quote['date'].str[6:10]+date_quote['date'].str[3:5]+date_quote['date'].str[0:2]

In [13]:
date_quote_clean=date_quote[['date', 'company' , 'date_month', 'date_year', 'close' ]]

In [14]:
date_quote_clean = date_quote_clean.reset_index(drop=True)

In [15]:
date_quote_clean['date_month']=date_quote_clean['date_month'].astype(int)
date_quote_clean['date_year']=date_quote_clean['date_year'].astype(int)

In [16]:
date_quote_clean['date']=date_quote_clean['date'].astype(int)

In [17]:
date_quote_clean=date_quote_clean.sort_values(by=['company', 'date'], ascending=False)

In [18]:
date_quote_clean_1=date_quote_clean.drop_duplicates(subset=["company", 'date_month','date_year'], keep = 'first').reset_index()

In [19]:
date_quote_clean_1.head()

Unnamed: 0,index,date,company,date_month,date_year,close
0,8325,20210914,YDUQ3,9,2021,24.86
1,8316,20210831,YDUQ3,8,2021,25.59
2,8294,20210730,YDUQ3,7,2021,28.26
3,8273,20210630,YDUQ3,6,2021,32.84
4,8252,20210531,YDUQ3,5,2021,32.9


# create contributions

In [20]:
def create_contributions(df):
    df['contributions']=0
    contributions=0
    for year in range(2014,2022):
        for month in range(3, 13, 3):
            contributions=contributions+1
            # Condition
            conditions = [
                (df['date_year'] == year) & (df['date_month'] == month)]
            choices = [contributions]
            df['contributions'] = np.select(conditions, choices, default=df['contributions'])
    return df

In [21]:
date_quote_clean.head()

Unnamed: 0,date,company,date_month,date_year,close
8325,20210914,YDUQ3,9,2021,24.86
8324,20210913,YDUQ3,9,2021,25.12
8323,20210910,YDUQ3,9,2021,23.26
8322,20210909,YDUQ3,9,2021,23.59
8321,20210908,YDUQ3,9,2021,23.1


In [22]:
date_quote_clean_filter=date_quote_clean.sort_values(by='date', ascending=False)

In [23]:
date_quote_clean_filter=date_quote_clean_filter.drop_duplicates(subset=['company', 'date_month', 'date_year'], keep='first')

In [24]:
date_quote_clean_filter=date_quote_clean_filter[['company', 'date_month', 'date_year', 'close']]

In [25]:
multiple_price=pd.merge(multiple,date_quote_clean_filter,how='left',left_on=['company','date_year','date_month'], 
                        right_on=['company','date_year','date_month'])

In [26]:
multiple_price['eps']=multiple_price['eps'].astype('float')

In [27]:
multiple_price_2=multiple_price[multiple_price['close'].notna()].reset_index(drop=True)

In [28]:
multiple_price_2['pe']=multiple_price_2['close']/multiple_price_2['eps']
multiple_price_2['pe'] = multiple_price_2['eps'].apply(lambda x: 0 if x == 0 else x )

In [29]:
# delete duplicated companies (eg CGRA3 and CGRA4)
multiple_price_2=multiple_price_2.sort_values(by=['company'])
multiple_price_2=multiple_price_2.drop_duplicates(subset=['name', 'date', ], keep='first')

In [30]:
# add prediction and target 

In [31]:
prediction.head()

Unnamed: 0,date,open,high,low,close,adj close,volume,company,date_month,date_year,date_2,contributions_old,contributions,date_fim,close_fim,days,prediction,real_profit_loss
0,01/12/2016,15.4,15.75,14.78,14.92,14.77,541600,AALR3,12,2016,20161201,12,12,01/03/2017,12.89,90,11.47,-0.14
1,01/03/2017,12.87,12.92,12.74,12.89,12.76,61600,AALR3,3,2017,20170301,13,13,01/06/2017,17.95,92,9.03,0.39
2,01/06/2017,17.83,17.95,17.44,17.95,17.77,140500,AALR3,6,2017,20170601,14,14,01/09/2017,16.33,92,23.94,-0.09
3,01/09/2017,16.58,16.62,16.33,16.33,16.17,142200,AALR3,9,2017,20170901,15,15,01/12/2017,14.75,91,14.36,-0.1
4,01/12/2017,14.87,15.14,14.75,14.75,14.6,141600,AALR3,12,2017,20171201,16,16,01/03/2018,15.4,90,15.14,0.04


In [34]:
prediction_clean=prediction[['company', 'date_year', 'contributions', 'date_month', 'volume', 'date_fim', 'close_fim', 'prediction', 'real_profit_loss' ]]

In [40]:
multiple_price_target=pd.merge(multiple_price_2, prediction_clean, how='left', 
                               left_on = ['company','date_year','date_month'], 
                               right_on=['company','date_year','date_month'])

In [41]:
multiple_price_target=multiple_price_target[ ~(multiple_price_target['real_profit_loss'].isna()) ].reset_index(drop=True)

## analysis 

In [42]:
# fill na with zero

In [43]:
multiple_price_target['real_profit_loss']=multiple_price_target['real_profit_loss'].astype('float')
multiple_price_target['growth_profit']=multiple_price_target['growth_profit'].astype('float')
multiple_price_target['cl']=multiple_price_target['cl'].astype('float')
multiple_price_target['d_e']=multiple_price_target['d_e'].astype('float')

In [44]:
multiple_price_target.head(2)

Unnamed: 0,company,date,date_month,date_year,code2,setor,subsetor,name,total_assets,current_assets,total_liabilities,current_liabilities,code,freq,shares,net_revenue,net_profit,growth_profit,net_equity,net_revenue_12,net_profit_12m,assets,cl,ebit,roe,net_margin,eps,bvps,d_e,gross_debt,availabilities,net_debt,stock,credit_portfolio,deposits,service revenue,close,pe,contributions,volume,date_fim,close_fim,prediction,real_profit_loss
0,AALR3,31/12/2019,12,2019,AALR,health,Medical-hospital services analysis and diagnosis,ALLIAR,2401121.02,417043.01,2401121.02,406231.01,AALR3,16,118293,262156.03,7442.0,-0.48,1282190.05,1072865,41300,2401121.02,1.03,24920.99,0.03,0.01,0.35,10.84,0.48,614769.01,88301.0,526468.01,10928.0,,,,18.16,0.35,24.0,168600.0,02/03/2020,20.08,22.22,0.08
1,AALR3,31/03/2020,3,2020,AALR,health,Medical-hospital services analysis and diagnosis,ALLIAR,2624782.08,619860.99,2624782.08,634014.02,AALR3,17,118293,235863.01,-21680.0,-3.91,1262906.05,1046883,9734,2624782.08,0.98,-1131.0,0.01,-0.02,0.08,10.68,0.65,823556.99,256040.0,567516.99,10410.0,,,,9.0,0.08,25.0,593400.0,01/06/2020,10.5,23.81,-0.48


In [45]:
# multiple_price_target.groupby(['date_year', 'date_month', 'contributions'])["company"].count().reset_index()

In [46]:
## Clean data 

In [47]:
# select month
multiple_price_target_14=multiple_price_target[multiple_price_target['contributions'].isin([1,2,3,4])]

In [50]:
prepare_dataset=multiple_price_target_14.drop(columns=['company', 'date', 'date_month', 'date_year','code2', 'name', 'code', 'freq', 'contributions', 'date_fim', 'close_fim'])

In [51]:
prepare_dataset_dummy=pd.get_dummies(prepare_dataset, columns=["setor", 'subsetor' ], prefix="c_")

In [52]:
prepare_dataset.head(2)

Unnamed: 0,setor,subsetor,total_assets,current_assets,total_liabilities,current_liabilities,shares,net_revenue,net_profit,growth_profit,net_equity,net_revenue_12,net_profit_12m,assets,cl,ebit,roe,net_margin,eps,bvps,d_e,gross_debt,availabilities,net_debt,stock,credit_portfolio,deposits,service revenue,close,pe,volume,prediction,real_profit_loss
37,financial,financial intermediaries,18220771.33,12869739.52,18220771.33,12062381.06,151043,170935.01,79767.0,0.11,2031399.94,567821,295400,18220771.33,1.07,,0.15,0.14,1.96,13.45,0.0,,,,,5573562.88,4496803.1,41874.0,9.96,1.96,175810.0,10.76,0.12
38,financial,financial intermediaries,18386843.65,13230279.68,18386843.65,11963915.26,151043,160764.0,72137.0,-0.05,1986349.95,435813,280343,18386843.65,1.11,,0.14,0.17,1.86,13.15,0.0,,,,,5519622.14,4648846.82,40956.0,9.53,1.86,163205.0,6.2,0.14


In [53]:
# validate the correlation 

In [54]:
prepare_datase_correlation=prepare_dataset.drop(columns=["setor", 'subsetor' ])

In [55]:
corr = prepare_datase_correlation.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,total_assets,current_assets,total_liabilities,current_liabilities,shares,net_revenue,net_profit,growth_profit,net_equity,net_revenue_12,net_profit_12m,assets,cl,ebit,roe,net_margin,eps,bvps,d_e,gross_debt,availabilities,net_debt,stock,credit_portfolio,deposits,service revenue,close,pe,volume,prediction,real_profit_loss
total_assets,1.0,0.949363,1.0,0.920635,-0.008089,0.466995,0.11405,-0.010532,0.670427,0.476132,0.544748,1.0,-0.040127,0.87007,0.010881,0.001284,0.066726,0.039872,-0.025422,0.980323,0.965416,0.972137,0.906141,0.993673,0.992586,0.982102,-0.014133,0.066726,0.178715,-0.01371,0.047469
current_assets,0.949363,1.0,0.949363,0.994388,-0.008159,0.202395,0.216824,-0.009408,0.415011,0.209377,0.52485,0.949363,-0.043137,0.845693,0.010645,-0.002199,0.073917,0.027289,-0.026164,0.951453,0.966333,0.935104,0.929007,0.993781,0.992844,0.980211,-0.011495,0.073917,0.117538,-0.011075,0.04926
total_liabilities,1.0,0.949363,1.0,0.920635,-0.008089,0.466995,0.11405,-0.010532,0.670427,0.476132,0.544748,1.0,-0.040127,0.87007,0.010881,0.001284,0.066726,0.039872,-0.025422,0.980323,0.965416,0.972137,0.906141,0.993673,0.992586,0.982102,-0.014133,0.066726,0.178715,-0.01371,0.047469
current_liabilities,0.920635,0.994388,0.920635,1.0,-0.007227,0.121875,0.229244,-0.007402,0.335081,0.128055,0.499245,0.920635,-0.044531,0.805014,0.010102,-0.002045,0.071632,0.023945,-0.026459,0.919821,0.934036,0.904065,0.8594,0.997235,0.99538,0.982552,-0.009886,0.071632,0.11231,-0.009487,0.045947
shares,-0.008089,-0.008159,-0.008089,-0.007227,1.0,-0.00198,-0.001875,0.002081,-0.006037,-0.002755,-0.002711,-0.008089,-0.016752,0.000576,0.003633,0.002351,-0.008082,-0.027138,-0.00616,-0.005157,-0.011358,-0.003316,-0.016142,0.453437,0.453006,0.561613,-0.004085,-0.008082,-0.006325,-0.004017,0.013931
net_revenue,0.466995,0.202395,0.466995,0.121875,-0.00198,1.0,-0.235182,-0.008966,0.862921,0.995522,0.250256,0.466995,-0.035038,0.868853,0.010198,0.063501,0.024524,0.03262,0.010178,0.942346,0.945521,0.929464,0.919976,0.80254,0.818826,0.814114,-0.010638,0.024524,0.233398,-0.010577,0.024523
net_profit,0.11405,0.216824,0.11405,0.229244,-0.001875,-0.235182,1.0,0.053523,-0.063769,-0.238324,0.700872,0.11405,0.024599,-0.168833,-0.009787,-0.005453,0.11558,0.003848,-0.008221,-0.372138,-0.260252,-0.399443,-0.208773,0.579234,0.589387,0.541474,-0.004557,0.11558,-0.202863,-0.004399,0.115538
growth_profit,-0.010532,-0.009408,-0.010532,-0.007402,0.002081,-0.008966,0.053523,1.0,-0.009602,-0.013932,0.024894,-0.010532,0.088956,-0.001676,0.002881,0.000268,0.120299,0.198192,0.013675,-0.019369,-0.007012,-0.022661,-0.010233,-0.085634,-0.085055,-0.073179,0.011107,0.120299,-0.007136,0.017463,0.026874
net_equity,0.670427,0.415011,0.670427,0.335081,-0.006037,0.862921,-0.063769,-0.009602,1.0,0.875348,0.454402,0.670427,0.000387,0.8745,0.009771,0.000962,0.03626,0.055415,-0.023755,0.948777,0.944537,0.937937,0.888695,0.672051,0.675353,0.688814,-0.016981,0.03626,0.216752,-0.01662,0.040373
net_revenue_12,0.476132,0.209377,0.476132,0.128055,-0.002755,0.995522,-0.238324,-0.013932,0.875348,1.0,0.24908,0.476132,-0.035267,0.857846,0.010028,0.008922,0.022954,0.033907,0.008972,0.946106,0.949933,0.93299,0.92623,0.937897,0.93383,0.956505,-0.010241,0.022954,0.232831,-0.010115,0.028497


In [56]:
prepare_dataset_clean=prepare_dataset.drop(columns=["current_liabilities", 'current_assets', 'total_liabilities', 
                                                    'deposits', 'assets', 'net_revenue', 'pe' ,
                                                   'availabilities','net_debt','stock',
                                                   'service revenue', "credit_portfolio", "deposits",
                                                   'gross_debt', "ebit"])

In [57]:
prepare_dataset_clean_correlation=prepare_dataset_clean.drop(columns=["setor", 'subsetor' ])
corr = prepare_dataset_clean_correlation.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,total_assets,shares,net_profit,growth_profit,net_equity,net_revenue_12,net_profit_12m,cl,roe,net_margin,eps,bvps,d_e,close,volume,prediction,real_profit_loss
total_assets,1.0,-0.008089,0.11405,-0.010532,0.670427,0.476132,0.544748,-0.040127,0.010881,0.001284,0.066726,0.039872,-0.025422,-0.014133,0.178715,-0.01371,0.047469
shares,-0.008089,1.0,-0.001875,0.002081,-0.006037,-0.002755,-0.002711,-0.016752,0.003633,0.002351,-0.008082,-0.027138,-0.00616,-0.004085,-0.006325,-0.004017,0.013931
net_profit,0.11405,-0.001875,1.0,0.053523,-0.063769,-0.238324,0.700872,0.024599,-0.009787,-0.005453,0.11558,0.003848,-0.008221,-0.004557,-0.202863,-0.004399,0.115538
growth_profit,-0.010532,0.002081,0.053523,1.0,-0.009602,-0.013932,0.024894,0.088956,0.002881,0.000268,0.120299,0.198192,0.013675,0.011107,-0.007136,0.017463,0.026874
net_equity,0.670427,-0.006037,-0.063769,-0.009602,1.0,0.875348,0.454402,0.000387,0.009771,0.000962,0.03626,0.055415,-0.023755,-0.016981,0.216752,-0.01662,0.040373
net_revenue_12,0.476132,-0.002755,-0.238324,-0.013932,0.875348,1.0,0.24908,-0.035267,0.010028,0.008922,0.022954,0.033907,0.008972,-0.010241,0.232831,-0.010115,0.028497
net_profit_12m,0.544748,-0.002711,0.700872,0.024894,0.454402,0.24908,1.0,0.019784,-0.010957,-0.005611,0.193634,0.016099,-0.014116,-0.012169,0.016901,-0.011506,0.11002
cl,-0.040127,-0.016752,0.024599,0.088956,0.000387,-0.035267,0.019784,1.0,0.011218,0.010555,0.216597,0.376013,-0.043429,-0.010533,-0.02825,-0.009425,0.004168
roe,0.010881,0.003633,-0.009787,0.002881,0.009771,0.010028,-0.010957,0.011218,1.0,-0.0024,0.023529,0.010671,-0.436858,0.007768,0.010793,0.007519,0.021932
net_margin,0.001284,0.002351,-0.005453,0.000268,0.000962,0.008922,-0.005611,0.010555,-0.0024,1.0,-0.017587,-0.005163,0.00781,0.00264,-0.026279,0.002629,-0.150611


In [58]:
prepare_dataset_clean_correlation.isna().sum()

total_assets        0
shares              0
net_profit          0
growth_profit       0
net_equity          0
net_revenue_12      0
net_profit_12m      0
cl                  0
roe                 0
net_margin          0
eps                 0
bvps                0
d_e                 0
close               0
volume              0
prediction          0
real_profit_loss    0
dtype: int64

# Modeling

In [59]:
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

In [60]:
X_train = prepare_dataset_clean_correlation.drop(columns=['real_profit_loss'])

In [61]:
y_train = prepare_dataset_clean_correlation['real_profit_loss']

In [62]:
features_x=list(X_train.columns)
const_name=['const'] 
features= const_name + features_x

In [63]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# train MLR model
X_train = sm.add_constant(X_train)
regressor = sm.OLS(y_train, X_train).fit()
regressor.summary(xname=features)

0,1,2,3
Dep. Variable:,real_profit_loss,R-squared:,0.044
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,2.445
Date:,"Sun, 16 Jan 2022",Prob (F-statistic):,0.00126
Time:,19:38:31,Log-Likelihood:,3.7225
No. Observations:,871,AIC:,26.55
Df Residuals:,854,BIC:,107.6
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1480,0.451,0.328,0.743,-0.737,1.033
total_assets,0.0310,0.134,0.232,0.817,-0.232,0.294
shares,0.0540,0.122,0.443,0.658,-0.185,0.294
net_profit,0.8468,0.432,1.959,0.050,-0.002,1.695
growth_profit,0.1273,0.243,0.524,0.601,-0.350,0.605
net_equity,-0.1457,0.296,-0.492,0.623,-0.727,0.435
net_revenue_12,0.2826,0.256,1.102,0.271,-0.221,0.786
net_profit_12m,0.0540,0.315,0.171,0.864,-0.564,0.672
cl,-0.0137,0.106,-0.129,0.897,-0.221,0.194

0,1,2,3
Omnibus:,632.248,Durbin-Watson:,2.137
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22363.712
Skew:,2.855,Prob(JB):,0.0
Kurtosis:,27.158,Cond. No.,151.0


# Apply only prediction and close variables

In [265]:
X_2 = prepare_dataset_clean_correlation.drop(columns=['real_profit_loss'])

In [266]:
# Keep the signficant variables
X_2=X_2[[ 'net_margin', 'net_profit']]

In [267]:
features_x=list(X_2.columns)
const_name=['const'] 
features_2= const_name + features_x

In [268]:
scaler = MinMaxScaler()
X_2 = scaler.fit_transform(X_2)
# train MLR model
X_2 = sm.add_constant(X_2)
regressor = sm.OLS(y_train, X_2).fit()
regressor.summary(xname=features_2)

0,1,2,3
Dep. Variable:,real_profit_loss,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.034
Method:,Least Squares,F-statistic:,16.13
Date:,"Sun, 16 Jan 2022",Prob (F-statistic):,1.32e-07
Time:,20:57:22,Log-Likelihood:,0.11118
No. Observations:,871,AIC:,5.778
Df Residuals:,868,BIC:,20.09
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4022,0.309,1.304,0.193,-0.203,1.008
net_margin,-1.0924,0.243,-4.500,0.000,-1.569,-0.616
net_profit,0.7951,0.231,3.442,0.001,0.342,1.249

0,1,2,3
Omnibus:,621.263,Durbin-Watson:,2.132
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21264.228
Skew:,2.79,Prob(JB):,0.0
Kurtosis:,26.554,Cond. No.,77.7


# Apply the model

In [118]:
# Modeling 

In [174]:
from sklearn.linear_model import LinearRegression

In [291]:
multiple_price_target = multiple_price_target[~multiple_price_target['net_profit'].isna()]

In [327]:
date_detail_new_model= pd.DataFrame()
investiment_month=10000
investiment=0
# 4 contribution to train the model. After, the model will learn with new information
contribution_filter=[1,2,3,4]

In [328]:
for contributions in range(5,31):
    
    investiment=investiment+investiment_month
    investiment_per_share=investiment/10
    
    # train the model
    df_model=multiple_price_target[multiple_price_target['contributions'].isin(contribution_filter)].reset_index(drop=True)
    df_model_x=df_model[[ 'net_margin', 'net_profit']]
    df_model_y=df_model[[ 'real_profit_loss']]
    

    regressor = LinearRegression()
    regressor.fit(df_model_x, df_model_y)
    score = regressor.score(df_model_x, df_model_y)
    
    # apply the model 
    df_next=multiple_price_target[multiple_price_target['contributions'].isin([contributions])].reset_index(drop=True)
    df_next_x=df_next[[ 'net_margin', 'net_profit']]
    prediction = regressor.predict(df_next_x)
    
    df_next['result_prediction']=df_next['net_margin']*regressor.coef_[0,0] + df_next['net_profit']*regressor.coef_[0, 1]  + regressor.intercept_[0]

    
    # select the stocks, olny profitable  
    df_next_clean= df_next[df_next['net_profit']>0]
    selected_next=df_next_clean.sort_values(by=['result_prediction'], ascending=False).head(10)
    selected_next['no_shares']=investiment_per_share/selected_next['close']
    selected_next['final_profit_loss']=selected_next['no_shares']*selected_next['close_fim'] 
    
    date_detail_new_model= pd.concat([date_detail_new_model, selected_next])
    
    investiment=selected_next['final_profit_loss'].sum()
    contributions_add=[contributions]
    contribution_filter=contribution_filter + contributions_add

In [329]:
date_detail_new_model.groupby(['contributions'])["final_profit_loss"].sum().reset_index()

Unnamed: 0,contributions,final_profit_loss
0,5.0,10759.94
1,6.0,18204.51
2,7.0,29095.78
3,8.0,40601.66
4,9.0,48363.43
5,10.0,71793.63
6,11.0,85633.16
7,12.0,106329.39
8,13.0,105696.29
9,14.0,129262.61


In [330]:
date_detail_new_model.to_csv('date_detail_new_model.csv')