# Stock price prediction

This project is an attempt to predict the direction of future stock prices of shares traded on the Brasil Bolsa Balcão (B3), based on fundamental multiples and cross-sectional analysis. Classification models (a logit model and a multilayer perceptron) are used in this approach, where the multiples are the independent variables and the sign of the next-period change in market value is the dependent variable.

### Project Script
- [Imports](#imports)
- [Get data](#get_data)
- [Process data](#process_data)
- [Inspect features](#inspect_features)
- [Create model](#create_model)
- [Evaluate model](#evaluate_model)

<a id='imports'></a>
## Imports

In [1]:
# standard imports
from matplotlib.pyplot import figure
from multiprocessing import Pool
import matplotlib.pyplot as plt
import pmdarima as pm
import pandas as pd
import numpy as np
import math
import sys
import os

# path hack
sys.path.insert(0, os.path.abspath('../clair'))

# custom imports
from clair.data.economatica import read_data, reshape_data
from clair.utils import visualization, preprocessing
from clair.learn import prediction
from clair.learn.models import logit, mlp

<a id='get_data'></a>
## Get data

In [2]:
# root folder of the Economatica spreadsheets; the trailing slash matters because
# sub-folder names are concatenated directly onto this string
base_dir = '../economatica/'

# load the asset screening sheet and preview its first rows
screening_file = base_dir + 'info/info_acoes.xlsx'
asset_info = read_data.screening(screening_file)
asset_info.head()

Unnamed: 0,Nome,Classe,Bolsa / Fonte,Tipo de Ativo,Ativo / Cancelado,Código
0,524 Particip,ON,Bovespa,Ação,ativo,QVQP3B
1,Abc Brasil,PN,Bovespa,Ação,ativo,ABCB4
2,Aco Altona,ON,Bovespa,Ação,ativo,EALT3
3,Aco Altona,PN,Bovespa,Ação,ativo,EALT4
4,Advanced-Dh,ON,Bovespa,Ação,ativo,ADHM3


In [3]:
# read the balance-sheet and indicator workbooks for the screened tickers;
# each result is indexed by feature name (e.g. balance['valor_mercado'])
tickers = asset_info['Código']
balance = read_data.matrixx(base_dir + 'balanco/', tickers)
matrixx = read_data.matrixx(base_dir + 'indicadores/', tickers)

capital_giro.xlsx
divida_liquida.xlsx
fluxo_caixa_livre.xlsx
invested_capital.xlsx
lucro_liquido.xlsx
patrimonio_liquido.xlsx
valor_mercado.xlsx
alavancagem_financeira.xlsx
dividend_ratio.xlsx
estrutura_capital.xlsx
indice_forca_relativa.xlsx
liquidez.xlsx
margem_liquida.xlsx
momentum.xlsx
preco-div-vendas.xlsx
ROE.xlsx
taxa_interna_retorno.xlsx


In [4]:
# show the last five rows of the market-value table as a sanity check
balance['valor_mercado'].tail(n=5)

Unnamed: 0_level_0,QVQP3B,ABCB4,EALT3,EALT4,ADHM3,TIET3,TIET4,TIET11,AFLT3,ALEF3B,...,WEGE3,MWET3,MWET4,WHRL3,WHRL4,WSON33,WIZS3,WLMM3,WLMM4,YDUQ3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3T2018,0,2792061.0,110925.0,110925.0,12857.1,3808386.2,3808386.2,3808386.2,316055.85,0,...,41419000.0,5145.0,5145.0,7592640.8,7592640.8,2905776.0,1247277.0,174062.1226,174062.1226,7499823.87
4T2018,0,3387284.0,94282.5,94282.5,11344.5,3989621.2,3989621.2,3989621.2,350121.75,0,...,36784270.0,6091.68,6091.68,6841247.8,6841247.8,2849760.0,1120950.0,183462.3915,183462.3915,7130684.88
1T2019,0,4137488.0,122880.0,122880.0,12100.8,4378927.85,4378927.85,4378927.85,336873.9,0,...,37752230.0,7109.4,7109.4,6763965.6,6763965.6,2641015.08,1359212.0,233405.4305,233405.4305,7985346.6
2T2019,0,4131469.0,104430.0,104430.0,10058.79,4677617.0,4677617.0,4677617.0,331194.675,0,...,44711270.0,6970.68,6970.68,6911022.96,6911022.96,2493540.0,1747787.0,221910.0455,221910.0455,8714094.0
3T2019,0,3851856.0,196140.0,196140.0,13764.66,4950162.43,4950162.43,4950162.43,1103982.25,0,...,50822790.0,9018.0,9018.0,6561619.9,6561619.9,2422296.0,1782966.0,282833.3155,282833.3155,10847379.84


<a id='process_data'></a>
## Process data

In [5]:
# express every balance-sheet item relative to market value
market_value = balance['valor_mercado']
for feat in balance:
    matrixx[feat] = preprocessing.ratio(balance[feat], market_value)

# keep the raw market value itself (this overwrites the self-ratio stored by the loop)
matrixx['valor_mercado'] = market_value


def _log_size(value):
    """Return the natural log of a market value; non-positive values map to 0.0."""
    if value <= 0:
        return 0.0
    return math.log(value)


# 'size' feature: element-wise log of market value, column by column
matrixx['size'] = market_value.apply(lambda column: column.map(_log_size))

In [6]:
# pivot data so that it is keyed by date
raw_data = reshape_data.matrixx_to_date_key(matrixx, matrixx['valor_mercado'].index)


def _direction(delta):
    """Label a market-value change: 1 where it is strictly positive, -1 otherwise."""
    # NOTE(review): a NaN change compares False and is therefore labelled -1 --
    # confirm that missing data should count as a negative signal
    return (delta > 0).replace(True, 1).replace(False, -1)


# slide a window of three consecutive dates over the data:
#   date_2 -> supplies the features
#   date_1 -> key under which the cross section is stored
#   date   -> following period, used only to build the target
# NOTE(review): features are taken from date_2 but stored under date_1 --
# presumably a deliberate one-period lag; confirm
dates = list(raw_data)
cross_data = {}

for date_2, date_1, date in zip(dates, dates[1:], dates[2:]):
    # remove outliers, twice
    features = preprocessing.drop_outliers(raw_data[date_2], 3)
    features = preprocessing.drop_outliers(features, 3)

    # standardize data (market value itself is not used as a feature)
    section = preprocessing.standardize(features.drop('valor_mercado', axis=1))

    # target: direction of the market-value change over the next period
    section['signal'] = _direction(
        raw_data[date]['valor_mercado'] - raw_data[date_1]['valor_mercado']
    )

    # extra feature: direction of the change over the period that just ended
    section['last_signal'] = _direction(
        raw_data[date_1]['valor_mercado'] - raw_data[date_2]['valor_mercado']
    )

    cross_data[date_1] = section

  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np

  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col

  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col

  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]
  return (a - mns) / sstd
  df = data[np.abs(stats.zscore(data[col])) > zscore]


In [7]:
# show the last five rows of one processed cross section as a sanity check
cross_data['1T2019'].tail(n=5)

Unnamed: 0,alavancagem_financeira,dividend_ratio,estrutura_capital,indice_forca_relativa,liquidez,margem_liquida,momentum,preco-div-vendas,ROE,taxa_interna_retorno,capital_giro,divida_liquida,fluxo_caixa_livre,invested_capital,lucro_liquido,patrimonio_liquido,size,signal,last_signal
WHRL4,0.530631,0.155724,-0.663904,0.794727,-0.441575,-0.058345,-0.120986,0.096356,-0.107465,-0.06664,0.029599,-0.286066,0.466321,-0.616493,-0.091133,-0.348518,0.742785,1.0,-1.0
WSON33,0.021956,0.731631,0.276828,0.880035,-0.412521,0.046507,0.277129,0.335552,-0.092692,-0.077795,-0.031323,0.061271,0.257833,-0.126187,-0.040788,-0.059198,0.596448,-1.0,-1.0
WIZS3,0.269823,1.556451,-0.914729,-1.185145,0.209455,0.37179,-1.083267,0.448617,4.961023,-1.535405,-0.136534,-0.262735,0.088099,-0.690142,0.140115,-0.477146,0.440532,1.0,1.0
WLMM3,0.159726,-0.779186,-0.736383,-0.174182,-0.443115,-0.068461,-0.759362,-0.420199,-0.308138,-1.375803,0.327549,-0.366924,0.006,0.538331,0.016007,1.015142,0.138091,-1.0,1.0
WLMM4,0.159726,-0.779186,-0.736383,0.229722,-0.441387,-0.068461,-0.224487,-0.418441,-0.308138,-0.366714,0.327549,-0.366924,0.006,0.538331,0.016007,1.015142,0.138091,-1.0,1.0


<a id='create_model'></a>
## Create model

In [8]:
# upper bound on the number of worker processes
count = 4

# One entry per experiment: label -> argument list for prediction.cross_section.
# Each list presumably holds [data, target column, model, activation] -- confirm
# against prediction.cross_section. Keeping the label next to its arguments means
# labels and results cannot drift apart, which the previous hand-written positional
# unpack allowed ("not enough values to unpack (expected 4, got 2)").
# To compare further MLP activations (e.g. 'logistic', 'relu'), add entries here.
experiments = {
    'logit': [cross_data, 'signal', logit, None],
    'tanh': [cross_data, 'signal', mlp, 'tanh'],
}
args = list(experiments.values())

# Run the experiments in parallel without starting more workers than there are jobs.
# The context manager releases the pool even if a worker raises.
with Pool(min(count, len(args))) as pool:
    data = pool.map(prediction.cross_section, args)

# pool.map returns results in input order, so zipping with the labels is safe
results = dict(zip(experiments, data))

ValueError: not enough values to unpack (expected 4, got 2)

<a id='evaluate_model'></a>
## Evaluate model

In [10]:
# summary statistics of the logit train and test scores, side by side
logit_scores = results['logit']['consolidated']
df = pd.concat(
    [logit_scores['train_score'].describe(), logit_scores['test_score'].describe()],
    axis=1,
    keys=['train', 'test'],
)
df

Unnamed: 0,train,test
count,121.0,121.0
mean,0.69825,0.536364
std,0.216133,0.2752
min,0.025316,0.0
25%,0.597403,0.330097
50%,0.742424,0.589474
75%,0.836364,0.727273
max,1.0,0.987013


In [11]:
# summary statistics of the tanh-MLP train and test scores, side by side
tanh_scores = results['tanh']['consolidated']
df = pd.concat(
    [tanh_scores['train_score'].describe(), tanh_scores['test_score'].describe()],
    axis=1,
    keys=['train', 'test'],
)
df

Unnamed: 0,train,test
count,121.0,121.0
mean,0.752963,0.546463
std,0.176699,0.26318
min,0.065574,0.0
25%,0.691358,0.377358
50%,0.787565,0.586667
75%,0.858209,0.738462
max,1.0,0.987013


In [36]:
# logit results restricted to rows whose train score exceeds 0.9
df = results['logit']['consolidated']

df.loc[df['train_score'].gt(0.9)].describe()

Unnamed: 0,train_score,test_score
count,19.0,19.0
mean,0.958917,0.851409
std,0.028314,0.153086
min,0.909091,0.434783
25%,0.933495,0.796215
50%,0.966667,0.918033
75%,0.980952,0.966224
max,1.0,0.987013


In [37]:
# tanh-MLP results restricted to rows whose train score exceeds 0.9
df = results['tanh']['consolidated']

df.loc[df['train_score'].gt(0.9)].describe()

Unnamed: 0,train_score,test_score
count,19.0,19.0
mean,0.960598,0.851123
std,0.025972,0.1551
min,0.909091,0.416667
25%,0.934409,0.796215
50%,0.966667,0.918033
75%,0.980952,0.966224
max,1.0,0.987013
