In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import yfinance as yfin
import matplotlib.pyplot as plt
import seaborn as sns
import time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Ibovespa

In [2]:
yfin.pdr_override()
ibovespa = yfin.download(['^BVSP'])
ibovespa

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000,24.500000,0
1993-04-28,24.500000,24.600000,23.700001,24.299999,24.299999,0
1993-04-29,24.299999,24.799999,23.700001,23.700001,23.700001,0
1993-04-30,23.700001,24.200001,23.700001,24.100000,24.100000,0
1993-05-03,24.100000,24.400000,23.799999,24.100000,24.100000,0
...,...,...,...,...,...,...
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000,115925.000000,9580000
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000,114193.000000,11241800
2023-09-27,114194.000000,115340.000000,113366.000000,114327.000000,114327.000000,13359300
2023-09-28,114875.000000,115954.000000,114811.000000,115731.000000,115731.000000,10387000


In [3]:
ibovespa = ibovespa.drop(['Volume','Adj Close'],axis=1)
ibovespa

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000
1993-04-28,24.500000,24.600000,23.700001,24.299999
1993-04-29,24.299999,24.799999,23.700001,23.700001
1993-04-30,23.700001,24.200001,23.700001,24.100000
1993-05-03,24.100000,24.400000,23.799999,24.100000
...,...,...,...,...
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000
2023-09-27,114194.000000,115340.000000,113366.000000,114327.000000
2023-09-28,114875.000000,115954.000000,114811.000000,115731.000000


In [4]:
ibovespa['Tomorrow Close'] = ibovespa['Close'].shift(-1)
ibovespa

Unnamed: 0_level_0,Open,High,Low,Close,Tomorrow Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000,24.299999
1993-04-28,24.500000,24.600000,23.700001,24.299999,23.700001
1993-04-29,24.299999,24.799999,23.700001,23.700001,24.100000
1993-04-30,23.700001,24.200001,23.700001,24.100000,24.100000
1993-05-03,24.100000,24.400000,23.799999,24.100000,24.900000
...,...,...,...,...,...
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000,114193.000000
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000,114327.000000
2023-09-27,114194.000000,115340.000000,113366.000000,114327.000000,115731.000000
2023-09-28,114875.000000,115954.000000,114811.000000,115731.000000,116565.000000


In [5]:
ibovespa['Increased'] = (ibovespa['Tomorrow Close'] > ibovespa['Close']).astype(int)
ibovespa

Unnamed: 0_level_0,Open,High,Low,Close,Tomorrow Close,Increased
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000,24.299999,0
1993-04-28,24.500000,24.600000,23.700001,24.299999,23.700001,0
1993-04-29,24.299999,24.799999,23.700001,23.700001,24.100000,1
1993-04-30,23.700001,24.200001,23.700001,24.100000,24.100000,0
1993-05-03,24.100000,24.400000,23.799999,24.100000,24.900000,1
...,...,...,...,...,...,...
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000,114193.000000,0
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000,114327.000000,1
2023-09-27,114194.000000,115340.000000,113366.000000,114327.000000,115731.000000,1
2023-09-28,114875.000000,115954.000000,114811.000000,115731.000000,116565.000000,1


# Technical Analysis Indicators

In [6]:
time_periods = [3,7,10]

ibovespa_Cont = pd.DataFrame()

for time_period in time_periods:

  weights = list(range(1, time_period+1))
  # Defina uma função personalizada para calcular a média ponderada
  def weighted_average(values):
      return (values * weights).sum() / sum(weights)
  rolling_weighted_averages = ibovespa.rolling(time_period).apply(weighted_average, raw=True)

  ######################################################################################

  # Calcule as mudanças de preço diárias
  ibovespa['Price Change'] = ibovespa['Close'].diff()

  # Separe as mudanças de preço positivas e negativas
  ibovespa['Positive Change'] = np.where(ibovespa['Price Change'] > 0, ibovespa['Price Change'], 0)
  ibovespa['Negative Change'] = np.where(ibovespa['Price Change'] < 0, -ibovespa['Price Change'], 0)

  # Calcule o RS (Relative Strength)
  rs = ibovespa['Positive Change'].rolling(time_period).mean() / ibovespa['Negative Change'].rolling(time_period).mean()

  # Calcule o RSI usando a fórmula
  rsi = 100 - (100 / (1 + rs))

  ######################################################################################

  # Especifique os períodos para o cálculo do MACD e da linha de sinal
  periodo_curto = time_period + 9
  periodo_longo = time_period + 23
  periodo_sinal = time_period + 6

  # Calcule as EMA12 e EMA26
  ema12 = ibovespa['Close'].ewm(span=periodo_curto, adjust=False).mean()
  ema26 = ibovespa['Close'].ewm(span=periodo_longo, adjust=False).mean()

  # Calcule o MACD
  macd = ema12 - ema26

  ######################################################################################

  signal_line = macd.ewm(span=periodo_sinal, adjust=False).mean()

  ######################################################################################

  rolling_max = ibovespa['High'].rolling(time_period).max()
  rolling_min = ibovespa['Low'].rolling(time_period).min()

  ######################################################################################

  # Calcule o Preço Típico
  ibovespa['Typical Price'] = (ibovespa['High'] + ibovespa['Low'] + ibovespa['Close']) / 3

  # Calcule a Média Móvel do Preço Típico
  ibovespa['Typical Price Moving Average'] = ibovespa['Typical Price'].rolling(time_period).mean()

  # Calcule o Desvio Padrão do Preço Típico
  ibovespa['Typical Price Std'] = ibovespa['Typical Price'].rolling(time_period).std()

  ######################################################################################

  ibovespa_Cont[f'simple_{time_period}days_moving_average'] = ibovespa['Close'].rolling(time_period).mean()
  ibovespa_Cont[f'weighted_{time_period}days_moving_average'] = rolling_weighted_averages['Close']
  ibovespa_Cont[f'{time_period}days_momentum'] = ibovespa['Close'].rolling(time_period).apply(lambda x: x[-1] - x[0])
  ibovespa_Cont[f'{time_period}days_stochastic_K'] = 100*(ibovespa['Close'] - rolling_min) / (rolling_max - rolling_min)
  ibovespa_Cont[f'{time_period}days_stochastic_D'] = ibovespa_Cont[f'{time_period}days_stochastic_K'].rolling(time_period).mean()
  ibovespa_Cont[f'{time_period}days_RSI'] = rsi
  ibovespa_Cont[f'{time_period}days_MACD'] = macd
  ibovespa_Cont[f'{time_period}days_Signal_Line'] = signal_line
  ibovespa_Cont[f'{time_period}days_LW_R'] = 100*(rolling_max - ibovespa['Close']) / (rolling_max - rolling_min)
  ibovespa_Cont[f'{time_period}days_CCI'] = (ibovespa['Typical Price'] - ibovespa['Typical Price Moving Average']) / (0.015 * ibovespa['Typical Price Std'])

  #####################################################################################

ibovespa_Cont = ibovespa_Cont.dropna()

In [7]:
ibovespa_Cont = ibovespa_Cont['2020-01-01':]

In [8]:
df_x = ibovespa_Cont
df_x

Unnamed: 0_level_0,simple_3days_moving_average,weighted_3days_moving_average,3days_momentum,3days_stochastic_K,3days_stochastic_D,3days_RSI,3days_MACD,3days_Signal_Line,3days_LW_R,3days_CCI,...,simple_10days_moving_average,weighted_10days_moving_average,10days_momentum,10days_stochastic_K,10days_stochastic_D,10days_RSI,10days_MACD,10days_Signal_Line,10days_LW_R,10days_CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,117023.666667,117363.500000,2039.0,100.000000,56.321045,67.801455,2373.626214,1971.690129,0.000000,71.408683,...,115321.6,116245.072727,6677.0,100.000000,90.939106,80.516050,1892.869919,1389.749660,0.000000,90.860845
2020-01-03,117414.666667,117705.166667,1743.0,65.478842,59.713605,64.499382,2409.093513,2059.170806,34.521158,50.934747,...,115902.7,116678.781818,5091.0,84.263959,91.963865,78.936361,1945.492848,1455.131212,15.736041,86.941298
2020-01-06,117719.333333,117436.833333,-1695.0,39.102768,68.193870,60.618030,2343.296172,2115.995879,60.897232,-72.258477,...,116328.9,116856.109091,2563.0,70.517560,90.387521,70.995074,1944.488194,1512.702621,29.482440,41.056211
2020-01-07,117082.333333,116908.166667,-1045.0,24.655111,43.078907,0.000000,2247.810550,2142.358813,75.344889,-55.033878,...,116563.6,116916.672727,1531.0,58.070866,86.250023,63.539864,1923.304910,1561.008773,41.929134,10.580794
2020-01-08,116595.666667,116490.500000,-631.0,27.507448,30.421776,0.000000,2114.278389,2136.742728,72.492552,-54.615252,...,116675.2,116859.109091,1126.0,40.342241,80.286247,56.750544,1876.959629,1598.179462,59.657759,-11.374234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-25,116026.333333,115989.666667,-220.0,11.274824,6.344364,0.000000,-185.594142,-42.275333,88.725176,-61.391004,...,117720.2,117261.818182,-2043.0,8.367007,58.377888,43.758144,-108.194363,-42.012157,91.632993,-116.274038
2023-09-26,115375.666667,115073.000000,-1816.0,1.104775,5.491418,0.000000,-409.654833,-115.751233,98.895225,-73.952006,...,117342.7,116620.509091,-3983.0,0.551798,50.614477,27.316428,-229.104008,-64.022963,99.448202,-123.273190
2023-09-27,114815.000000,114548.666667,-1598.0,36.060038,16.146546,6.871795,-569.843068,-206.569600,63.939962,-54.944568,...,116957.8,116072.200000,-5065.0,14.982850,44.662438,26.664242,-324.656272,-94.685705,85.017150,-104.957494
2023-09-28,114750.333333,115006.666667,1538.0,91.383308,42.849374,47.033639,-576.852734,-280.626227,8.616692,71.978538,...,116591.7,115849.145455,-3027.0,36.872466,39.074883,28.298755,-345.873152,-124.237169,63.127534,-50.667749


In [9]:
df_y = ibovespa['Increased']['2020-01-01':]
df_y

Date
2020-01-02    0
2020-01-03    0
2020-01-06    0
2020-01-07    0
2020-01-08    0
             ..
2023-09-25    0
2023-09-26    1
2023-09-27    1
2023-09-28    1
2023-09-29    0
Name: Increased, Length: 933, dtype: int64

In [10]:
predictors = df_x.columns

train_size = int(round(len(df_y)*0.7,0))
test_size = len(df_y) - train_size

# Splits the X dataset into train and test
x_train = np.array(df_x[0:train_size])
x_test = np.array(df_x[train_size:])

# Splits the Y dataset into train and test
y_train = np.array(df_y[:train_size])
y_test = np.array(df_y[train_size:])

#Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'penalty': ['l1', 'l2'],           # Tipo de penalização
    'C': [0.1, 1, 10],                # Parâmetro de inversão da regularização
    'solver': ['liblinear', 'saga'],   # Algoritmo de otimização
    'max_iter': [100, 200, 300]       # Número máximo de iterações
}

# Crie o modelo de Regressão Logística
LR_model = LogisticRegression(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = GridSearchCV(estimator=LR_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

print("")
# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_LR = grid_search.best_estimator_

start_time = time.time()
y_pred = best_LR.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")


Melhores Hiperparâmetros:
{'C': 0.1, 'max_iter': 200, 'penalty': 'l1', 'solver': 'saga'}

Acurácia nos Dados de Teste: 0.4679

True Positive Rate: 0.459

Tempo de execução: 0.0003 segundos




# Gradient Boosting

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'n_estimators': [50, 100, 200],          # Número de estimadores (árvores)
    'learning_rate': [0.01, 0.1, 0.2],      # Taxa de aprendizado
    'max_depth': [3, 4, 5],                # Profundidade máxima das árvores
    'min_samples_split': [2, 3, 4],         # Número mínimo de amostras necessárias para dividir um nó
    'min_samples_leaf': [1, 2, 3],          # Número mínimo de amostras em uma folha
}

# Crie o modelo de Regressão Logística
GB_model = GradientBoostingClassifier(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = RandomizedSearchCV(estimator=GB_model, param_distributions=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_GB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_GB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'n_estimators': 100, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_depth': 4, 'learning_rate': 0.1}

Acurácia nos Dados de Teste: 0.4464

True Positive Rate: 0.4266

Tempo de execução: 0.0027 segundos


#Naive Bayes - Gaussian

In [13]:
from sklearn.naive_bayes import GaussianNB

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6],   # Parâmetro de suavização
}

# Crie o modelo Naive Bayes (GaussianNB)
GaussianNB_model = GaussianNB()

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = GridSearchCV(estimator=GaussianNB_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_GNB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_GNB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'var_smoothing': 1e-06}

Acurácia nos Dados de Teste: 0.5036

True Positive Rate: 0.4675

Tempo de execução: 0.0016 segundos


#Naive Bayes - Bernoulli

In [14]:
from sklearn.naive_bayes import BernoulliNB

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'alpha': [1.0, 0.1, 0.01, 0.001],   # Parâmetro de suavização Laplace
    'binarize': [0.0, 0.1, 0.2, 0.3],  # Valor de limiarização para binarização
}

# Crie o modelo Naive Bayes (GaussianNB)
BernoulliNB_model = BernoulliNB()

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = GridSearchCV(estimator=BernoulliNB_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_BNB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_BNB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'alpha': 0.1, 'binarize': 0.2}

Acurácia nos Dados de Teste: 0.4893

True Positive Rate: 0.474

Tempo de execução: 0.0003 segundos


#X Gradient Boosting

In [15]:
from xgboost import XGBClassifier

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'n_estimators': [50, 100, 200],            # Número de árvores (estimadores)
    'learning_rate': [0.01, 0.1, 0.2],         # Taxa de aprendizado
    'max_depth': [3, 4, 5],                   # Profundidade máxima das árvores
    'min_child_weight': [1, 2, 3],            # Peso mínimo da criança
}

# Crie o modelo XGBoost
XGB_model = XGBClassifier(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = RandomizedSearchCV(estimator=XGB_model, param_distributions=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_XGB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_XGB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.01}

Acurácia nos Dados de Teste: 0.475

True Positive Rate: 0.4532

Tempo de execução: 0.0016 segundos


#Light Gradient Boosting

In [16]:
from lightgbm import LGBMClassifier

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'n_estimators': [50, 100, 200],            # Número de árvores (estimadores)
    'learning_rate': [0.01, 0.1, 0.2],         # Taxa de aprendizado
    'max_depth': [3, 4, 5],                   # Profundidade máxima das árvores
    'min_child_weight': [1, 2, 3],            # Peso mínimo da criança
}

# Crie o modelo LightGBM
LGB_model = LGBMClassifier(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = RandomizedSearchCV(estimator=LGB_model, param_distributions=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_LGB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_LGB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

[LightGBM] [Info] Number of positive: 340, number of negative: 313
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6503
[LightGBM] [Info] Number of data points in the train set: 653, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.520674 -> initscore=0.082742
[LightGBM] [Info] Start training from score 0.082742
Melhores Hiperparâmetros:
{'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.01}

Acurácia nos Dados de Teste: 0.4429

True Positive Rate: 0.4247

Tempo de execução: 0.0029 segundos


#SVM Classifier

In [17]:
from sklearn.svm import SVC

# Defina os hiperparâmetros e as distribuições para pesquisa aleatória
param_dist = {
    'C':[0.1, 1, 10],                # Parâmetro de margem (distribuição exponencial)
    'kernel': ['linear', 'rbf'],        # Tipo de kernel
    'gamma': [0.001, 0.01, 0.1],       # Parâmetro do kernel RBF (distribuição uniforme)
}

# Crie o modelo SVM
SVC_model = SVC(random_state=1)

n_iter = 50

# Crie um objeto RandomizedSearchCV para realizar a pesquisa aleatória
random_search = GridSearchCV(estimator=SVC_model, param_grid=param_dist, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa aleatória nos dados de treinamento
random_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(random_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_SVM = random_search.best_estimator_

start_time = time.time()
y_pred = best_SVM.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}

Acurácia nos Dados de Teste: 0.4786

True Positive Rate: 0.4786

Tempo de execução: 0.0139 segundos


#Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Defina os hiperparâmetros e as distribuições para pesquisa aleatória
param_dist = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200],
    'n_estimators': [10, 25, 30, 50, 100, 200],
    'max_features': [2, 3,4,5],
}

# Crie o modelo SVM
RF_model = RandomForestClassifier(random_state=1)

# Crie um objeto RandomizedSearchCV para realizar a pesquisa aleatória
random_search = GridSearchCV(estimator=RF_model, param_grid=param_dist, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa aleatória nos dados de treinamento
random_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(random_search.best_params_)
print("")

best_RF = random_search.best_estimator_

start_time = time.time()
y_pred = best_RF.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'max_depth': 5, 'max_features': 5, 'min_samples_leaf': 10, 'n_estimators': 10}

Acurácia nos Dados de Teste: 0.5357

True Positive Rate: 0.513

Tempo de execução: 0.0024 segundos


#MLP

In [30]:
# Implements the random forest model one more time, but with different parameters

from sklearn.neural_network import MLPClassifier

params = {
    'hidden_layer_sizes': [(10, 2, 1), (10, 4, 1), (10, 8, 1), (10, 16, 1), (10, 24, 1)],
    'activation': ['relu', 'tanh', 'logistic', 'identity'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.001, 0.01],
    'solver': ['adam', 'lbfgs', 'sgd']}

# Crie o classificador MLP
MLP_model = MLPClassifier(max_iter=10000)

# Execute a pesquisa aleatória
grid_search = RandomizedSearchCV(estimator=MLP_model,
                             param_distributions=params,
                             cv=4,
                             scoring="accuracy",
                             n_jobs=-1,
                             verbose=1)

grid_search.fit(x_train, y_train)

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_MLP = grid_search.best_estimator_

start_time = time.time()
y_pred = best_MLP.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Acurácia nos Dados de Teste: 0.5393

True Positive Rate: 0.5373

Tempo de execução: 0.0003 segundos


8 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 749, in fit
    return self._fit(X, y, incremental=False)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 491, in _fit
    raise ValueError(
ValueError: Solver produced non-finite parameter weights. The input data may contain large values and need to be preproc

# Data Resizing

In [20]:
n_agg = 10
x_agg = []

for i in range(len(df_x)-n_agg+1):
  x_agg.append(df_x[i:i+n_agg])

x_agg = np.array(x_agg)

In [21]:
train_size = int(round(x_agg.shape[0]*0.7,0))
test_size = 1 - train_size

In [22]:
x_train_agg = x_agg[:train_size]
x_test_agg = x_agg[train_size:]

In [23]:
y_agg = df_y[n_agg-1:]

In [24]:
from scipy.special import y1
y_train_agg = y_agg[:train_size]
y_test_agg = y_agg[train_size:]

In [25]:
x_train_agg.shape, y_train_agg.shape

((647, 10, 30), (647,))

In [26]:
x_test_agg.shape, y_test_agg.shape

((277, 10, 30), (277,))

#LSTM

In [27]:
from tensorflow.keras.layers import LSTM

n = 1
layer_1 = [8, 16, 24, 32, 64, 128]
layer_2 = [4, 8, 12, 16, 32, 64]

for i,j in zip(layer_1, layer_2):

    print(f'LSTM {n}')
    print('Layer 1 = ', i)
    print('Layer 2 = ', j)

    # Creates the MLP with 2 hidden layers
    LSTM_model = Sequential([
        LSTM(i, activation='relu', input_shape=(x_train_agg.shape[1], x_train_agg.shape[2]), return_sequences=True),
        LSTM(j, activation='relu', return_sequences=False),
        Dropout(0.2),
        Dense(1)])

    # Compiles the model
    LSTM_model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['accuracy'])

    LSTM_model.fit(x_train_agg, y_train_agg.values, validation_split=0.1, epochs=10, batch_size=16, verbose=0)

    start_time = time.time()
    y_pred = LSTM_model.predict(x_test_agg)
    end_time = time.time()

    threshold = 0.5
    y_pred = (y_pred > threshold).astype(int)

    accuracy = accuracy_score(y_test_agg, y_pred)
    print('Accuracy: '+str(round(accuracy,4)))

    # Obtendo valores da matriz de confusão
    tn, fp, fn, tp = confusion_matrix(y_test_agg, y_pred).ravel()
    # Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
    tpr = round(tp / (tp + fp),4)
    print("True Positive Rate:", round(tpr,4))

    execution_time = end_time - start_time
    print("Tempo de execução:", round(execution_time, 4), "segundos")
    print('')

    n+=1

LSTM 1
Layer 1 =  8
Layer 2 =  4
Accuracy: 0.5199
True Positive Rate: nan
Tempo de execução: 0.2689 segundos

LSTM 2
Layer 1 =  16
Layer 2 =  8


  tpr = round(tp / (tp + fp),4)


Accuracy: 0.5596
True Positive Rate: 0.6571
Tempo de execução: 0.2246 segundos

LSTM 3
Layer 1 =  24
Layer 2 =  12
Accuracy: 0.4801
True Positive Rate: 0.4801
Tempo de execução: 0.3456 segundos

LSTM 4
Layer 1 =  32
Layer 2 =  16
Accuracy: 0.5126
True Positive Rate: 0.0
Tempo de execução: 0.2576 segundos

LSTM 5
Layer 1 =  64
Layer 2 =  32
Accuracy: 0.4693
True Positive Rate: 0.4493
Tempo de execução: 0.2466 segundos

LSTM 6
Layer 1 =  128
Layer 2 =  64
Accuracy: 0.5162
True Positive Rate: 0.4971
Tempo de execução: 0.2576 segundos



#GRU

In [28]:
# Implements the random forest model one more time, but with different parameters

from tensorflow.keras.layers import GRU

n = 1
layer_1 = [8, 16, 24, 32, 64, 128]
layer_2 = [4, 8, 12, 16, 32, 64]

for i,j in zip(layer_1, layer_2):

    print(f'GRU {n}')
    print('Layer 1 = ', i)
    print('Layer 2 = ', j)

    # Creates the MLP with 2 hidden layers
    GRU_model = Sequential([
        GRU(i, activation='relu', input_shape=(x_train_agg.shape[1], x_train_agg.shape[2]), return_sequences=True),
        GRU(j, activation='relu', return_sequences=False),
        Dropout(0.2),
        Dense(1)])

    # Compiles the model
    GRU_model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['accuracy'])

    GRU_model.fit(x_train_agg, y_train_agg.values, validation_split=0.1, epochs=10, batch_size=16, verbose=0)

    start_time = time.time()
    y_pred = GRU_model.predict(x_test_agg)
    end_time = time.time()

    threshold = 0.5
    y_pred = (y_pred > threshold).astype(int)

    accuracy = accuracy_score(y_test_agg, y_pred)
    print('Accuracy: '+str(round(accuracy,4)))

    # Obtendo valores da matriz de confusão
    tn, fp, fn, tp = confusion_matrix(y_test_agg, y_pred).ravel()
    # Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
    tpr = round(tp / (tp + fp),4)
    print("True Positive Rate:", tpr)

    execution_time = end_time - start_time
    print("Tempo de execução:", round(execution_time, 4), "segundos")
    print('')

    n+=1

GRU 1
Layer 1 =  8
Layer 2 =  4
Accuracy: 0.5199
True Positive Rate: nan
Tempo de execução: 0.2307 segundos

GRU 2
Layer 1 =  16
Layer 2 =  8


  tpr = round(tp / (tp + fp),4)


Accuracy: 0.5487
True Positive Rate: 0.5833
Tempo de execução: 0.2357 segundos

GRU 3
Layer 1 =  24
Layer 2 =  12
Accuracy: 0.4801
True Positive Rate: 0.4801
Tempo de execução: 0.2361 segundos

GRU 4
Layer 1 =  32
Layer 2 =  16
Accuracy: 0.5199
True Positive Rate: nan
Tempo de execução: 0.2709 segundos

GRU 5
Layer 1 =  64
Layer 2 =  32


  tpr = round(tp / (tp + fp),4)


Accuracy: 0.5054
True Positive Rate: 0.4906
Tempo de execução: 0.2785 segundos

GRU 6
Layer 1 =  128
Layer 2 =  64
Accuracy: 0.491
True Positive Rate: 0.4765
Tempo de execução: 0.2735 segundos



# Benchmark

In [29]:
true_ratio = round(len(y_test[y_test == 1])/len(y_test),4)
true_ratio

0.4786