In [36]:
import pandas as pd
import tensorflow as tf
import numpy as np
import yfinance as yfin
import matplotlib.pyplot as plt
import seaborn as sns
import time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Ibovespa

In [37]:
yfin.pdr_override()
ibovespa = yfin.download(['^BVSP'])
ibovespa

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000,24.500000,0
1993-04-28,24.500000,24.600000,23.700001,24.299999,24.299999,0
1993-04-29,24.299999,24.799999,23.700001,23.700001,23.700001,0
1993-04-30,23.700001,24.200001,23.700001,24.100000,24.100000,0
1993-05-03,24.100000,24.400000,23.799999,24.100000,24.100000,0
...,...,...,...,...,...,...
2023-09-21,118695.000000,118695.000000,116013.000000,116145.000000,116145.000000,12685800
2023-09-22,116147.000000,116968.000000,115855.000000,116009.000000,116009.000000,9443500
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000,115925.000000,9580000
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000,114193.000000,11241800


In [38]:
ibovespa = ibovespa.drop(['Volume','Adj Close'],axis=1)
ibovespa

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000
1993-04-28,24.500000,24.600000,23.700001,24.299999
1993-04-29,24.299999,24.799999,23.700001,23.700001
1993-04-30,23.700001,24.200001,23.700001,24.100000
1993-05-03,24.100000,24.400000,23.799999,24.100000
...,...,...,...,...
2023-09-21,118695.000000,118695.000000,116013.000000,116145.000000
2023-09-22,116147.000000,116968.000000,115855.000000,116009.000000
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000


In [39]:
ibovespa['Tomorrow Close'] = ibovespa['Close'].shift(-1)
ibovespa

Unnamed: 0_level_0,Open,High,Low,Close,Tomorrow Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000,24.299999
1993-04-28,24.500000,24.600000,23.700001,24.299999,23.700001
1993-04-29,24.299999,24.799999,23.700001,23.700001,24.100000
1993-04-30,23.700001,24.200001,23.700001,24.100000,24.100000
1993-05-03,24.100000,24.400000,23.799999,24.100000,24.900000
...,...,...,...,...,...
2023-09-21,118695.000000,118695.000000,116013.000000,116145.000000,116009.000000
2023-09-22,116147.000000,116968.000000,115855.000000,116009.000000,115925.000000
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000,114193.000000
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000,114327.046875


In [40]:
ibovespa['Increased'] = (ibovespa['Tomorrow Close'] > ibovespa['Close']).astype(int)
ibovespa

Unnamed: 0_level_0,Open,High,Low,Close,Tomorrow Close,Increased
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-04-27,24.799999,25.400000,24.500000,24.500000,24.299999,0
1993-04-28,24.500000,24.600000,23.700001,24.299999,23.700001,0
1993-04-29,24.299999,24.799999,23.700001,23.700001,24.100000,1
1993-04-30,23.700001,24.200001,23.700001,24.100000,24.100000,0
1993-05-03,24.100000,24.400000,23.799999,24.100000,24.900000,1
...,...,...,...,...,...,...
2023-09-21,118695.000000,118695.000000,116013.000000,116145.000000,116009.000000,0
2023-09-22,116147.000000,116968.000000,115855.000000,116009.000000,115925.000000,0
2023-09-25,116009.000000,116031.000000,115573.000000,115925.000000,114193.000000,0
2023-09-26,115922.000000,115922.000000,114162.000000,114193.000000,114327.046875,1


# Technical Analysis Indicators

In [41]:
time_periods = [3,7,10]

for time_period in time_periods:

  weights = list(range(1, time_period+1))
  # Defina uma função personalizada para calcular a média ponderada
  def weighted_average(values):
      return (values * weights).sum() / sum(weights)
  rolling_weighted_averages = ibovespa.rolling(time_period).apply(weighted_average, raw=True)

  ######################################################################################

  # Calcule as mudanças de preço diárias
  ibovespa['Price Change'] = ibovespa['Close'].diff()

  # Separe as mudanças de preço positivas e negativas
  ibovespa['Positive Change'] = np.where(ibovespa['Price Change'] > 0, ibovespa['Price Change'], 0)
  ibovespa['Negative Change'] = np.where(ibovespa['Price Change'] < 0, -ibovespa['Price Change'], 0)

  # Calcule o RS (Relative Strength)
  rs = ibovespa['Positive Change'].rolling(time_period).mean() / ibovespa['Negative Change'].rolling(time_period).mean()

  # Calcule o RSI usando a fórmula
  rsi = 100 - (100 / (1 + rs))

  ######################################################################################

  # Especifique os períodos para o cálculo do MACD e da linha de sinal
  periodo_curto = time_period + 9
  periodo_longo = time_period + 23
  periodo_sinal = time_period + 6

  # Calcule as EMA12 e EMA26
  ema12 = ibovespa['Close'].ewm(span=periodo_curto, adjust=False).mean()
  ema26 = ibovespa['Close'].ewm(span=periodo_longo, adjust=False).mean()

  # Calcule o MACD
  macd = ema12 - ema26

  ######################################################################################

  signal_line = macd.ewm(span=periodo_sinal, adjust=False).mean()

  ######################################################################################

  rolling_max = ibovespa['High'].rolling(time_period).max()
  rolling_min = ibovespa['Low'].rolling(time_period).min()

  ######################################################################################

  # Calcule o Preço Típico
  ibovespa['Typical Price'] = (ibovespa['High'] + ibovespa['Low'] + ibovespa['Close']) / 3

  # Calcule a Média Móvel do Preço Típico
  ibovespa['Typical Price Moving Average'] = ibovespa['Typical Price'].rolling(time_period).mean()

  # Calcule o Desvio Padrão do Preço Típico
  ibovespa['Typical Price Std'] = ibovespa['Typical Price'].rolling(time_period).std()

  ######################################################################################

  ibovespa[f'simple_{time_period}days_moving_average'] = ibovespa['Close'].rolling(time_period).mean()
  ibovespa[f'weighted_{time_period}days_moving_average'] = rolling_weighted_averages['Close']
  ibovespa[f'{time_period}days_momentum'] = ibovespa['Close'].rolling(time_period).apply(lambda x: x[-1] - x[0])
  ibovespa[f'{time_period}days_stochastic_K'] = 100*(ibovespa['Close'] - rolling_min) / (rolling_max - rolling_min)
  ibovespa[f'{time_period}days_stochastic_D'] = ibovespa[f'{time_period}days_stochastic_K'].rolling(time_period).mean()
  ibovespa[f'{time_period}days_RSI'] = rsi
  ibovespa[f'{time_period}days_MACD'] = macd
  ibovespa[f'{time_period}days_Signal_Line'] = signal_line
  ibovespa[f'{time_period}days_LW_R'] = 100*(rolling_max - ibovespa['Close']) / (rolling_max - rolling_min)
  ibovespa[f'{time_period}days_CCI'] = (ibovespa['Typical Price'] - ibovespa['Typical Price Moving Average']) / (0.015 * ibovespa['Typical Price Std'])

  #####################################################################################

ibovespa = ibovespa.dropna()

In [42]:
ibovespa = ibovespa['2020-01-01':]

In [43]:
df_y = ibovespa['Increased']
df_y

Date
2020-01-02    0
2020-01-03    0
2020-01-06    0
2020-01-07    0
2020-01-08    0
             ..
2023-09-20    0
2023-09-21    0
2023-09-22    0
2023-09-25    0
2023-09-26    1
Name: Increased, Length: 930, dtype: int64

In [44]:
ibovespa_TDD = pd.DataFrame()

for time_period in time_periods:

  ibovespa_TDD[f'simple_{time_period}days_moving_average_TDD'] = np.where(ibovespa['Close'] > ibovespa[f'simple_{time_period}days_moving_average'], 1, -1)
  ibovespa_TDD[f'weighted_{time_period}days_moving_average_TDD'] = np.where(ibovespa['Close'] > ibovespa[f'weighted_{time_period}days_moving_average'], 1, -1)
  ibovespa_TDD[f'{time_period}days_stochastic_K_TDD'] = np.where(ibovespa[f'{time_period}days_stochastic_K'] > ibovespa[f'{time_period}days_stochastic_K'].shift(1), 1, -1)
  ibovespa_TDD[f'{time_period}days_stochastic_D_TDD'] = np.where(ibovespa[f'{time_period}days_stochastic_D'] > ibovespa[f'{time_period}days_stochastic_D'].shift(1), 1, -1)
  ibovespa_TDD[f'{time_period}days_LW_R_TDD'] = np.where(ibovespa[f'{time_period}days_LW_R'] > ibovespa[f'{time_period}days_LW_R'].shift(1), 1, -1)
  ibovespa_TDD[f'{time_period}days_MACD_TDD'] = np.where(ibovespa[f'{time_period}days_MACD'] > ibovespa[f'{time_period}days_MACD'].shift(1), 1, -1)
  ibovespa_TDD[f'{time_period}days_RSI_TDD'] = np.where((ibovespa[f'{time_period}days_RSI'] > 70) | (ibovespa[f'{time_period}days_RSI'] < ibovespa[f'{time_period}days_RSI'].shift(1)), -1, 1)
  ibovespa_TDD[f'{time_period}_CCI_TDD'] = np.where((ibovespa[f'{time_period}days_CCI'] > 200) | (ibovespa[f'{time_period}days_CCI'] < ibovespa[f'{time_period}days_CCI'].shift(1)), -1, 1)
  ibovespa_TDD[f'{time_period}days_momentum_TDD'] = np.where(ibovespa[f'{time_period}days_momentum'] > 0, 1, -1)

In [45]:
df_x = ibovespa_TDD
df_x = df_x.replace(-1,0)

In [46]:
df_x = df_x.set_index(ibovespa.index)
df_x

Unnamed: 0_level_0,simple_3days_moving_average_TDD,weighted_3days_moving_average_TDD,3days_stochastic_K_TDD,3days_stochastic_D_TDD,3days_LW_R_TDD,3days_MACD_TDD,3days_RSI_TDD,3_CCI_TDD,3days_momentum_TDD,simple_7days_moving_average_TDD,...,7days_momentum_TDD,simple_10days_moving_average_TDD,weighted_10days_moving_average_TDD,10days_stochastic_K_TDD,10days_stochastic_D_TDD,10days_LW_R_TDD,10days_MACD_TDD,10days_RSI_TDD,10_CCI_TDD,10days_momentum_TDD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,1,1,0,0,0,0,1,1,1,1,...,1,1,1,0,0,0,0,0,1,1
2020-01-03,1,1,0,1,1,1,0,0,1,1,...,1,1,1,0,1,1,1,0,0,1
2020-01-06,0,0,0,1,1,0,0,0,0,0,...,1,1,1,0,0,1,0,0,0,1
2020-01-07,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
2020-01-08,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-20,1,1,1,1,0,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
2023-09-21,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
2023-09-22,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,1,1,0,1,0,0
2023-09-25,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [47]:
predictors = df_x.columns

train_size = int(round(len(df_y)*0.7,0))
test_size = len(df_y) - train_size

# Splits the X dataset into train and test
x_train = np.array(df_x[0:train_size])
x_test = np.array(df_x[train_size:])

# Splits the Y dataset into train and test
y_train = np.array(df_y[:train_size])
y_test = np.array(df_y[train_size:])

#Logistic Regression

In [48]:
from sklearn.linear_model import LogisticRegression

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'penalty': ['l1', 'l2'],           # Tipo de penalização
    'C': [0.1, 1, 10],                # Parâmetro de inversão da regularização
    'solver': ['liblinear', 'saga'],   # Algoritmo de otimização
    'max_iter': [100, 200, 300]       # Número máximo de iterações
}

# Crie o modelo de Regressão Logística
LR_model = LogisticRegression(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = GridSearchCV(estimator=LR_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

print("")
# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_LR = grid_search.best_estimator_

start_time = time.time()
y_pred = best_LR.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")


Melhores Hiperparâmetros:
{'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}

Acurácia nos Dados de Teste: 0.4982

True Positive Rate: 0.4833

Tempo de execução: 0.0003 segundos


# Gradient Boosting

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'n_estimators': [50, 100, 200],          # Número de estimadores (árvores)
    'learning_rate': [0.01, 0.1, 0.2],      # Taxa de aprendizado
    'max_depth': [3, 4, 5],                # Profundidade máxima das árvores
    'min_samples_split': [2, 3, 4],         # Número mínimo de amostras necessárias para dividir um nó
    'min_samples_leaf': [1, 2, 3],          # Número mínimo de amostras em uma folha
}

# Crie o modelo de Regressão Logística
GB_model = GradientBoostingClassifier(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = RandomizedSearchCV(estimator=GB_model, param_distributions=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_GB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_GB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.01}

Acurácia nos Dados de Teste: 0.4659

True Positive Rate: 0.4615

Tempo de execução: 0.002 segundos


#Naive Bayes - Gaussian

In [50]:
from sklearn.naive_bayes import GaussianNB

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6],   # Parâmetro de suavização
}

# Crie o modelo Naive Bayes (GaussianNB)
GaussianNB_model = GaussianNB()

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = GridSearchCV(estimator=GaussianNB_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_GNB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_GNB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'var_smoothing': 1e-09}

Acurácia nos Dados de Teste: 0.491

True Positive Rate: 0.4726

Tempo de execução: 0.0009 segundos


#Naive Bayes - Bernoulli

In [51]:
from sklearn.naive_bayes import BernoulliNB

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'alpha': [1.0, 0.1, 0.01, 0.001],   # Parâmetro de suavização Laplace
    'binarize': [0.0, 0.1, 0.2, 0.3],  # Valor de limiarização para binarização
}

# Crie o modelo Naive Bayes (GaussianNB)
BernoulliNB_model = BernoulliNB()

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = GridSearchCV(estimator=BernoulliNB_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_BNB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_BNB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'alpha': 1.0, 'binarize': 0.0}

Acurácia nos Dados de Teste: 0.4875

True Positive Rate: 0.4694

Tempo de execução: 0.0004 segundos


#X Gradient Boosting

In [52]:
from xgboost import XGBClassifier

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'n_estimators': [50, 100, 200],            # Número de árvores (estimadores)
    'learning_rate': [0.01, 0.1, 0.2],         # Taxa de aprendizado
    'max_depth': [3, 4, 5],                   # Profundidade máxima das árvores
    'min_child_weight': [1, 2, 3],            # Peso mínimo da criança
}

# Crie o modelo XGBoost
XGB_model = XGBClassifier(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = RandomizedSearchCV(estimator=XGB_model, param_distributions=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_XGB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_XGB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.01}

Acurácia nos Dados de Teste: 0.509

True Positive Rate: 0.4915

Tempo de execução: 0.0014 segundos


#Light Gradient Boosting

In [53]:
from lightgbm import LGBMClassifier

# Defina os hiperparâmetros que você deseja ajustar
param_grid = {
    'n_estimators': [50, 100, 200],            # Número de árvores (estimadores)
    'learning_rate': [0.01, 0.1, 0.2],         # Taxa de aprendizado
    'max_depth': [3, 4, 5],                   # Profundidade máxima das árvores
    'min_child_weight': [1, 2, 3],            # Peso mínimo da criança
}

# Crie o modelo LightGBM
LGB_model = LGBMClassifier(random_state=1)

# Crie um objeto GridSearchCV para realizar a pesquisa em grade
grid_search = RandomizedSearchCV(estimator=LGB_model, param_distributions=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa em grade nos dados de treinamento
grid_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(grid_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_LGB = grid_search.best_estimator_

start_time = time.time()
y_pred = best_LGB.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

[LightGBM] [Info] Number of positive: 338, number of negative: 313
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.519201 -> initscore=0.076843
[LightGBM] [Info] Start training from score 0.076843
Melhores Hiperparâmetros:
{'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.01}

Acurácia nos Dados de Teste: 0.5448

True Positive Rate: 0.5185

Tempo de execução: 0.0028 segundos


#SVM Classifier

In [54]:
from sklearn.svm import SVC

# Defina os hiperparâmetros e as distribuições para pesquisa aleatória
param_dist = {
    'C':[0.1, 1, 10],                # Parâmetro de margem (distribuição exponencial)
    'kernel': ['linear', 'rbf'],        # Tipo de kernel
    'gamma': [0.001, 0.01, 0.1],       # Parâmetro do kernel RBF (distribuição uniforme)
}

# Crie o modelo SVM
SVC_model = SVC(random_state=1)

n_iter = 50

# Crie um objeto RandomizedSearchCV para realizar a pesquisa aleatória
random_search = GridSearchCV(estimator=SVC_model, param_grid=param_dist, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa aleatória nos dados de treinamento
random_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(random_search.best_params_)
print("")

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_SVM = random_search.best_estimator_

start_time = time.time()
y_pred = best_SVM.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Acurácia nos Dados de Teste: 0.4624

True Positive Rate: 0.4474

Tempo de execução: 0.0108 segundos


#Random Forest Classifier

In [55]:
from sklearn.ensemble import RandomForestClassifier

# Defina os hiperparâmetros e as distribuições para pesquisa aleatória
param_dist = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200],
    'n_estimators': [10, 25, 30, 50, 100, 200],
    'max_features': [2, 3,4,5],
}

# Crie o modelo SVM
RF_model = RandomForestClassifier(random_state=1)

# Crie um objeto RandomizedSearchCV para realizar a pesquisa aleatória
random_search = GridSearchCV(estimator=RF_model, param_grid=param_dist, cv=5, scoring='accuracy', n_jobs=-1)

# Realize a pesquisa aleatória nos dados de treinamento
random_search.fit(x_train, y_train)

# Imprima os melhores hiperparâmetros encontrados
print("Melhores Hiperparâmetros:")
print(random_search.best_params_)
print("")

best_RF = random_search.best_estimator_

start_time = time.time()
y_pred = best_RF.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Melhores Hiperparâmetros:
{'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 10, 'n_estimators': 10}

Acurácia nos Dados de Teste: 0.5054

True Positive Rate: 0.4891

Tempo de execução: 0.0016 segundos


#MLP

In [56]:
# Implements the random forest model one more time, but with different parameters

from sklearn.neural_network import MLPClassifier

params = {
    'hidden_layer_sizes': [(10, 2, 1), (10, 4, 1), (10, 8, 1), (10, 16, 1), (10, 24, 1)],
    'activation': ['relu', 'tanh', 'logistic', 'identity'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.001, 0.01],
    'solver': ['adam', 'lbfgs', 'sgd']}

# Crie o classificador MLP
MLP_model = MLPClassifier(max_iter=10000)

# Execute a pesquisa aleatória
grid_search = RandomizedSearchCV(estimator=MLP_model,
                             param_distributions=params,
                             cv=4,
                             scoring="accuracy",
                             n_jobs=-1,
                             verbose=1)

grid_search.fit(x_train, y_train)

# Avalie o modelo com os melhores hiperparâmetros nos dados de teste
best_MLP = grid_search.best_estimator_

start_time = time.time()
y_pred = best_MLP.predict(x_test)
end_time = time.time()

test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia nos Dados de Teste:", round(test_accuracy,4))
print("")

# Obtendo valores da matriz de confusão
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
tpr = round(tp / (tp + fp),4)
print("True Positive Rate:", round(tpr,4))
print('')

execution_time = end_time - start_time
print("Tempo de execução:", round(execution_time, 4), "segundos")

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Acurácia nos Dados de Teste: 0.5125

True Positive Rate: 0.4948

Tempo de execução: 0.0005 segundos


# Data Resizing

In [57]:
n_agg = 10
x_agg = []

for i in range(len(df_x)-n_agg+1):
  x_agg.append(df_x[i:i+n_agg])

x_agg = np.array(x_agg)

In [58]:
train_size = int(round(x_agg.shape[0]*0.7,0))
test_size = 1 - train_size

In [59]:
x_train_agg = x_agg[:train_size]
x_test_agg = x_agg[train_size:]

In [60]:
y_agg = df_y[n_agg-1:]

In [61]:
from scipy.special import y1
y_train_agg = y_agg[:train_size]
y_test_agg = y_agg[train_size:]

In [62]:
x_train_agg.shape, y_train_agg.shape

((645, 10, 27), (645,))

In [63]:
x_test_agg.shape, y_test_agg.shape

((276, 10, 27), (276,))

#LSTM

In [67]:
from tensorflow.keras.layers import LSTM

n = 1
layer_1 = [8, 16, 24, 32, 64, 128]
layer_2 = [4, 8, 12, 16, 32, 64]

for i,j in zip(layer_1, layer_2):

    print(f'LSTM {n}')
    print('Layer 1 = ', i)
    print('Layer 2 = ', j)

    # Creates the MLP with 2 hidden layers
    LSTM_model = Sequential([
        LSTM(i, activation='relu', input_shape=(x_train_agg.shape[1], x_train_agg.shape[2]), return_sequences=True),
        LSTM(j, activation='relu', return_sequences=False),
        Dropout(0.2),
        Dense(1)])

    # Compiles the model
    LSTM_model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['accuracy'])

    LSTM_model.fit(x_train_agg, y_train_agg.values, validation_split=0.1, epochs=10, batch_size=16, verbose=0)

    start_time = time.time()
    y_pred = LSTM_model.predict(x_test_agg)
    end_time = time.time()

    threshold = 0.5
    y_pred = (y_pred > threshold).astype(int)

    accuracy = accuracy_score(y_test_agg, y_pred)
    print('Accuracy: '+str(round(accuracy,4)))

    # Obtendo valores da matriz de confusão
    tn, fp, fn, tp = confusion_matrix(y_test_agg, y_pred).ravel()
    # Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
    tpr = round(tp / (tp + fp),4)
    print("True Positive Rate:", round(tpr,4))

    execution_time = end_time - start_time
    print("Tempo de execução:", round(execution_time, 4), "segundos")
    print('')

    n+=1

LSTM 1
Layer 1 =  8
Layer 2 =  4
Accuracy: 0.4674
True Positive Rate: 0.4516
Tempo de execução: 0.2395 segundos

LSTM 2
Layer 1 =  16
Layer 2 =  8
Accuracy: 0.4964
True Positive Rate: 0.4624
Tempo de execução: 0.343 segundos

LSTM 3
Layer 1 =  24
Layer 2 =  12
Accuracy: 0.4928
True Positive Rate: 0.3571
Tempo de execução: 0.2438 segundos

LSTM 4
Layer 1 =  32
Layer 2 =  16
Accuracy: 0.4529
True Positive Rate: 0.4585
Tempo de execução: 0.2756 segundos

LSTM 5
Layer 1 =  64
Layer 2 =  32
Accuracy: 0.4855
True Positive Rate: 0.4653
Tempo de execução: 0.2283 segundos

LSTM 6
Layer 1 =  128
Layer 2 =  64
Accuracy: 0.4964
True Positive Rate: 0.4255
Tempo de execução: 0.2558 segundos



#GRU

In [68]:
# Implements the random forest model one more time, but with different parameters

from tensorflow.keras.layers import GRU

n = 1
layer_1 = [8, 16, 24, 32, 64, 128]
layer_2 = [4, 8, 12, 16, 32, 64]

for i,j in zip(layer_1, layer_2):

    print(f'GRU {n}')
    print('Layer 1 = ', i)
    print('Layer 2 = ', j)

    # Creates the MLP with 2 hidden layers
    GRU_model = Sequential([
        GRU(i, activation='relu', input_shape=(x_train_agg.shape[1], x_train_agg.shape[2]), return_sequences=True),
        GRU(j, activation='relu', return_sequences=False),
        Dropout(0.2),
        Dense(1)])

    # Compiles the model
    GRU_model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['accuracy'])

    GRU_model.fit(x_train_agg, y_train_agg.values, validation_split=0.1, epochs=10, batch_size=16, verbose=0)

    start_time = time.time()
    y_pred = GRU_model.predict(x_test_agg)
    end_time = time.time()

    threshold = 0.5
    y_pred = (y_pred > threshold).astype(int)

    accuracy = accuracy_score(y_test_agg, y_pred)
    print('Accuracy: '+str(round(accuracy,4)))

    # Obtendo valores da matriz de confusão
    tn, fp, fn, tp = confusion_matrix(y_test_agg, y_pred).ravel()
    # Calculando a True Positive Rate (Taxa de Verdadeiros Positivos)
    tpr = round(tp / (tp + fp),4)
    print("True Positive Rate:", tpr)

    execution_time = end_time - start_time
    print("Tempo de execução:", round(execution_time, 4), "segundos")
    print('')

    n+=1

GRU 1
Layer 1 =  8
Layer 2 =  4
Accuracy: 0.4493
True Positive Rate: 0.3913
Tempo de execução: 0.2529 segundos

GRU 2
Layer 1 =  16
Layer 2 =  8
Accuracy: 0.4855
True Positive Rate: 0.451
Tempo de execução: 0.2594 segundos

GRU 3
Layer 1 =  24
Layer 2 =  12
Accuracy: 0.5217
True Positive Rate: nan
Tempo de execução: 0.2514 segundos

GRU 4
Layer 1 =  32
Layer 2 =  16


  tpr = round(tp / (tp + fp),4)


Accuracy: 0.4674
True Positive Rate: 0.438
Tempo de execução: 0.2956 segundos

GRU 5
Layer 1 =  64
Layer 2 =  32
Accuracy: 0.4855
True Positive Rate: 0.4342
Tempo de execução: 0.2561 segundos

GRU 6
Layer 1 =  128
Layer 2 =  64
Accuracy: 0.5254
True Positive Rate: 0.507
Tempo de execução: 0.2853 segundos



# Benchmark

In [66]:
true_ratio = round(len(y_test[y_test == 1])/len(y_test),4)
true_ratio

0.4803