## Correlação das features

In [1]:
import numpy as np
import pandas as pd
from treino_avaliacao import treinar_avaliar
from Models.LSTM.LSTM import LSTMModel
from Models.GRU.GRU import GRUModel
import sys, os
sys.path.append('..')


RANDOM_SEED = 33

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
get_dataset_path = lambda  stock, get_labels,get_train: os.path.join("..","FinalDatasets", stock,f"{stock}_{'y' if get_labels else 'X'}_timeseries_{'train' if get_train else 'test'}.npy")

X_train = np.load(get_dataset_path(stock= "VALE3", get_labels= False, get_train= True))
y_train = np.load(get_dataset_path(stock= "VALE3", get_labels= True, get_train= True))
X_test  = np.load(get_dataset_path(stock= "VALE3", get_labels= False, get_train= False))
y_test  = np.load(get_dataset_path(stock= "VALE3", get_labels= True, get_train= False))

In [3]:
df_train = pd.read_csv("..\\FinalDatasets\\VALE3\\VALE3_tabular_train.csv")
df_test = pd.read_csv("..\\FinalDatasets\\VALE3\\VALE3_tabular_test.csv")

In [4]:
coluna_target = df_train["hasRise"]
df_train.drop(columns= ["Date", "hasRise"], inplace= True)
df_test.drop(columns= ["Date"], inplace= True)

In [5]:
def get_highly_correlated_features(correlation_matrix, threshold):
  correlated_pairs = []
  for i in range(len(correlation_matrix.columns)):
    for j in range(i):
      if abs(correlation_matrix.iloc[i, j]) > threshold:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
        coefficient = correlation_matrix.iloc[i, j]
        correlated_pairs.append((pair, coefficient))
  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)

MAX_CORRELATION = 0.90

corr_matrix = df_train.corr(method='spearman').abs()
correlation_list = get_highly_correlated_features(corr_matrix, MAX_CORRELATION)
len(correlation_list), correlation_list[:10]


(54,
 [(('bom negócio_count', 'valorização_count'), 1.0),
  (('lucro_count', 'valorização_count'), 1.0),
  (('lucro_count', 'bom negócio_count'), 1.0),
  (('neutro_count', 'valorização_count'), 1.0),
  (('neutro_count', 'bom negócio_count'), 1.0),
  (('neutro_count', 'lucro_count'), 1.0),
  (('desvalorização_count', 'valorização_count'), 1.0),
  (('desvalorização_count', 'bom negócio_count'), 1.0),
  (('desvalorização_count', 'lucro_count'), 1.0),
  (('desvalorização_count', 'neutro_count'), 1.0)])

In [6]:
# Drop high correlated features in correlation list

f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])

df_train_reduced = df_train.drop(f2drop, axis='columns')
df_test_reduced = df_test.drop(f2drop, axis='columns')

In [7]:
len(df_train_reduced.columns)

40

In [8]:
from create_train_sets import get_sequences_X_y

In [9]:
df_train_reduced["hasRise"] = coluna_target 
X_train_reduced, y_train_reduced = get_sequences_X_y(df_train_reduced)


In [10]:
X_test_reduced, y_test_reduced = get_sequences_X_y(df_test_reduced)

In [11]:
X_train_reduced.shape,X_test_reduced.shape

((370, 7, 40), (115, 7, 40))

## Colocar dropout

In [12]:
from Models.LSTM.LSTM_dropout import LSTMModel_dropout
from Models.GRU.GRU_dropout import GRUModel_dropout


In [13]:
from treino_avaliacao import treinar, avaliar

In [14]:
n_trials = 100
file_to_save_model = "low_corr_features.pt"

In [15]:
model = treinar( X_train_reduced, y_train_reduced,LSTMModel_dropout, n_trials, 'VALE3'+file_to_save_model)
model.eval()
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)


[I 2024-07-12 00:02:22,664] A new study created in memory with name: no-name-13b8842d-0284-4ecc-8cd7-e8d577cebd06
[I 2024-07-12 00:02:36,839] Trial 0 finished with value: 0.7499437966867545 and parameters: {'hidden_size': 18, 'num_layers': 4, 'learning_rate': 0.0009724929887188219}. Best is trial 0 with value: 0.7499437966867545.
[I 2024-07-12 00:02:43,903] Trial 1 finished with value: 0.6877798397334394 and parameters: {'hidden_size': 38, 'num_layers': 1, 'learning_rate': 0.00018248998624737642}. Best is trial 0 with value: 0.7499437966867545.
[I 2024-07-12 00:02:53,866] Trial 2 finished with value: 0.7366469896605747 and parameters: {'hidden_size': 72, 'num_layers': 3, 'learning_rate': 0.0002730209722191135}. Best is trial 0 with value: 0.7499437966867545.
[I 2024-07-12 00:03:02,979] Trial 3 finished with value: 0.7438178840819636 and parameters: {'hidden_size': 125, 'num_layers': 3, 'learning_rate': 0.00023672403410176067}. Best is trial 0 with value: 0.7499437966867545.
[I 2024-07-

Number of finished trials:  100
Best trial:
  Value:  0.910958118187034
  Params: 
    hidden_size: 69
    num_layers: 1
    learning_rate: 0.005737968057665162
do not know the model
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       187
           1       1.00      1.00      1.00       183

    accuracy                           1.00       370
   macro avg       1.00      1.00      1.00       370
weighted avg       1.00      1.00      1.00       370

              precision    recall  f1-score   support

           0       0.65      0.65      0.65        68
           1       0.49      0.49      0.49        47

    accuracy                           0.58       115
   macro avg       0.57      0.57      0.57       115
weighted avg       0.58      0.58      0.58       115



In [16]:
model = treinar( X_train_reduced, y_train_reduced,GRUModel_dropout, n_trials, 'VALE3'+file_to_save_model)
model.eval()
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)


[I 2024-07-12 00:14:20,039] A new study created in memory with name: no-name-18442c8b-9a2d-4e60-bee3-31d505d4168f
[I 2024-07-12 00:14:29,770] Trial 0 finished with value: 0.7511896733601509 and parameters: {'hidden_size': 77, 'num_layers': 5, 'learning_rate': 0.005397658724109907}. Best is trial 0 with value: 0.7511896733601509.
[I 2024-07-12 00:14:35,903] Trial 1 finished with value: 0.8591452991452992 and parameters: {'hidden_size': 121, 'num_layers': 1, 'learning_rate': 0.0008132688465363583}. Best is trial 1 with value: 0.8591452991452992.
[I 2024-07-12 00:14:41,925] Trial 2 finished with value: 0.8762657442345198 and parameters: {'hidden_size': 95, 'num_layers': 1, 'learning_rate': 0.0033816828663464794}. Best is trial 2 with value: 0.8762657442345198.
[I 2024-07-12 00:14:49,566] Trial 3 finished with value: 0.7222959674957139 and parameters: {'hidden_size': 84, 'num_layers': 3, 'learning_rate': 0.000203044642538705}. Best is trial 2 with value: 0.8762657442345198.
[I 2024-07-12 0

Number of finished trials:  100
Best trial:
  Value:  0.910932801686829
  Params: 
    hidden_size: 51
    num_layers: 1
    learning_rate: 0.0030332268738291496
do not know the model
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       187
           1       0.99      1.00      1.00       183

    accuracy                           1.00       370
   macro avg       1.00      1.00      1.00       370
weighted avg       1.00      1.00      1.00       370

              precision    recall  f1-score   support

           0       0.59      0.68      0.63        68
           1       0.41      0.32      0.36        47

    accuracy                           0.53       115
   macro avg       0.50      0.50      0.49       115
weighted avg       0.51      0.53      0.52       115



In [17]:
df_train = pd.read_csv("..\\FinalDatasets\\PETR4\\PETR4_tabular_train.csv")
df_test = pd.read_csv("..\\FinalDatasets\\PETR4\\PETR4_tabular_test.csv")

coluna_target = df_train["hasRise"]
df_train.drop(columns= ["Date", "hasRise"], inplace= True)
df_test.drop(columns= ["Date"], inplace= True)

# Drop high correlated features in correlation list

f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])

df_train_reduced = df_train.drop(f2drop, axis='columns')
df_test_reduced = df_test.drop(f2drop, axis='columns')

df_train_reduced["hasRise"] = coluna_target 
X_train_reduced, y_train_reduced = get_sequences_X_y(df_train_reduced)
X_test_reduced, y_test_reduced = get_sequences_X_y(df_test_reduced)

In [18]:
X_train_reduced.shape,X_test_reduced.shape

((370, 7, 40), (115, 7, 40))

In [19]:
model = treinar( X_train_reduced, y_train_reduced,LSTMModel_dropout, n_trials, 'PETR4'+file_to_save_model)
model.eval()
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)


[I 2024-07-12 00:25:42,801] A new study created in memory with name: no-name-98a78309-f515-446c-85ea-a71381bc7c2c
[I 2024-07-12 00:25:51,358] Trial 0 finished with value: 0.7790014074525109 and parameters: {'hidden_size': 110, 'num_layers': 2, 'learning_rate': 0.009480276077163707}. Best is trial 0 with value: 0.7790014074525109.
[I 2024-07-12 00:25:59,650] Trial 1 finished with value: 0.7918042746074919 and parameters: {'hidden_size': 58, 'num_layers': 3, 'learning_rate': 0.003276792563509802}. Best is trial 1 with value: 0.7918042746074919.
[I 2024-07-12 00:26:09,002] Trial 2 finished with value: 0.703616122677534 and parameters: {'hidden_size': 116, 'num_layers': 4, 'learning_rate': 0.008591032654885465}. Best is trial 1 with value: 0.7918042746074919.
[I 2024-07-12 00:26:17,389] Trial 3 finished with value: 0.5867125103228796 and parameters: {'hidden_size': 84, 'num_layers': 3, 'learning_rate': 0.00010057437628508267}. Best is trial 1 with value: 0.7918042746074919.
[I 2024-07-12 0

Number of finished trials:  100
Best trial:
  Value:  0.8712365095960409
  Params: 
    hidden_size: 76
    num_layers: 1
    learning_rate: 0.0036966944374137688
do not know the model
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       169
           1       0.98      0.98      0.98       201

    accuracy                           0.98       370
   macro avg       0.98      0.98      0.98       370
weighted avg       0.98      0.98      0.98       370

              precision    recall  f1-score   support

           0       0.45      0.59      0.51        49
           1       0.61      0.47      0.53        66

    accuracy                           0.52       115
   macro avg       0.53      0.53      0.52       115
weighted avg       0.54      0.52      0.52       115



In [20]:
model = treinar( X_train_reduced, y_train_reduced,GRUModel_dropout, n_trials, 'PETR4'+file_to_save_model)
model.eval()
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)


[I 2024-07-12 00:38:09,079] A new study created in memory with name: no-name-ee7bebfe-27fe-4745-8f2a-273cfa48e949
[I 2024-07-12 00:38:18,961] Trial 0 finished with value: 0.6177112620489978 and parameters: {'hidden_size': 73, 'num_layers': 5, 'learning_rate': 0.007946186456591158}. Best is trial 0 with value: 0.6177112620489978.
[I 2024-07-12 00:38:26,839] Trial 1 finished with value: 0.6199104263207322 and parameters: {'hidden_size': 73, 'num_layers': 3, 'learning_rate': 0.0003003396388745038}. Best is trial 1 with value: 0.6199104263207322.
[I 2024-07-12 00:38:34,843] Trial 2 finished with value: 0.6098167593216329 and parameters: {'hidden_size': 54, 'num_layers': 3, 'learning_rate': 0.00036843450581779553}. Best is trial 1 with value: 0.6199104263207322.
[I 2024-07-12 00:38:42,166] Trial 3 finished with value: 0.8247499581766885 and parameters: {'hidden_size': 124, 'num_layers': 3, 'learning_rate': 0.0022134239125972755}. Best is trial 3 with value: 0.8247499581766885.
[I 2024-07-12

Number of finished trials:  100
Best trial:
  Value:  0.8913883119141335
  Params: 
    hidden_size: 103
    num_layers: 1
    learning_rate: 0.004331616459644097
do not know the model
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       169
           1       0.99      1.00      0.99       201

    accuracy                           0.99       370
   macro avg       0.99      0.99      0.99       370
weighted avg       0.99      0.99      0.99       370

              precision    recall  f1-score   support

           0       0.44      0.65      0.53        49
           1       0.60      0.39      0.48        66

    accuracy                           0.50       115
   macro avg       0.52      0.52      0.50       115
weighted avg       0.54      0.50      0.50       115



Os modelos continuam tendo overfit, provalvemente isso ocorre por causa da falta de daos, uma vez que o dataset é muito pequeno (menos de 400 samples).