## Correlação das features

In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Make the parent directory importable *before* the project-local imports
# below. Appending it after them is too late on a fresh kernel whenever one
# of those modules lives one directory up. Appending (not inserting) keeps
# modules in the current directory first in the lookup order.
sys.path.append('..')

from treino_avaliacao import treinar_avaliar
from Models.LSTM.LSTM import LSTMModel
from Models.GRU.GRU import GRUModel


# NOTE(review): RANDOM_SEED is not referenced in the visible cells; confirm
# where (or whether) it is applied.
RANDOM_SEED = 33

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_dataset_path(stock, get_labels, get_train):
    """Build the path of a pre-built time-series ``.npy`` file for *stock*.

    Args:
        stock: Ticker used both as the sub-folder of ``../FinalDatasets`` and
            as the file-name prefix (e.g. ``"VALE3"``).
        get_labels: Truthy selects the ``y`` file, falsy the ``X`` file.
        get_train: Truthy selects the ``train`` split, falsy the ``test`` split.

    Returns:
        ``../FinalDatasets/<stock>/<stock>_<X|y>_timeseries_<train|test>.npy``
        joined with the path separator of the current OS.
    """
    kind = 'y' if get_labels else 'X'
    split = 'train' if get_train else 'test'
    return os.path.join("..", "FinalDatasets", stock, f"{stock}_{kind}_timeseries_{split}.npy")

# Load the pre-built VALE3 time-series arrays: for each split the X file is
# read first and the y file second, train split before test split.
X_train, y_train = (
    np.load(get_dataset_path(stock="VALE3", get_labels=want_labels, get_train=True))
    for want_labels in (False, True)
)
X_test, y_test = (
    np.load(get_dataset_path(stock="VALE3", get_labels=want_labels, get_train=False))
    for want_labels in (False, True)
)

In [3]:
# Build the CSV paths with os.path.join instead of hard-coded "..\\"
# separators, which only resolve on Windows; this is the same approach
# get_dataset_path uses for the .npy files.
df_train = pd.read_csv(os.path.join("..", "FinalDatasets", "VALE3", "VALE3_tabular_train.csv"))
df_test = pd.read_csv(os.path.join("..", "FinalDatasets", "VALE3", "VALE3_tabular_test.csv"))

In [4]:
# Take the target column out of the training frame (it is re-attached once
# the features have been pruned) and drop the date column from both frames.
# The test frame keeps its "hasRise" column.
coluna_target = df_train.pop("hasRise")
df_train.drop(columns="Date", inplace=True)
df_test.drop(columns="Date", inplace=True)

In [5]:
def get_highly_correlated_features(correlation_matrix, threshold):
    """List the column pairs whose correlation magnitude exceeds *threshold*.

    Only the strictly lower triangle of the matrix is scanned, so each
    unordered pair is reported at most once and the diagonal is ignored.

    Args:
        correlation_matrix: Square DataFrame of pairwise coefficients whose
            columns name the features.
        threshold: A pair is kept when ``abs(coefficient) > threshold``.

    Returns:
        A list of ``((row_feature, column_feature), coefficient)`` tuples
        ordered by the stored coefficient, highest first. Pairs with equal
        coefficients keep their scan order.
    """
    names = correlation_matrix.columns
    found = []
    for row, row_name in enumerate(names):
        for col in range(row):
            value = correlation_matrix.iloc[row, col]
            if abs(value) > threshold:
                found.append(((row_name, names[col]), value))
    # list.sort is stable, also with reverse=True, so ties stay in scan order.
    found.sort(key=lambda entry: entry[1], reverse=True)
    return found

# Pairs whose absolute correlation is above this value are reported.
MAX_CORRELATION = 0.90

# Spearman correlation between the training columns, reduced to magnitudes.
corr_matrix = df_train.corr(method="spearman")
corr_matrix = corr_matrix.abs()
correlation_list = get_highly_correlated_features(corr_matrix, MAX_CORRELATION)
# Notebook display: how many pairs were found, plus the first ten of them.
(len(correlation_list), correlation_list[:10])


(54,
 [(('bom negócio_count', 'valorização_count'), 1.0),
  (('lucro_count', 'valorização_count'), 1.0),
  (('lucro_count', 'bom negócio_count'), 1.0),
  (('neutro_count', 'valorização_count'), 1.0),
  (('neutro_count', 'bom negócio_count'), 1.0),
  (('neutro_count', 'lucro_count'), 1.0),
  (('desvalorização_count', 'valorização_count'), 1.0),
  (('desvalorização_count', 'bom negócio_count'), 1.0),
  (('desvalorização_count', 'lucro_count'), 1.0),
  (('desvalorização_count', 'neutro_count'), 1.0)])

In [6]:
# Greedy pruning of the highly correlated pairs: walk correlation_list in
# order and, as long as neither member of a pair has been discarded yet,
# discard the second feature of that pair.
f2drop = []
for (first_feature, second_feature), _ in correlation_list:
    if first_feature in f2drop or second_feature in f2drop:
        continue
    f2drop.append(second_feature)

# Remove the same columns from both frames.
df_train_reduced = df_train.drop(columns=f2drop)
df_test_reduced = df_test.drop(columns=f2drop)

In [7]:
# Notebook display: number of columns left in the pruned training frame.
df_train_reduced.shape[1]

40

In [8]:
from create_train_sets import get_sequences_X_y

In [9]:
# Re-attach the target column to the pruned training frame, then build the
# training sequences from it.
# NOTE(review): presumably get_sequences_X_y reads "hasRise" as the label
# column; confirm in create_train_sets.
df_train_reduced = df_train_reduced.assign(hasRise=coluna_target)
X_train_reduced, y_train_reduced = get_sequences_X_y(df_train_reduced)


In [10]:
# Build the test sequences. Only "Date" and the pruned features were dropped
# from the test frame, so it still carries whatever "hasRise" column the test
# CSV provides.
X_test_reduced, y_test_reduced = get_sequences_X_y(df_test_reduced)

In [11]:
# Notebook display: shapes of the train and test feature sequences.
tuple(sequences.shape for sequences in (X_train_reduced, X_test_reduced))

((370, 7, 40), (115, 7, 40))

## Colocar dropout

In [12]:
from Models.LSTM.LSTM_dropout import LSTMModel_dropout
from Models.GRU.GRU_dropout import GRUModel_dropout


In [13]:
from treino_avaliacao import treinar, avaliar

In [14]:
# Passed as the fourth positional argument to treinar() in the cells below.
# NOTE(review): the recorded log mentions a "study", so this is presumably
# the number of hyper-parameter search trials; confirm in treino_avaliacao.
n_trials = 20
# File-name suffix for the saved model; every treinar() call below prefixes
# it with the stock ticker.
file_to_save_model = "low_corr_features.pt"

In [15]:
# Train the dropout LSTM on the reduced VALE3 sequences, then evaluate the
# returned model against both the train and the test sequences.
# NOTE(review): the GRU cell that follows passes the very same
# 'VALE3'+file_to_save_model name. If treinar() writes the model to that
# path, the GRU run overwrites this LSTM model; confirm.
model = treinar( X_train_reduced, y_train_reduced,LSTMModel_dropout, n_trials, 'VALE3'+file_to_save_model)
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)


[I 2024-07-11 23:03:18,438] A new study created in memory with name: no-name-efac26d5-cc3e-48aa-806b-ac0656403176


In [None]:
# Train the dropout GRU on the reduced VALE3 sequences, then evaluate the
# returned model against both the train and the test sequences.
# NOTE(review): this uses the same 'VALE3'+file_to_save_model name as the
# LSTM cell before it. If treinar() writes the model to that path, the LSTM
# model is overwritten here; confirm.
model = treinar( X_train_reduced, y_train_reduced,GRUModel_dropout, n_trials, 'VALE3'+file_to_save_model)
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)


In [None]:
# PETR4: load the tabular data, prune highly correlated features and build
# the train/test sequences.
# The paths are built with os.path.join so they also resolve outside Windows.
df_train = pd.read_csv(os.path.join("..", "FinalDatasets", "PETR4", "PETR4_tabular_train.csv"))
df_test = pd.read_csv(os.path.join("..", "FinalDatasets", "PETR4", "PETR4_tabular_test.csv"))

# Keep the target aside and drop the date column; the test frame keeps its
# own "hasRise" column.
coluna_target = df_train["hasRise"]
df_train.drop(columns=["Date", "hasRise"], inplace=True)
df_test.drop(columns=["Date"], inplace=True)

# Compute the correlated pairs from the PETR4 training data itself. Reusing
# the correlation_list left over from the VALE3 cells would prune the PETR4
# features according to VALE3's correlations.
corr_matrix = df_train.corr(method='spearman').abs()
correlation_list = get_highly_correlated_features(corr_matrix, MAX_CORRELATION)

# Drop high correlated features in correlation list: for every pair whose
# members are both still kept, discard the second one.
f2drop = []
for feature_pair, _ in correlation_list:
    if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
        f2drop.append(feature_pair[1])

df_train_reduced = df_train.drop(f2drop, axis='columns')
df_test_reduced = df_test.drop(f2drop, axis='columns')

# Re-attach the target before building the sequences.
df_train_reduced["hasRise"] = coluna_target
X_train_reduced, y_train_reduced = get_sequences_X_y(df_train_reduced)
X_test_reduced, y_test_reduced = get_sequences_X_y(df_test_reduced)

In [None]:
# Notebook display: shapes of the PETR4 train and test feature sequences.
tuple(sequences.shape for sequences in (X_train_reduced, X_test_reduced))

((370, 7, 40), (115, 7, 40))

In [None]:
# Train the dropout LSTM on the reduced PETR4 sequences, then evaluate the
# returned model against both the train and the test sequences.
# NOTE(review): the GRU cell that follows passes the very same
# 'PETR4'+file_to_save_model name. If treinar() writes the model to that
# path, the GRU run overwrites this LSTM model; confirm.
model = treinar( X_train_reduced, y_train_reduced,LSTMModel_dropout, n_trials, 'PETR4'+file_to_save_model)
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)


In [None]:
# Train the dropout GRU on the reduced PETR4 sequences, then evaluate the
# returned model against both the train and the test sequences.
# NOTE(review): this uses the same 'PETR4'+file_to_save_model name as the
# LSTM cell before it. If treinar() writes the model to that path, the LSTM
# model is overwritten here; confirm.
model = treinar( X_train_reduced, y_train_reduced,GRUModel_dropout, n_trials, 'PETR4'+file_to_save_model)
avaliar(model, X_train_reduced, y_train_reduced, X_test_reduced, y_test_reduced)
