In [25]:
# Instalação e importação de Bibliotecas
# pip install pandas numpy matplotlib scikit-learn optuna tensorflow keras seaborn
 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neural_network import MLPClassifier

import optuna

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, SpatialDropout1D, SimpleRNN, Embedding
from keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [2]:
# Pré-processamento dos dados
df = pd.read_csv("C:/Users/ana_v/OneDrive/Documentos/Mestrado/MachineLearning/TCGA.csv", low_memory=False)
unique_types = df['Type'].unique()
type_to_numeric = {type_name: index for index, type_name in enumerate(unique_types)}
df['Type'] = df['Type'].map(type_to_numeric)
type_column = df['Type']
df_num = df.drop(columns=["Sample", "Type"])

# Normalização dos dados
scaler = StandardScaler()
dados_normalizados = scaler.fit_transform(df_num)

# Aplicação do PCA
pca = PCA(n_components=0.8)  
pca.fit(dados_normalizados)
dados_pca = pca.transform(dados_normalizados)

In [18]:
# Fixar a seed para garantir a reprodutibilidade
seed = 1

# Divisão de treino e teste com random_state
X_train, X_test, y_train, y_test = train_test_split(dados_pca, df['Type'], test_size=0.2, random_state=seed)

# Função objetivo para otimização do MLP
def objective_mlp(trial):
    # Sugestão do número de camadas ocultas (entre 1 e 6 camadas)
    n_layers = trial.suggest_int('n_layers', 2, 6)
    
    # Sugestão do número de neurônios em cada camada
    hidden_layer_sizes = [trial.suggest_int(f'n_units_l{i}', 25, 150) for i in range(n_layers)]
    
    # Outros hiperparâmetros
    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic'])
    solver = trial.suggest_categorical('solver', ['adam', 'sgd'])
    alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
    learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-4, 1e-1)
    
    # Criar e treinar o modelo
    model = MLPClassifier(hidden_layer_sizes=tuple(hidden_layer_sizes), activation=activation,
                          solver=solver, alpha=alpha, learning_rate_init=learning_rate_init,
                          max_iter=500, random_state=seed)
    
    # Validação cruzada com random_state para reprodutibilidade
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    score = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy').mean()
    return score

# Estudo de otimização com Optuna para MLP com seed fixa
study_mlp = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=seed))
study_mlp.optimize(objective_mlp, n_trials=10)

# Melhores parâmetros
best_params_mlp = study_mlp.best_params
print("Melhores parâmetros para MLP:", best_params_mlp)

# Treinamento e avaliação do MLP com os melhores parâmetros
best_mlp_model = MLPClassifier(hidden_layer_sizes=tuple(best_params_mlp[f'n_units_l{i}'] for i in range(best_params_mlp['n_layers'])),
                               activation=best_params_mlp['activation'],
                               solver=best_params_mlp['solver'],
                               alpha=best_params_mlp['alpha'],
                               learning_rate_init=best_params_mlp['learning_rate_init'],
                               max_iter=500, random_state=seed)
best_mlp_model.fit(X_train, y_train)
mlp_predictions_train = best_mlp_model.predict(X_train)
mlp_predictions_test = best_mlp_model.predict(X_test)

# Avaliação do modelo
mlp_accuracy_train = accuracy_score(y_train, mlp_predictions_train)
mlp_accuracy_test = accuracy_score(y_test, mlp_predictions_test)
mlp_report = classification_report(y_test, mlp_predictions_test, zero_division=1)

[I 2024-08-26 15:32:25,764] A new study created in memory with name: no-name-b1a1a781-eab3-4985-b90e-e4ecdacefc6b
  alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
  learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-4, 1e-1)
[I 2024-08-26 15:32:28,977] Trial 0 finished with value: 0.5215391498881432 and parameters: {'n_layers': 4, 'n_units_l0': 115, 'n_units_l1': 25, 'n_units_l2': 63, 'n_units_l3': 43, 'activation': 'logistic', 'solver': 'sgd', 'alpha': 0.00047509237210306113, 'learning_rate_init': 0.011367330868956235}. Best is trial 0 with value: 0.5215391498881432.
  alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
  learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-4, 1e-1)
[I 2024-08-26 15:32:49,279] Trial 1 finished with value: 0.7486800894854586 and parameters: {'n_layers': 3, 'n_units_l0': 135, 'n_units_l1': 28, 'n_units_l2': 109, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0746528346269155, 'learning_rate_init': 0.0008

Melhores parâmetros para MLP: {'n_layers': 3, 'n_units_l0': 138, 'n_units_l1': 97, 'n_units_l2': 25, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.04306660168805714, 'learning_rate_init': 0.007414474013266335}


In [23]:
# Avaliação para MLP
print("\nResultados do MLP:")
print("Matriz de Confusão:\n", confusion_matrix(y_test, mlp_predictions_test))
print("Acurácia (Treinamento):", accuracy_score(y_train, mlp_predictions_train))
print("Acurácia (Teste):", accuracy_score(y_test, mlp_predictions_test))

# Acurácia média na validação cruzada
cv_mean_score = cross_val_score(best_mlp_model, X_train, y_train, cv=10, scoring='accuracy').mean()
print(f'Acurácia média na validação cruzada (MLP): {cv_mean_score:.4f}')

# Classification Report
print(f'\nClassification Report (MLP):\n{mlp_report}')


Resultados do MLP:
Matriz de Confusão:
 [[82  4  0  2  2]
 [13 26  0  2  0]
 [ 0  1 27  1  2]
 [ 0  7  2 11  1]
 [ 2  0  0  0  2]]
Acurácia (Treinamento): 1.0
Acurácia (Teste): 0.7914438502673797
Acurácia média na validação cruzada (MLP): 0.7794

Classification Report (MLP):
              precision    recall  f1-score   support

           0       0.85      0.91      0.88        90
           1       0.68      0.63      0.66        41
           2       0.93      0.87      0.90        31
           3       0.69      0.52      0.59        21
           4       0.29      0.50      0.36         4

    accuracy                           0.79       187
   macro avg       0.69      0.69      0.68       187
weighted avg       0.79      0.79      0.79       187



In [21]:
# Definir número de classes
num_classes = len(unique_types)

# Função para criar o modelo CNN com flexibilidade no número de camadas
def create_cnn_model(n_conv_layers=2, n_dense_layers=1, filters=32, kernel_size=3, pool_size=2, dense_units=64, dropout_rate=0.5):
    model = Sequential()

    # Adicionar camadas convolucionais conforme definido por Optuna
    for i in range(n_conv_layers):
        if i == 0:
            # Primeira camada convolucional, precisa definir o input_shape
            model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', padding='same', input_shape=(X_train.shape[1], 1)))
        else:
            model.add(Conv1D(filters=filters * (2 ** i), kernel_size=kernel_size, activation='relu', padding='same'))
        model.add(MaxPooling1D(pool_size=pool_size))

    model.add(Flatten())

    # Adicionar camadas densas conforme definido por Optuna
    for _ in range(n_dense_layers):
        model.add(Dense(dense_units, activation='relu'))
        model.add(Dropout(dropout_rate))
    
    # Camada de saída
    model.add(Dense(num_classes, activation='softmax'))
    
    model.compile(optimizer=Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Função objetivo para otimização do CNN com Optuna
def objective_cnn(trial):
    # Hiperparâmetros para CNN com intervalos mais restritos
    n_conv_layers = trial.suggest_int('n_conv_layers', 1, 4)  # Número de camadas convolucionais
    n_dense_layers = trial.suggest_int('n_dense_layers', 1, 3)  # Número de camadas densas
    filters = trial.suggest_int('filters', 16, 64)
    kernel_size = trial.suggest_int('kernel_size', 2, 5)  # Intervalo ajustado
    pool_size = trial.suggest_int('pool_size', 2, 3)  # Intervalo ajustado
    dense_units = trial.suggest_int('dense_units', 32, 128)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
    epochs = trial.suggest_int('epochs', 5, 20)  # Número de épocas para otimização

    # Criação do modelo
    model = create_cnn_model(n_conv_layers=n_conv_layers,
                             n_dense_layers=n_dense_layers,
                             filters=filters,
                             kernel_size=kernel_size,
                             pool_size=pool_size,
                             dense_units=dense_units,
                             dropout_rate=dropout_rate)

    # Validação cruzada
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    cv_scores = []
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(X_fold_train, y_fold_train, epochs=epochs, batch_size=32, verbose=0)
        val_loss, val_accuracy = model.evaluate(X_fold_val, y_fold_val, verbose=0)
        cv_scores.append(val_accuracy)
    
    # Média da acurácia de validação cruzada
    mean_cv_accuracy = np.mean(cv_scores)

    return mean_cv_accuracy

# Estudo de otimização com Optuna para CNN
study_cnn = optuna.create_study(direction='maximize')
study_cnn.optimize(objective_cnn, n_trials=10)

# Melhores parâmetros
best_params_cnn = study_cnn.best_params
print("Melhores parâmetros para CNN:", best_params_cnn)

# Treinamento e avaliação do CNN com melhores parâmetros
best_cnn_model = create_cnn_model(n_conv_layers=best_params_cnn['n_conv_layers'],
                                  n_dense_layers=best_params_cnn['n_dense_layers'],
                                  filters=best_params_cnn['filters'],
                                  kernel_size=best_params_cnn['kernel_size'],
                                  pool_size=best_params_cnn['pool_size'],
                                  dense_units=best_params_cnn['dense_units'],
                                  dropout_rate=best_params_cnn['dropout_rate'])

# Treinamento do modelo com o melhor número de épocas
history = best_cnn_model.fit(X_train, y_train, epochs=best_params_cnn['epochs'], batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Avaliação da CNN no conjunto de teste
cnn_loss, cnn_accuracy = best_cnn_model.evaluate(X_test, y_test)

# Predições da CNN no conjunto de teste
cnn_predictions = best_cnn_model.predict(X_test)
cnn_predictions_classes = np.argmax(cnn_predictions, axis=1)

# Classification report da CNN
cnn_report = classification_report(y_test, cnn_predictions_classes, target_names=[str(cls) for cls in unique_types], zero_division=1, digits=2)

# Matriz de confusão
conf_matrix = confusion_matrix(y_test, cnn_predictions_classes)

# Acurácia de treinamento
train_accuracy = history.history['accuracy'][-1]

# Acurácia da validação cruzada 
cv_mean_accuracy = study_cnn.best_value

[I 2024-08-26 15:35:56,589] A new study created in memory with name: no-name-7dfaa4b7-a6ea-40ed-9f1d-310d03eb3c7c
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
[I 2024-08-26 15:36:03,848] Trial 0 finished with value: 0.8158389210700989 and parameters: {'n_conv_layers': 2, 'n_dense_layers': 3, 'filters': 36, 'kernel_size': 2, 'pool_size': 2, 'dense_units': 111, 'dropout_rate': 0.29756163591894164, 'epochs': 5}. Best is trial 0 with value: 0.8158389210700989.
[I 2024-08-26 15:36:16,926] Trial 1 finished with value: 0.9053333282470704 and parameters: {'n_conv_layers': 2, 'n_dense_layers': 2, 'filters': 39, 'kernel_size': 4, 'pool_size': 2, 'dense_units': 112, 'dropout_rate': 0.29788533249844734, 'epochs': 13}. Best is trial 1 with value: 0.9053333282470704.
[I 2024-08-26 15:36:25,960] Trial 2 finished with value: 0.8318657755851746 and parameters: {'n_conv_layers': 4, 'n_dense_layers': 2, 'filters':

Melhores parâmetros para CNN: {'n_conv_layers': 2, 'n_dense_layers': 1, 'filters': 55, 'kernel_size': 4, 'pool_size': 2, 'dense_units': 128, 'dropout_rate': 0.4536719286586085, 'epochs': 20}
Epoch 1/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.3669 - loss: 2.5761 - val_accuracy: 0.4813 - val_loss: 1.2751
Epoch 2/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6033 - loss: 1.1094 - val_accuracy: 0.6738 - val_loss: 0.9236
Epoch 3/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7446 - loss: 0.7788 - val_accuracy: 0.6471 - val_loss: 0.8675
Epoch 4/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7837 - loss: 0.6707 - val_accuracy: 0.7059 - val_loss: 0.7503
Epoch 5/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8473 - loss: 0.4418 - val_accuracy: 0.7326 - val_loss: 0.6982
Epoc

In [22]:
# Exibindo resultados da CNN
print(f'\nCNN - Acurácia (Treinamento): {train_accuracy:.4f}')
print(f'CNN - Acurácia (Teste): {cnn_accuracy:.4f}')
print(f'Acurácia média na validação cruzada (CNN): {cv_mean_accuracy:.4f}')
print(f'\nMatriz de Confusão (CNN):\n{conf_matrix}')
print(f'\nClassification Report (CNN):\n{cnn_report}')


CNN - Acurácia (Treinamento): 0.9799
CNN - Acurácia (Teste): 0.7594
Acurácia média na validação cruzada (CNN): 0.9253

Matriz de Confusão (CNN):
[[82  5  1  2  0]
 [17 21  1  2  0]
 [ 2  0 29  0  0]
 [ 5  5  2  9  0]
 [ 3  0  0  0  1]]

Classification Report (CNN):
              precision    recall  f1-score   support

        LumA       0.75      0.91      0.82        90
        LumB       0.68      0.51      0.58        41
       Basal       0.88      0.94      0.91        31
        Her2       0.69      0.43      0.53        21
      Normal       1.00      0.25      0.40         4

    accuracy                           0.76       187
   macro avg       0.80      0.61      0.65       187
weighted avg       0.76      0.76      0.74       187



In [30]:
# Definir unique_types e target_names
unique_types = df['Type'].unique()
target_names = [str(cls) for cls in unique_types]

# Matrizes de confusão
conf_matrix_cnn = confusion_matrix(y_test, cnn_predictions_classes)
conf_matrix_mlp = confusion_matrix(y_test, mlp_predictions_test)

# Função para plotar e salvar a matriz de confusão
def plot_confusion_matrix(conf_matrix, title, filename):
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys', 
                xticklabels=target_names, yticklabels=target_names, 
                annot_kws={"size": 16})  
    plt.title(title, fontsize=14)
    plt.xlabel('Predicted Label', fontsize=14)
    plt.ylabel('True Label', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.savefig(filename)
    plt.close()

# Plotar e salvar a matriz de confusão para CNN
plot_confusion_matrix(conf_matrix_cnn, 'Confusion Matrix - CNN', 'confusion_matrix_cnn_op.png')

# Plotar e salvar a matriz de confusão para MLP
plot_confusion_matrix(conf_matrix_mlp, 'Confusion Matrix - MLP', 'confusion_matrix_mlp_op.png')