In [15]:
# Instalação e importação de Bibliotecas
# pip install pandas numpy matplotlib scikit-learn optuna tensorflow keras
 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neural_network import MLPClassifier

import optuna

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, SpatialDropout1D, SimpleRNN, Embedding
from keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [4]:
# Pré-processamento dos dados
df = pd.read_csv("C:/Users/ana_v/OneDrive/Documentos/Mestrado/MachineLearning/TCGA.csv", low_memory=False)
unique_types = df['Type'].unique()
type_to_numeric = {type_name: index for index, type_name in enumerate(unique_types)}
df['Type'] = df['Type'].map(type_to_numeric)
type_column = df['Type']
df_num = df.drop(columns=["Sample", "Type"])

# Normalização dos dados
scaler = StandardScaler()
dados_normalizados = scaler.fit_transform(df_num)

# Aplicação do PCA
pca = PCA(n_components=0.8)  
pca.fit(dados_normalizados)
dados_pca = pca.transform(dados_normalizados)

In [5]:
# Divisão de treino e teste
X_train, X_test, y_train, y_test = train_test_split(dados_pca, df['Type'], test_size=0.2, random_state=42)

# Função objetivo para otimização do MLP
def objective_mlp(trial):
    # Hiperparâmetros para MLP
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (100, 50), (50, 25)])
    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic'])
    solver = trial.suggest_categorical('solver', ['adam', 'sgd'])
    alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
    learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-4, 1e-1)
    
    # Criar e treinar o modelo
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                          solver=solver, alpha=alpha, learning_rate_init=learning_rate_init,
                          max_iter=500, random_state=42)
    
    # Validação cruzada
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    return score

# Estudar otimização com Optuna para MLP
study_mlp = optuna.create_study(direction='maximize')
study_mlp.optimize(objective_mlp, n_trials=10)

# Melhores parâmetros
best_params_mlp = study_mlp.best_params
print("Melhores parâmetros para MLP:", best_params_mlp)

# Treinamento e avaliação do MLP com melhores parâmetros
best_mlp_model = MLPClassifier(hidden_layer_sizes=best_params_mlp['hidden_layer_sizes'],
                               activation=best_params_mlp['activation'],
                               solver=best_params_mlp['solver'],
                               alpha=best_params_mlp['alpha'],
                               learning_rate_init=best_params_mlp['learning_rate_init'],
                               max_iter=500, random_state=42)
best_mlp_model.fit(X_train, y_train)
mlp_predictions_train = best_mlp_model.predict(X_train)
mlp_predictions_test = best_mlp_model.predict(X_test)

[I 2024-08-22 16:51:34,667] A new study created in memory with name: no-name-1e761759-ba2a-4214-a303-6246e44637a6
  alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
  learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-4, 1e-1)
[I 2024-08-22 16:51:43,635] Trial 0 finished with value: 0.743275167785235 and parameters: {'hidden_layer_sizes': (50, 25), 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0002053104838395193, 'learning_rate_init': 0.0002831966545366896}. Best is trial 0 with value: 0.743275167785235.
  alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
  learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-4, 1e-1)
[I 2024-08-22 16:51:45,823] Trial 1 finished with value: 0.7553020134228189 and parameters: {'hidden_layer_sizes': (50,), 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0582191049554753, 'learning_rate_init': 0.005880474526406358}. Best is trial 1 with value: 0.7553020134228189.
  alpha = trial.suggest_loguniform('al

Melhores parâmetros para MLP: {'hidden_layer_sizes': (50, 25), 'activation': 'logistic', 'solver': 'sgd', 'alpha': 0.019463631700611245, 'learning_rate_init': 0.008053656710050518}




In [6]:
# Avaliação para MLP
print("\nResultados do MLP:")
print("Matriz de Confusão:\n", confusion_matrix(y_test, mlp_predictions_test))
print("Acurácia (Treinamento):", accuracy_score(y_train, mlp_predictions_train))
print("Acurácia (Teste):", accuracy_score(y_test, mlp_predictions_test))

# Acurácia média na validação cruzada
cv_mean_score = cross_val_score(best_mlp_model, X_train, y_train, cv=10, scoring='accuracy').mean()
print(f'Acurácia média na validação cruzada (MLP): {cv_mean_score:.4f}')

# Classification Report
mlp_report = classification_report(y_test, mlp_predictions_test, zero_division=1, target_names=[str(cls) for cls in unique_types])
print(f'\nClassification Report (MLP):\n{mlp_report}')


Resultados do MLP:
Matriz de Confusão:
 [[76 14  1  4  2]
 [10 25  1  2  0]
 [ 1  1 26  0  1]
 [ 0  3  3  9  0]
 [ 5  0  1  0  2]]
Acurácia (Treinamento): 0.9933155080213903
Acurácia (Teste): 0.7379679144385026




Acurácia média na validação cruzada (MLP): 0.7955

Classification Report (MLP):
              precision    recall  f1-score   support

        LumA       0.83      0.78      0.80        97
        LumB       0.58      0.66      0.62        38
       Basal       0.81      0.90      0.85        29
        Her2       0.60      0.60      0.60        15
      Normal       0.40      0.25      0.31         8

    accuracy                           0.74       187
   macro avg       0.64      0.64      0.64       187
weighted avg       0.74      0.74      0.74       187





In [7]:
# Definir número de classes
num_classes = len(unique_types) 

# Função para criar o modelo CNN
def create_cnn_model(filters=32, kernel_size=3, pool_size=2, dense_units=64, dropout_rate=0.5):
    model = Sequential([
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=pool_size),
        Conv1D(filters=filters * 2, kernel_size=kernel_size, activation='relu'),
        MaxPooling1D(pool_size=pool_size),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Função objetivo para otimização do CNN
def objective_cnn(trial):
    # Hiperparâmetros para CNN
    filters = trial.suggest_int('filters', 16, 64)
    kernel_size = trial.suggest_int('kernel_size', 2, 5)
    pool_size = trial.suggest_int('pool_size', 2, 4)
    dense_units = trial.suggest_int('dense_units', 32, 128)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)

    # Criação do modelo
    model = create_cnn_model(filters=filters,
                             kernel_size=kernel_size,
                             pool_size=pool_size,
                             dense_units=dense_units,
                             dropout_rate=dropout_rate)

    # Validação cruzada
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(X_fold_train, y_fold_train, epochs=10, batch_size=32, verbose=0)
        val_loss, val_accuracy = model.evaluate(X_fold_val, y_fold_val, verbose=0)
        cv_scores.append(val_accuracy)
    
    # Média da acurácia de validação cruzada
    mean_cv_accuracy = np.mean(cv_scores)

    return mean_cv_accuracy

# Estudar otimização com Optuna para CNN
study_cnn = optuna.create_study(direction='maximize')
study_cnn.optimize(objective_cnn, n_trials=10)

# Melhores parâmetros
best_params_cnn = study_cnn.best_params
print("Melhores parâmetros para CNN:", best_params_cnn)

# Treinamento e avaliação do CNN com melhores parâmetros
best_cnn_model = create_cnn_model(filters=best_params_cnn['filters'],
                                  kernel_size=best_params_cnn['kernel_size'],
                                  pool_size=best_params_cnn['pool_size'],
                                  dense_units=best_params_cnn['dense_units'],
                                  dropout_rate=best_params_cnn['dropout_rate'])

# Treinamento do modelo
history = best_cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Avaliação da CNN no conjunto de teste
cnn_loss, cnn_accuracy = best_cnn_model.evaluate(X_test, y_test)

# Predições da CNN no conjunto de teste
cnn_predictions = best_cnn_model.predict(X_test)
cnn_predictions_classes = np.argmax(cnn_predictions, axis=1)

# Classification report da CNN
cnn_report = classification_report(y_test, cnn_predictions_classes, target_names=[str(cls) for cls in unique_types], zero_division=1, digits=2)

# Matriz de confusão
conf_matrix = confusion_matrix(y_test, cnn_predictions_classes)

# Acurácia de treinamento
train_accuracy = history.history['accuracy'][-1]

# Acurácia da validação cruzada (a média das acurácias obtidas durante a otimização)
cv_mean_accuracy = study_cnn.best_value


[I 2024-08-22 16:54:17,853] A new study created in memory with name: no-name-b4a8f953-8e41-48b2-9118-aded4f6bcd23
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
[I 2024-08-22 16:54:26,810] Trial 0 finished with value: 0.8559194684028626 and parameters: {'filters': 57, 'kernel_size': 5, 'pool_size': 4, 'dense_units': 102, 'dropout_rate': 0.39419344373473975}. Best is trial 0 with value: 0.8559194684028626.
[I 2024-08-22 16:54:35,740] Trial 1 finished with value: 0.9013333320617676 and parameters: {'filters': 52, 'kernel_size': 3, 'pool_size': 3, 'dense_units': 111, 'dropout_rate': 0.22879006961789408}. Best is trial 1 with value: 0.9013333320617676.
[I 2024-08-22 16:54:48,055] Trial 2 finished with value: 0.9199731469154357 and parameters: {'filters': 64, 'kernel_size': 4, 'pool_size': 2, 'dense_units': 105, 'dropout_rate': 0.23479520196155776}. Best is trial 2 with value: 0.9199731469154357.
[I 20

Melhores parâmetros para CNN: {'filters': 64, 'kernel_size': 4, 'pool_size': 2, 'dense_units': 105, 'dropout_rate': 0.23479520196155776}
Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.3782 - loss: 2.0233 - val_accuracy: 0.5882 - val_loss: 1.2307
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6626 - loss: 1.1116 - val_accuracy: 0.6524 - val_loss: 1.0106
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7455 - loss: 0.7337 - val_accuracy: 0.6952 - val_loss: 0.9197
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7642 - loss: 0.6200 - val_accuracy: 0.7059 - val_loss: 0.8399
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8387 - loss: 0.4665 - val_accuracy: 0.6524 - val_loss: 0.8580
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [8]:
# Exibindo resultados da CNN
print(f'\nCNN - Acurácia (Treinamento): {train_accuracy:.4f}')
print(f'CNN - Acurácia (Teste): {cnn_accuracy:.4f}')
print(f'Acurácia média na validação cruzada (CNN): {cv_mean_accuracy:.4f}')
print(f'\nMatriz de Confusão (CNN):\n{conf_matrix}')
print(f'\nClassification Report (CNN):\n{cnn_report}')


CNN - Acurácia (Treinamento): 0.9532
CNN - Acurácia (Teste): 0.6578
Acurácia média na validação cruzada (CNN): 0.9200

Matriz de Confusão (CNN):
[[78 13  1  5  0]
 [21 14  1  2  0]
 [ 2  2 24  1  0]
 [ 4  2  2  7  0]
 [ 5  2  0  1  0]]

Classification Report (CNN):
              precision    recall  f1-score   support

        LumA       0.71      0.80      0.75        97
        LumB       0.42      0.37      0.39        38
       Basal       0.86      0.83      0.84        29
        Her2       0.44      0.47      0.45        15
      Normal       1.00      0.00      0.00         8

    accuracy                           0.66       187
   macro avg       0.69      0.49      0.49       187
weighted avg       0.66      0.66      0.64       187

