In [8]:
# pip install pandas numpy matplotlib scikit-learn optuna tensorflow keras seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, learning_curve

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, SpatialDropout1D, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical


In [9]:
# Carregar os dados
df = pd.read_csv("C:/Users/ana_v/OneDrive/Documentos/Mestrado/MachineLearning/TCGA.csv", low_memory=False)

# Obtém os valores únicos na coluna 'Type'
unique_types = df['Type'].unique()

# Cria um dicionário mapeando cada tipo único para um número
type_to_numeric = {type_name: index for index, type_name in enumerate(unique_types)}

# Aplica a substituição usando o método map
df['Type'] = df['Type'].map(type_to_numeric)

# Armazena a coluna 'Type' para adicioná-la de volta posteriormente
type_column = df['Type']

# Prepara o DataFrame para normalização (remover colunas desnecessárias)
df_num = df.drop(columns=["Sample", "Type"])

# Normalizar os dados
scaler = StandardScaler()
dados_normalizados = scaler.fit_transform(df_num)

# Aplicação do PCA
pca = PCA(n_components=0.8)  
pca.fit(dados_normalizados)
dados_pca = pca.transform(dados_normalizados)

In [32]:
# Fixar a seed para garantir a reprodutibilidade
seed = 1

# Divisão de treino e teste com random_state para garantir a reprodutibilidade dos resultados
X_train, X_test, y_train, y_test = train_test_split(dados_pca, df['Type'], test_size=0.2, random_state=seed)

# Criação e treinamento do MLP com random_state fixo
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 75, 50, 25), max_iter=500, random_state=seed)
mlp_model.fit(X_train, y_train)

# Predições no conjunto de treino e teste
mlp_predictions_train = mlp_model.predict(X_train)
mlp_predictions_test = mlp_model.predict(X_test)

# Avaliação do MLP
mlp_accuracy_train = accuracy_score(y_train, mlp_predictions_train)
mlp_accuracy_test = accuracy_score(y_test, mlp_predictions_test)
mlp_report = classification_report(y_test, mlp_predictions_test, zero_division=1)

# Criar um objeto de validação cruzada com random_state fixo
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Executar a validação cruzada com random_state fixo
mlp_scores = cross_val_score(mlp_model, dados_pca, df['Type'], cv=cv, scoring='accuracy')

# Exibindo resultados do MLP
print(f'MLP - Acurácia (Treinamento): {mlp_accuracy_train}')
print(f'MLP - Acurácia (Teste): {mlp_accuracy_test}')
print(f'Acurácia média na validação cruzada (MLP): {mlp_scores.mean()}')
print('')
print(f'Classification Report MLP:\n{mlp_report}')

# Definir as classes com base nos seus dados
classes = df['Type'].unique()

# Matriz de Confusão do MLP
conf_matrix_mlp = confusion_matrix(y_test, mlp_predictions_test)
print("Matriz de Confusão:\n", conf_matrix_mlp)

MLP - Acurácia (Treinamento): 1.0
MLP - Acurácia (Teste): 0.7272727272727273
Acurácia média na validação cruzada (MLP): 0.7497712194005948

Classification Report MLP:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83        90
           1       0.64      0.51      0.57        41
           2       0.75      0.77      0.76        31
           3       0.62      0.38      0.47        21
           4       0.50      0.75      0.60         4

    accuracy                           0.73       187
   macro avg       0.66      0.66      0.65       187
weighted avg       0.72      0.73      0.72       187

Matriz de Confusão:
 [[80  6  2  2  0]
 [14 21  2  3  1]
 [ 3  3 24  0  1]
 [ 5  3  4  8  1]
 [ 1  0  0  0  3]]


In [33]:
# Fixar a seed para garantir a reprodutibilidade
np.random.seed(seed)
tf.random.set_seed(seed)

# Supondo que dados_pca e df['Type'] estejam definidos corretamente
unique_types = df['Type'].unique()
num_classes = len(unique_types)

# Convertendo unique_types para strings
target_names = [str(cls) for cls in unique_types]

# Expansão das dimensões de X_train e X_test (somente uma vez)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Verificar as dimensões de X_train e X_test
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')

# Função para criar o modelo CNN
def create_cnn_model():
    model = Sequential([
        Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(2),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  # num_classes deve ser o número de classes únicas
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',  # Use sparse_categorical_crossentropy se y_train não estiver one-hot
                  metrics=['accuracy'])
    return model

# Validação cruzada
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
cross_val_scores = []

for train_index, val_index in kfold.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Criar o modelo para cada dobra
    model = create_cnn_model()
    history = model.fit(X_fold_train, y_fold_train, epochs=25, batch_size=32, verbose=0, validation_data=(X_fold_val, y_fold_val))

    # Capturar a última acurácia de validação
    val_accuracy = history.history['val_accuracy'][-1]
    cross_val_scores.append(val_accuracy)

# Calcular média e desvio padrão da validação cruzada
cross_val_mean = np.mean(cross_val_scores)
cross_val_std = np.std(cross_val_scores)

# Treinamento final da CNN
model = create_cnn_model()
history = model.fit(X_train, y_train, epochs=25, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Avaliação da CNN no conjunto de teste
cnn_loss, cnn_accuracy = model.evaluate(X_test, y_test)

# Exibindo resultados da CNN
train_accuracy = history.history['accuracy'][-1]
val_accuracy = history.history['val_accuracy'][-1]

# Predições da CNN no conjunto de teste
cnn_predictions = model.predict(X_test)
cnn_predictions_classes = np.argmax(cnn_predictions, axis=1)


Shape of X_train: (748, 184, 1)
Shape of X_test: (187, 184, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 53ms/step - accuracy: 0.3768 - loss: 1.9427 - val_accuracy: 0.5455 - val_loss: 1.2962
Epoch 2/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5406 - loss: 1.2670 - val_accuracy: 0.5722 - val_loss: 1.1338
Epoch 3/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6030 - loss: 1.0461 - val_accuracy: 0.6043 - val_loss: 0.9570
Epoch 4/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.6571 - loss: 0.8874 - val_accuracy: 0.6898 - val_loss: 0.9002
Epoch 5/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.6960 - loss: 0.7951 - val_accuracy: 0.6845 - val_loss: 0.8128
Epoch 6/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7309 - loss: 0.6836 - val_accuracy: 0.6791 - val_loss: 0.8414
Epoch 7/25
[1m24/24[0m [32m━━━━━

In [34]:
# Exibindo resultados da CNN
print(f'CNN - Acurácia (Treinamento): {train_accuracy}')
print(f'CNN - Acurácia (Teste): {cnn_accuracy}')
print(f'CNN - Validação Cruzada - Média: {cross_val_mean} - Desvio Padrão: {cross_val_std}')
print("Matriz de Confusão:\n", confusion_matrix(y_test, cnn_predictions_classes))

# Classification report da CNN
print(f'Classification Report CNN:\n{cnn_report}')

CNN - Acurácia (Treinamento): 0.9237967729568481
CNN - Acurácia (Teste): 0.6951871514320374
CNN - Validação Cruzada - Média: 0.7046174526214599 - Desvio Padrão: 0.03775987367344366
Matriz de Confusão:
 [[84  4  1  1  0]
 [26 13  0  2  0]
 [ 6  1 24  0  0]
 [ 8  2  2  9  0]
 [ 3  0  1  0  0]]
Classification Report CNN:
              precision    recall  f1-score   support

           0       0.73      0.76      0.74        97
           1       0.37      0.47      0.41        38
           2       0.83      0.83      0.83        29
           3       0.83      0.33      0.48        15
           4       0.00      0.00      0.00         8

    accuracy                           0.65       187
   macro avg       0.55      0.48      0.49       187
weighted avg       0.65      0.65      0.64       187



In [38]:
# Função para plotar e salvar a matriz de confusão com letras maiores
def plot_confusion_matrix(conf_matrix, title, filename):
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys', 
                xticklabels=target_names, yticklabels=target_names, 
                annot_kws={"size": 16})  
    plt.title(title, fontsize=14)
    plt.xlabel('Predicted Label', fontsize=14)
    plt.ylabel('True Label', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.savefig(filename)
    plt.close()

# Plotar e salvar a matriz de confusão para CNN
plot_confusion_matrix(conf_matrix_cnn, 'Confusion Matrix - CNN', 'confusion_matrix_cnn.png')

# Plotar e salvar a matriz de confusão para MLP
plot_confusion_matrix(conf_matrix_mlp, 'Confusion Matrix - MLP', 'confusion_matrix_mlp.png')
