In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import label_binarize

import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, SpatialDropout1D, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Carregar os dados
df = pd.read_csv("C:/Users/ana_v/OneDrive/Documentos/Mestrado/MachineLearning/TCGA.csv", low_memory=False)

# Obtém os valores únicos na coluna 'Type'
unique_types = df['Type'].unique()

# Cria um dicionário mapeando cada tipo único para um número
type_to_numeric = {type_name: index for index, type_name in enumerate(unique_types)}

# Aplica a substituição usando o método map
df['Type'] = df['Type'].map(type_to_numeric)

# Armazena a coluna 'Type' para adicioná-la de volta posteriormente
type_column = df['Type']

# Prepara o DataFrame para normalização (remover colunas desnecessárias)
df_num = df.drop(columns=["Sample", "Type"])

# Normalizar os dados
scaler = StandardScaler()
dados_normalizados = scaler.fit_transform(df_num)

# Aplicação do PCA
pca = PCA(n_components=0.8)  
pca.fit(dados_normalizados)
dados_pca = pca.transform(dados_normalizados)

In [9]:
# Divisão de treino e teste com a função random_state usada para garantir a reprodutibilidade dos resultados
X_train, X_test, y_train, y_test = train_test_split(dados_pca, df['Type'], test_size=0.2, random_state=42)

# Criação e treinamento do MLP
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

# Predições no conjunto de treino e teste
mlp_predictions_train = mlp_model.predict(X_train)
mlp_predictions_test = mlp_model.predict(X_test)

# Avaliação MLP
mlp_accuracy_train = accuracy_score(y_train, mlp_predictions_train)
mlp_accuracy_test = accuracy_score(y_test, mlp_predictions_test)
mlp_report = classification_report(y_test, mlp_predictions_test, zero_division=1)

# Criar um objeto de validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Substituir a divisão de treino e teste pelo código de validação cruzada
mlp_scores = cross_val_score(mlp_model, dados_pca, df['Type'], cv=cv, scoring='accuracy')

# Exibindo resultados do MLP
print(f'MLP - Acurácia (Treinamento): {mlp_accuracy_train}')
print(f'MLP - Acurácia (Teste): {mlp_accuracy_test}')
print(f'Acurácia média na validação cruzada (MLP): {mlp_scores.mean()}')
print('')
print(f'Classification Report MLP:\n{mlp_report}')

# Defina as classes com base nos seus dados
classes = df['Type'].unique()

# Matriz de Confusão do MLP
# Avaliação para MLP
print("Matriz de Confusão:\n", confusion_matrix(y_test, mlp_predictions_test))


MLP - Acurácia (Treinamento): 1.0
MLP - Acurácia (Teste): 0.732620320855615
Acurácia média na validação cruzada (MLP): 0.7294097460535347

Classification Report MLP:
              precision    recall  f1-score   support

           0       0.82      0.81      0.82        97
           1       0.56      0.63      0.59        38
           2       0.88      0.79      0.84        29
           3       0.56      0.60      0.58        15
           4       0.33      0.25      0.29         8

    accuracy                           0.73       187
   macro avg       0.63      0.62      0.62       187
weighted avg       0.74      0.73      0.73       187

Matriz de Confusão:
 [[79 15  1  2  0]
 [11 24  1  2  0]
 [ 2  1 23  1  2]
 [ 0  3  1  9  2]
 [ 4  0  0  2  2]]


In [10]:
# Supondo que dados_pca e df['Type'] estejam definidos corretamente
unique_types = df['Type'].unique()
num_classes = len(unique_types)

# Convertendo unique_types para strings
target_names = [str(cls) for cls in unique_types]

# Divisão de treino e teste com random_state para reprodutibilidade
X_train, X_test, y_train, y_test = train_test_split(dados_pca, df['Type'], test_size=0.2, random_state=42)

# Expansão das dimensões de X_train e X_test (somente uma vez)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Verificar as dimensões de X_train e X_test
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')

# Função para criar o modelo CNN
def create_cnn_model():
    model = Sequential([
        Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(2),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  # num_classes deve ser o número de classes únicas
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',  # Use sparse_categorical_crossentropy se y_train não estiver one-hot
                  metrics=['accuracy'])
    return model

# Validação cruzada
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = []

for train_index, val_index in kfold.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    model = create_cnn_model()
    history = model.fit(X_fold_train, y_fold_train, epochs=25, batch_size=32, verbose=0, validation_data=(X_fold_val, y_fold_val))

    val_accuracy = history.history['val_accuracy'][-1]
    cross_val_scores.append(val_accuracy)

cross_val_mean = np.mean(cross_val_scores)
cross_val_std = np.std(cross_val_scores)

# Treinamento final da CNN
model = create_cnn_model()
history = model.fit(X_train, y_train, epochs=25, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Avaliação da CNN no conjunto de teste
cnn_loss, cnn_accuracy = model.evaluate(X_test, y_test)

# Exibindo resultados da CNN
train_accuracy = history.history['accuracy'][-1]
val_accuracy = history.history['val_accuracy'][-1]

# Predições da CNN no conjunto de teste
cnn_predictions = model.predict(X_test)
cnn_predictions_classes = np.argmax(cnn_predictions, axis=1)

# Classification report da CNN
cnn_report = classification_report(y_test, cnn_predictions_classes, target_names=target_names, zero_division=1, digits=2)



Shape of X_train: (748, 184, 1)
Shape of X_test: (187, 184, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.3611 - loss: 1.9988 - val_accuracy: 0.5187 - val_loss: 1.2978
Epoch 2/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5244 - loss: 1.2796 - val_accuracy: 0.5882 - val_loss: 1.1272
Epoch 3/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5935 - loss: 1.0781 - val_accuracy: 0.5989 - val_loss: 1.0187
Epoch 4/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6354 - loss: 0.9878 - val_accuracy: 0.6043 - val_loss: 0.9891
Epoch 5/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6503 - loss: 0.9058 - val_accuracy: 0.6203 - val_loss: 0.9548
Epoch 6/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6919 - loss: 0.8334 - val_accuracy: 0.6364 - val_loss: 0.8573
Epoch 7/25
[1m24/24[0m [32m━━━━━━━━━

In [11]:
# Exibindo resultados da CNN
print(f'CNN - Acurácia (Treinamento): {train_accuracy}')
print(f'CNN - Acurácia (Teste): {cnn_accuracy}')
print(f'CNN - Validação Cruzada - Média: {cross_val_mean} - Desvio Padrão: {cross_val_std}')
print("Matriz de Confusão:\n", confusion_matrix(y_test, cnn_predictions_test))

# Classification report da CNN
print(f'Classification Report CNN:\n{cnn_report}')

CNN - Acurácia (Treinamento): 0.8368983864784241
CNN - Acurácia (Teste): 0.7058823704719543
CNN - Validação Cruzada - Média: 0.7393378019332886 - Desvio Padrão: 0.018406364498221115
Matriz de Confusão:
 [[79 15  1  2  0]
 [11 24  1  2  0]
 [ 2  1 23  1  2]
 [ 0  3  1  9  2]
 [ 4  0  0  2  2]]
Classification Report CNN:
              precision    recall  f1-score   support

           0       0.69      0.90      0.78        97
           1       0.58      0.29      0.39        38
           2       0.90      0.93      0.92        29
           3       0.64      0.47      0.54        15
           4       1.00      0.00      0.00         8

    accuracy                           0.71       187
   macro avg       0.76      0.52      0.52       187
weighted avg       0.71      0.71      0.67       187

