In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.init as init

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# CSV file holding the dataset (the /content/drive prefix is presumably a Colab Drive mount).
df_knn_path = '/content/drive/MyDrive/Colab Notebooks/df_knn.csv'
df_knn = pd.read_csv(df_knn_path)

In [None]:
# The 'CVD0010' column is the label; every other column is a feature.
y = df_knn['CVD0010']
X = df_knn.drop(columns='CVD0010')

In [None]:
# normalize X_train, X_test
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
# split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Training hyperparameters: number of passes over the data and mini-batch size.
n_epochs, batch_size = 1000, 128
# Number of feature columns in the training table. It is used as the default
# `data_dim` of the Discriminator; the generator noise width is a separate,
# hard-coded 128 in the cells that sample noise.
latent_dim = len(X_train.columns)

In [None]:
# generator
class Generator(nn.Module):
    """1-D transposed-convolution generator.

    Takes a noise batch of shape ``(batch, L)`` and returns a tensor of shape
    ``(batch, 4 * L - 3)``: each of the two stride-2 layers maps a length
    ``n`` to ``2 * n - 1`` and the four stride-1 layers keep the length.
    With the 128-wide noise sampled in this notebook the output has 509
    values per sample.
    """

    # Negative slope shared by every LeakyReLU; also passed to the Kaiming init.
    LEAKY_SLOPE = 0.2

    def __init__(self):
        super(Generator, self).__init__()

        # ConvTranspose1d arguments: (in_ch, out_ch, kernel_size, stride, padding)
        self.gen = nn.Sequential(
            nn.ConvTranspose1d(1, 32, 15, 2, 7),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(self.LEAKY_SLOPE, inplace=True),

            nn.ConvTranspose1d(32, 64, 15, 2, 7),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(self.LEAKY_SLOPE, inplace=True),

            nn.ConvTranspose1d(64, 128, 15, 1, 7),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(self.LEAKY_SLOPE, inplace=True),

            nn.ConvTranspose1d(128, 64, 15, 1, 7),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(self.LEAKY_SLOPE, inplace=True),

            nn.ConvTranspose1d(64, 32, 15, 1, 7),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(self.LEAKY_SLOPE, inplace=True),

            # Final projection back to a single channel, no activation.
            nn.ConvTranspose1d(32, 1, 15, 1, 7),
        )

        self.init_weights()

    def init_weights(self):
        """Apply Kaiming-uniform init to conv/linear weights and zero their biases.

        The previous version only matched ``nn.Linear``; this network contains
        no linear layers, so the initialization silently never ran. Transposed
        and regular 1-D convolutions are now covered as well.
        """
        for m in self.modules():
            if isinstance(m, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
                init.kaiming_uniform_(m.weight, a=self.LEAKY_SLOPE, nonlinearity='leaky_relu')
                if m.bias is not None:
                    init.zeros_(m.bias)

    def forward(self, noise):
        """Generate samples from ``noise`` of shape ``(batch, L)``."""
        # Insert the single channel axis the convolutions expect, drop it afterwards.
        noise = noise.unsqueeze(1)
        return self.gen(noise).squeeze(1)

In [None]:
# discriminator
class Discriminator(nn.Module):
    """1-D convolutional discriminator.

    Takes samples of shape ``(batch, data_dim)`` and returns one sigmoid
    probability per sample, shaped ``(batch, 1)``.

    Args:
        data_dim: number of values per input sample. It determines the input
            width of the final linear layer; for 509..512 this is 128, the
            value that used to be hard-coded.
    """

    def __init__(self, data_dim=latent_dim):
        super(Discriminator, self).__init__()

        # Each stride-2 convolution (kernel 15, padding 7) maps a length n to
        # (n - 1) // 2 + 1; the stride-1 convolutions keep the length unchanged.
        conv_out_len = data_dim
        for _ in range(2):
            conv_out_len = (conv_out_len - 1) // 2 + 1

        # Conv1d arguments: (in_ch, out_ch, kernel_size, stride, padding)
        self.disc = nn.Sequential(
            nn.Conv1d(1, 32, 15, 2, 7),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv1d(32, 64, 15, 2, 7),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv1d(64, 32, 15, 1, 7),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv1d(32, 1, 15, 1, 7),

            # Collapse the remaining length axis to a single logit per sample.
            nn.Linear(conv_out_len, 1),

            nn.Sigmoid(),
        )

    def forward(self, data):
        """Score ``data`` of shape ``(batch, data_dim)``; returns ``(batch, 1)``."""
        # Add the channel axis for Conv1d; the trailing squeeze removes it again.
        data = data.unsqueeze(1)
        return self.disc(data).squeeze(1)

In [None]:
# Binary cross-entropy between discriminator probabilities and 0/1 targets
criterion = nn.BCELoss()

# Instantiate the two networks (generator first, then discriminator)
generator = Generator()
discriminator = Discriminator()

# One Adam optimizer per network, both with identical settings
adam_settings = dict(lr=0.00001, betas=(0.5, 0.999))
g_optimizer = torch.optim.Adam(generator.parameters(), **adam_settings)
d_optimizer = torch.optim.Adam(discriminator.parameters(), **adam_settings)

# Only the training rows whose label equals 1 serve as "real" samples for the GAN
indices = (y_train == 1)
X_real, y_real = X_train[indices], y_train[indices]

# Wrap features and labels as tensors and serve them in shuffled mini-batches
my_dataset = TensorDataset(
    torch.Tensor(X_real.values),
    torch.Tensor(y_real.values),
)
dataloader = DataLoader(my_dataset, shuffle=True, batch_size=batch_size)

In [None]:
# original gan Train
# Alternates one discriminator step and one generator step per mini-batch and
# records both losses for every iteration.
gen_losses = []
disc_losses = []
for epoch in range(n_epochs):
    for i, real_data in enumerate(dataloader):
        # Each batch is a (features, labels) pair; only the features are used.
        real_data = real_data[0].float()
        batch_len = real_data.size(0)

        # Adversarial ground truths
        real_labels = torch.ones(batch_len, 1)
        fake_labels = torch.zeros(batch_len, 1)

        # ---------------------
        #  First Train Discriminator
        # ---------------------

        d_optimizer.zero_grad()

        # Measure discriminator's ability to classify real from generated samples.
        # Reshape to the size of THIS batch: the previous code used the size of
        # the whole real set, which only works when everything fits in one batch.
        real_loss = criterion(discriminator(real_data).reshape(batch_len, 1), real_labels)

        # Sample noise as generator input
        z = torch.randn(batch_len, 128)

        # Generate a batch of datas
        gen_datas = generator(z)

        # detach(): the discriminator step must not push gradients into the generator
        fake_loss = criterion(discriminator(gen_datas.detach()), fake_labels)
        d_loss = (real_loss + fake_loss) / 2

        d_loss.backward()
        d_optimizer.step()

        # -----------------
        #  Train Generator
        # -----------------

        g_optimizer.zero_grad()

        # Sample noise as generator input
        z = torch.randn(batch_len, 128)

        # Generate a batch of datas
        gen_datas = generator(z)

        # The generator is rewarded when its samples are scored as real
        g_loss = criterion(discriminator(gen_datas), real_labels)

        g_loss.backward()
        g_optimizer.step()

        gen_losses.append(g_loss.item())
        disc_losses.append(d_loss.item())

        print(
            "[Epoch %d/%d] [Batch %d/%d] [D loss: %f] [G loss: %f]"
            % (epoch, n_epochs, i, len(dataloader), d_loss.item(), g_loss.item())
        )

In [None]:
# Turn the per-iteration loss histories into NumPy arrays
gen_losses = np.array(gen_losses)
disc_losses = np.array(disc_losses)

# Draw both loss histories against the iteration number on one figure
plt.figure(figsize=(10, 8))
for curve, curve_label in (
    (gen_losses, 'Generator loss'),
    (disc_losses, 'Discriminator loss'),
):
    plt.plot(range(len(curve)), curve, label=curve_label)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# count the frequency of each value in y_train.
value_counts = y_train.value_counts()
# Number of synthetic label-1 rows needed to match the label-0 count.
# Cast to a plain int so it can be used as a tensor size and a list length.
diff = int(value_counts[0] - value_counts[1])

z = torch.randn(diff, 128)

# Generate a batch of datas. No gradients are needed for sampling.
# NOTE(review): the generator is left in whatever train/eval mode it is in, as
# before; consider generator.eval() if BatchNorm running statistics are wanted.
with torch.no_grad():
    generated_data_dcgan = generator(z)
generated_data_dcgan_df = pd.DataFrame(generated_data_dcgan.detach().numpy(), columns=X_train.columns)

# add the generated data to the training set, X_train_dcgan, y_train_gan
# (pd.concat replaces DataFrame.append / Series.append, removed in pandas 2.0)
X_train_dcgan = pd.concat([X_train, generated_data_dcgan_df], ignore_index=True)
# Supplement y_train with 1.0
ones = pd.Series([1.0] * diff)
y_train_dcgan = pd.concat([y_train, ones], ignore_index=True)

In [None]:
from sklearn.utils import shuffle
# Permute the augmented features and labels together, with a fixed seed
X_train_dcgan, y_train_dcgan = shuffle(
    X_train_dcgan,
    y_train_dcgan,
    random_state=42,
)

In [None]:
# All rows of X whose label equals 1
real_one = X[y == 1]

# Real label-1 rows
tsne_real = real_one

# DCGAN-generated rows
tsne_real_dcgan = generated_data_dcgan_df

# Stack generated rows first and real rows second; `labels` records the origin
# of each row (0 = generated, 1 = real).
combined_data = pd.concat([tsne_real_dcgan, tsne_real], axis=0)
labels = np.concatenate([
    np.zeros(len(tsne_real_dcgan), dtype=int),
    np.ones(len(tsne_real), dtype=int),
])

# Project the combined table to two dimensions with t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_data = tsne.fit_transform(combined_data)

# One scatter series per origin
plt.figure(figsize=(8, 6))
for origin, colour, legend_text in ((0, 'b', 'DCGAN data'), (1, 'r', 'Real data')):
    mask = labels == origin
    plt.scatter(tsne_data[mask, 0], tsne_data[mask, 1], c=colour, label=legend_text)
plt.legend()
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('TSNE comparing real data with DCGAN generated data')
plt.show()

In [None]:
# PCA
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Real rows come first in the merged array, generated rows after them
n_real = real_one.shape[0]
merged_data = np.concatenate([real_one, generated_data_dcgan_df], axis=0)

# Reduce to the first two principal components
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(merged_data)

# Slice the projection back into its two groups and draw the generated points first
real_points = reduced_data[:n_real]
generated_points = reduced_data[n_real:]
plt.scatter(generated_points[:, 0], generated_points[:, 1], label='DCGAN data')
plt.scatter(real_points[:, 0], real_points[:, 1], label='Real data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

In [None]:
# SVM
import torch
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, accuracy_score
from sklearn.svm import SVC

def train_and_evaluate_svm(X_train, y_train, X_test, y_test, random_state=None):
    """Fit an SVC on the training data and return its ROC AUC on the test data.

    Args:
        X_train: training features; must expose ``.values`` (e.g. a DataFrame).
        y_train: training labels; must expose ``.values`` (e.g. a Series).
        X_test: test features, same layout as ``X_train``.
        y_test: test labels, same layout as ``y_train``.
        random_state: forwarded to ``SVC``. The default ``None`` keeps the
            previous behaviour, where repeated calls can give different
            probability estimates.

    Returns:
        The area under the ROC curve, computed from the predicted
        probability of class 1 on the test set.
    """
    # Same dtypes as the former torch round-trip: float32 features, int64 labels.
    train_features = np.asarray(X_train.values, dtype=np.float32)
    train_labels = np.asarray(y_train.values, dtype=np.int64)
    test_features = np.asarray(X_test.values, dtype=np.float32)
    test_labels = np.asarray(y_test.values, dtype=np.int64)

    # probability=True is required for predict_proba
    svm_classifier = SVC(probability=True, random_state=random_state)
    svm_classifier.fit(train_features, train_labels)

    # Column 1 holds the predicted probability of the positive class
    y_scores = svm_classifier.predict_proba(test_features)[:, 1]

    return roc_auc_score(test_labels, y_scores)


In [None]:
# Baseline: SVM trained on the original (non-augmented) training split
num_repeats = 30  # number of train/evaluate runs to average over

# One AUC value per run
roc_results = [
    train_and_evaluate_svm(X_train, y_train, X_test, y_test)
    for _ in range(num_repeats)
]

# Average AUC across all runs
mean_auc = np.mean(roc_results)

print("Mean AUC:", mean_auc)

In [None]:
# DCGAN-augmented data: SVM trained on the training split plus generated rows
num_repeats = 30  # number of train/evaluate runs to average over

# One AUC value per run
roc_results_dcgan = [
    train_and_evaluate_svm(X_train_dcgan, y_train_dcgan, X_test, y_test)
    for _ in range(num_repeats)
]

# Average AUC across all runs
mean_auc_dcgan = np.mean(roc_results_dcgan)

print("Mean AUC:", mean_auc_dcgan)