In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import make_classification

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.init as init

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored under it can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from the mounted Drive folder.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/df_knn.csv'
df_knn = pd.read_csv(DATA_PATH)

In [None]:
# Separate the target column from the predictor columns.
TARGET_COLUMN = 'CVD0010'
y = df_knn[TARGET_COLUMN]
X = df_knn.drop(columns=TARGET_COLUMN)

In [None]:
# Standardize the features with statistics estimated from the training rows
# only, so that the held-out test rows do not leak into the scaler.
# The training row positions are recovered by running the same seeded split
# that the next cell performs: train_test_split chooses rows from the sample
# count and random_state alone, so identical arguments give identical rows.
# Keep random_state/test_size here in sync with that cell.
train_positions, _ = train_test_split(np.arange(len(X)), random_state=42, test_size=0.2)
scaler = StandardScaler().fit(X.iloc[train_positions])

# Every row (train and test) is transformed with the train-fitted scaler.
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# SMOTE: oversample the training split only (the test split is left untouched).
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

# Seed SMOTE so the synthetic samples — and every plot and score derived from
# them — are identical on each run.
X_mean_resampled, y_mean_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

print('After OverSampling, the shape of train_X: {}'.format(X_mean_resampled.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_mean_resampled.shape))

print("After OverSampling, counts of label '1': {}".format((y_mean_resampled == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_mean_resampled == 0).sum()))

In [None]:
# Keep only the rows that follow the first len(X_train) rows of the resampled frame.
# NOTE(review): assumes SMOTE appends its generated samples after the original
# training rows — confirm against the imbalanced-learn documentation.
n_original_rows = X_train.shape[0]
X_resampled = X_mean_resampled[n_original_rows:]

In [None]:
# t-SNE: embed SMOTE-generated and real label-1 samples in 2-D for a visual comparison.

# Real label-1 rows are taken from the full scaled frame X (train and test rows).
real_one = X[y == 1]
tsne_real_1 = real_one
tsne_smote_1 = X_resampled

# SMOTE rows come first, real rows second; `labels` records the origin of each
# row (0 = SMOTE-generated, 1 = real).
combined_data = pd.concat([tsne_smote_1, tsne_real_1], axis=0)
labels = np.concatenate([
    np.zeros(len(tsne_smote_1), dtype=int),
    np.ones(len(tsne_real_1), dtype=int),
])

tsne = TSNE(n_components=2, random_state=42)
tsne_data = tsne.fit_transform(combined_data)

plt.figure(figsize=(8, 6))
for origin, colour, caption in ((0, 'b', 'smote label 1'), (1, 'r', 'real label 1')):
    points = tsne_data[labels == origin]
    plt.scatter(points[:, 0], points[:, 1], c=colour, label=caption)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('TSNE Visualization')
plt.legend()
plt.show()

In [None]:
# PCA

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Stack the real label-1 samples on top of the SMOTE-generated ones
# (real rows first, synthetic rows after).
merged_data = np.concatenate((tsne_real_1, tsne_smote_1), axis=0)
n_real = tsne_real_1.shape[0]

# Project onto the first two principal components. random_state is fixed, as in
# the t-SNE cell, because PCA may pick a randomized SVD solver depending on the
# data shape, and the projection should be the same on every run.
pca = PCA(n_components=2, random_state=42)
pca.fit(merged_data)
reduced_data = pca.transform(merged_data)

# Synthetic points are drawn first, real points on top of them.
plt.scatter(reduced_data[n_real:, 0], reduced_data[n_real:, 1], label='smote data')
plt.scatter(reduced_data[:n_real, 0], reduced_data[:n_real, 1], label='real data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Visualization')
plt.legend()
plt.show()

In [None]:
# SVM

import torch
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, accuracy_score
from sklearn.svm import SVC

def train_and_evaluate_svm(X_train, y_train, X_test, y_test):
    """Fit a default-parameter SVM and return its ROC AUC on the test set.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices (pandas DataFrame or NumPy array). Cast to float32.
    y_train, y_test : array-like of shape (n_samples,)
        Binary class labels. Cast to int64.

    Returns
    -------
    float
        ROC AUC computed from the classifier's decision values on ``X_test``.

    Notes
    -----
    Samples are ranked with ``decision_function`` rather than ``predict_proba``.
    ROC AUC depends only on the ranking of the scores, so calibrated
    probabilities are not needed, and skipping ``probability=True`` avoids the
    extra internal cross-validation (and its randomness) during ``fit``.
    """
    # float32 features / int64 labels; np.asarray accepts DataFrames, Series and arrays.
    train_features = np.asarray(X_train, dtype=np.float32)
    train_labels = np.asarray(y_train, dtype=np.int64)
    test_features = np.asarray(X_test, dtype=np.float32)
    test_labels = np.asarray(y_test, dtype=np.int64)

    # train
    svm_classifier = SVC()
    svm_classifier.fit(train_features, train_labels)

    # test: signed distance to the separating surface, larger means "more class 1"
    y_scores = svm_classifier.decision_function(test_features)

    # AUC
    return roc_auc_score(test_labels, y_scores)

In [None]:
# Baseline: SVM trained on the original (non-oversampled) training split.

# Number of train/evaluate rounds to average over.
num_repeats = 30

# One AUC value per round.
roc_results = [
    train_and_evaluate_svm(X_train, y_train, X_test, y_test)
    for _ in range(num_repeats)
]

# Average the per-round AUC values.
mean_auc = np.mean(roc_results)

print("Mean AUC:", mean_auc)

In [None]:
# SMOTE data: SVM trained on the SMOTE-oversampled training split.
roc_results_smote = []

# Repeat the training and evaluation process 30 times.
num_repeats = 30
for _ in range(num_repeats):
    auc = train_and_evaluate_svm(X_mean_resampled, y_mean_resampled, X_test, y_test)
    roc_results_smote.append(auc)  # one AUC value per round

# Average the per-round AUC values.
mean_auc_smote = np.mean(roc_results_smote)

# The training data here comes from SMOTE, so the result is labelled accordingly.
print("SMOTE Mean AUC:", mean_auc_smote)