In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Preprocessing

In [None]:
import pandas as pd
# Read the raw table from the hard-coded notebook data path.
csv_path = '/content/drive/MyDrive/Colab Notebooks/bruneck.csv'
df = pd.read_csv(csv_path)

In [None]:
df.shape

In [None]:
# Remove exact duplicate rows, then exact duplicate columns. Columns are
# compared by value, which is why the frame is transposed, de-duplicated and
# transposed back.
df = df.drop_duplicates()
df = df.T.drop_duplicates().T

# Positional trim: drop the first column and the last two columns (index /
# unused columns according to the original note), then two named fields.
df = df.iloc[:, 1:-2].drop(columns=['CVD0010W', 'TYPE1DM'])

# Move the label to the end: pop it out and append it again as the last column.
cvd_0010 = df.pop('CVD0010')
df['CVD0010'] = cvd_0010

# Spreadsheet error markers ('#DIV/0!') become NaN.
df = df.replace('#DIV/0!', np.nan)

# Force every column to a numeric dtype; anything unparsable becomes NaN.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# # Experiment: randomly select ten complete features to draw a heat map and check the correlation.
# cleaned_data = df.dropna()
# correlation_matrix = cleaned_data.corr()

# import random

# random.seed(42)

# features = list(cleaned_data.columns)

# random_features = random.sample(features, 10)

# selected_data = cleaned_data[random_features]

# correlation_matrix = selected_data.corr()

# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
# plt.title("Correlation Heatmap of Selected Features")
# plt.show()

In [None]:
df.shape

In [None]:
# Count the missing entries in each column and list the columns from the
# fewest to the most missing values.
missing_values = df.isna().sum()
sorted_missing_values = missing_values.sort_values(ascending=True)
sorted_missing_values

In [None]:
# Remove the Gln, Glol and Pyr columns (noted as having about 30% missing values).
columns_to_drop = ['Gln', 'Glol', 'Pyr']
df = df.drop(columns_to_drop, axis=1)

In [None]:
df.shape

In [None]:
# Distribution of the number of missing values per row.
def count_nan(row):
    """Return the number of NaN entries in ``row`` (a pandas Series)."""
    is_missing = row.isna()
    return is_missing.sum()

# One count per row, then "how many rows have k missing values", ordered by k.
nan_counts = df.apply(count_nan, axis='columns')
count_distribution = nan_counts.value_counts()
count_distribution_sorted_index = count_distribution.sort_index()
print(count_distribution_sorted_index)

In [None]:
# Delete every row in which more than 50% of the columns are missing.
nan_counts = df.isna().sum(axis=1)
max_missing = df.shape[1] * 0.5
rows_to_drop = df.loc[nan_counts > max_missing]
df = df.drop(index=rows_to_drop.index)

In [None]:
# Observe the per-row missing-value distribution again, on the current `df`.
# The count is computed with a vectorised isna().sum(axis=1): this gives the
# same per-row counts as applying a Python function to each row, without
# re-defining the `count_nan` helper a second time in the notebook.
nan_counts = df.isna().sum(axis=1)

# Number of rows for each missing-value count, ordered by that count.
count_distribution = nan_counts.value_counts()
count_distribution_sorted_index = count_distribution.sort_index()
print(count_distribution_sorted_index)

In [None]:
df.shape

In [None]:
# Keep only the rows whose label (CVD0010) is present.
has_label = df['CVD0010'].notna()
df = df.loc[has_label]

In [None]:
df.shape

# knnImpute

In [None]:
# Snapshot of the cleaned, not-yet-imputed data. An explicit copy is taken so
# that this frame does not alias `df`: a later in-place change to either frame
# can no longer silently alter the other.
df_knn_undisposed = df.copy()

In [None]:
# Separate the label column (y) from the feature columns (X).
y = df_knn_undisposed['CVD0010']
X = df_knn_undisposed.drop(columns=['CVD0010'])

In [None]:
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline

# Goal of this cell: choose n_neighbors for KNNImputer.
# The imputer and the classifier are chained in a Pipeline and the grid is
# placed on the *imputer's* n_neighbors. Searching over the classifier's
# n_neighbors (with the imputer left at its default) would select a k that has
# nothing to do with imputation. Using a Pipeline also means the imputer is
# re-fit on the training part of every CV fold, so held-out rows never
# influence the imputation they are scored on.
imputer = KNNImputer()
knn = KNeighborsClassifier()

# Default-k imputed copy of the features. Not used by the search below; kept
# only so the `X_imputed` name remains available for inspection.
X_imputed = imputer.fit_transform(X)

# Candidate k values for the imputer.
k_values = list(range(1, 31))

# Step names determine the parameter prefix: 'imputer__n_neighbors'.
imputation_pipeline = Pipeline([('imputer', imputer), ('classifier', knn)])
param_grid = {'imputer__n_neighbors': k_values}

# GridSearchCV clones the pipeline for every fit, so `imputer` / `knn` above
# are not modified by the search. X is passed with its NaNs; the pipeline's
# first step handles them.
grid_search = GridSearchCV(imputation_pipeline, param_grid, cv=5)
grid_search.fit(X, y)

# Pair every candidate k with its mean cross-validated score.
k_scores = list(zip(grid_search.cv_results_['param_imputer__n_neighbors'],
                    grid_search.cv_results_['mean_test_score']))

# Sort by mean test score, best first.
k_scores.sort(key=lambda x: x[1], reverse=True)

# Best (maximum) mean test score.
best_accuracy = k_scores[0][1]

# All k values that reach exactly that best score (ties are reported together).
best_k_values = [k for k, score in k_scores if score == best_accuracy]

print("Best K values", best_k_values)
print("Best Accuracy {}".format(best_accuracy))

In [None]:
# Create a KNNImputer with the chosen k=20.
imputer = KNNImputer(n_neighbors=20)

# Impute the FEATURE columns only. If the label were passed to the imputer it
# would take part in the neighbour distance computation, i.e. the target would
# leak into the imputed feature values that the models are later trained on.
label_column = 'CVD0010'
feature_columns = [col for col in df_knn_undisposed.columns if col != label_column]
imputed_frame = pd.DataFrame(
    imputer.fit_transform(df_knn_undisposed[feature_columns]),
    columns=feature_columns,
)

# Re-attach the label positionally and unchanged (it is not imputed here).
imputed_frame[label_column] = df_knn_undisposed[label_column].to_numpy()

# Same result shape as before: a 2-D NumPy array whose columns follow the
# column order of `df_knn_undisposed`.
df_knn = imputed_frame[list(df_knn_undisposed.columns)].to_numpy()

In [None]:
# Wrap the imputed values in a DataFrame that carries the original column names.
df_knn = pd.DataFrame(df_knn, columns=df_knn_undisposed.columns)

In [None]:
df_knn

In [None]:
# # output
# df_knn.to_csv('df_knn.csv', index=False)

# SVM

In [None]:
# Remove missing data using different missing-value thresholds, impute what remains, and evaluate each variant with an SVM.
# The resulting scores are used to select an appropriate threshold.

In [None]:
# Separate the target (y) from the features (X) in the imputed data.
y = df_knn['CVD0010']
X = df_knn.drop(columns=['CVD0010'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column of X, keeping the column names.
# NOTE(review): the scaler is fit on all of X here, i.e. before the train/test
# split below - confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
import torch
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, accuracy_score
from sklearn.svm import SVC

def train_and_evaluate_svm(X_train, y_train, X_test, y_test, random_state=None):
    """Fit an SVC on the training data and return its ROC AUC on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices (pandas DataFrames or NumPy arrays).
    y_train, y_test : array-like
        Labels. They are cast to 64-bit integers, so fractional values are
        truncated.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``SVC``. Pass an int to make the probability estimates
        (and therefore the returned AUC) repeatable across calls.

    Returns
    -------
    float
        ROC AUC computed from the predicted probability of the class in
        column 1 of ``predict_proba`` (assumes a binary label whose positive
        class sorts last - TODO confirm).
    """
    # Plain NumPy conversion with the same dtypes the data was previously cast
    # to (float32 features, int64 labels); no tensor round-trip is needed
    # because scikit-learn consumes NumPy arrays directly.
    X_train_arr = np.asarray(X_train, dtype=np.float32)
    y_train_arr = np.asarray(y_train).astype(np.int64)
    X_test_arr = np.asarray(X_test, dtype=np.float32)
    y_test_arr = np.asarray(y_test).astype(np.int64)

    # probability=True is required for predict_proba.
    svm_classifier = SVC(probability=True, random_state=random_state)
    svm_classifier.fit(X_train_arr, y_train_arr)

    # Score = estimated probability of the class in column 1.
    y_scores = svm_classifier.predict_proba(X_test_arr)[:, 1]

    # Only the AUC is returned, so hard predictions and the ROC curve points
    # are not computed.
    return roc_auc_score(y_test_arr, y_scores)

In [None]:
# Seed NumPy's global RNG. scikit-learn estimators created with
# random_state=None draw their randomness from it, so this is what makes the
# repeated SVC runs (and the mean below) reproducible on a fresh kernel.
np.random.seed(42)
# Kept from the original cell; it seeds torch only and has no effect on
# scikit-learn.
torch.manual_seed(42)

# AUC of every repetition (original data).
roc_results = []

# Repeat the training and evaluation process 30 times on the same split.
num_repeats = 30
for _ in range(num_repeats):
    auc = train_and_evaluate_svm(X_train, y_train, X_test, y_test)
    roc_results.append(auc)  # one float AUC per repetition

# Average AUC over all repetitions.
mean_auc = np.mean(roc_results)

print("Mean AUC:", mean_auc)