In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

In [32]:
data = pd.read_csv("data/creditcard.csv", index_col=0)

# Ignore 'Time' and 'Amount' columns, keep only the encoded values (V1 to V28) and 'Class' (labels)
X = data.drop(columns=['Class', 'Amount'])
y = data['Class']

In [33]:
# Source: https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [34]:
# Split the data into 80% train and 20% test, stratify to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42)

# Further split train set into 30% labeled and 70% unlabeled
X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = train_test_split(X_train, y_train, test_size=0.7, stratify=y_train, random_state=42)

In [35]:
# Standardize features (important for SVM and other classifiers)
scaler = StandardScaler()
X_train_lab = scaler.fit_transform(X_train_lab)
X_train_unlab = scaler.transform(X_train_unlab)
X_test = scaler.transform(X_test)

print(f'Train set (labeled): {X_train_lab.shape}, Train set (unlabeled): {X_train_unlab.shape}, Test set: {X_test.shape}')

Train set (labeled): (136471, 28), Train set (unlabeled): (318433, 28), Test set: (113726, 28)


In [36]:
# svm = SVC()

# param_grid = {
#     'C': [0.1, 1, 10],
#     'gamma': ['scale', 'auto'],
#     'degree': [2, 3],
#     'kernel': ['linear', 'rbf', 'poly']
# }

# grid_search = GridSearchCV(svm, param_grid, cv=3, verbose=2, n_jobs=-1)
# grid_search.fit(X_train_lab, y_train_lab)

# print(f'Best parameters: {grid_search.best_params_}')
# Best parameters: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}

best_svm = SVC(C=10, degree=2, gamma='scale', kernel='rbf')
best_svm.fit(X_train_lab, y_train_lab)

# best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy on the test set: {accuracy:.4f}')
print(f'F1 Score on the test set: {f1:.4f}')
print(classification_report(y_test, y_pred))

Accuracy on the test set: 0.9954
F1 Score on the test set: 0.9954
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56863
           1       0.99      1.00      1.00     56863

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726



In [37]:
y_train_unlab[:] = -1

X_train_combined = np.vstack((X_train_lab, X_train_unlab))
y_train_combined = np.hstack((y_train_lab, y_train_unlab))

# label_spread_model = LabelSpreading(kernel='knn', n_neighbors=7, max_iter=1000)
label_prop_model = LabelPropagation(kernel='knn', n_neighbors=7, max_iter=1000)

# label_spread_model.fit(X_train_combined, y_train_combined)
label_prop_model.fit(X_train_combined, y_train_combined)

# y_pred_test = label_spread_model.predict(X_test)
y_pred_test = label_prop_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)

print(f'Accuracy on the test set: {accuracy:.4f}')
print(f'F1 Score on the test set: {f1:.4f}')
print(classification_report(y_test, y_pred_test))

Accuracy on the test set: 0.9972
F1 Score on the test set: 0.9972
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       1.00      1.00      1.00     56863

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726



  probabilities /= normalizer


In [38]:
print(f'Number of zero rows in X_train_combined: {np.sum(np.all(X_train_combined == 0, axis=1))}')



Number of zero rows in X_train_combined: 0


In [45]:
X_train_complete = np.vstack((X_train_lab, X_train_unlab))
y_train_complete = label_prop_model.transduction_

print(f'Shape of X_train_complete: {X_train_complete.shape}')
print(f'Shape of y_train_complete: {y_train_complete.shape}')

assert X_train_complete.shape[0] == y_train_complete.shape[0], "Mismatch between features and labels"

best_svm.fit(X_train_complete, y_train_complete)

y_pred_test_final = best_svm.predict(X_test)

accuracy_final = accuracy_score(y_test, y_pred_test_final)
f1_final = f1_score(y_test, y_pred_test_final)

print(f'Final Accuracy on the test set: {accuracy_final:.4f}')
print(f'Final F1 Score on the test set: {f1_final:.4f}')
print(classification_report(y_test, y_pred_test_final))

Shape of X_train_complete: (454904, 28)
Shape of y_train_complete: (454904,)
Final Accuracy on the test set: 0.9975
Final F1 Score on the test set: 0.9975
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       1.00      1.00      1.00     56863

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726

