In [1]:
# --- SVM Experiments Notebook ---

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC # Support Vector Classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# ==============================================================================
# --- 1. Load and Preprocess the Data ---
# ==============================================================================
print("--- Loading and Preprocessing Data ---")
file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)

# Work on a copy so the raw frame `df` stays available, unmodified.
df_processed = df.copy()

# Missing values in text (object-dtype) columns become an explicit 'Unknown'
# category instead of NaN, so they survive one-hot encoding as their own level.
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].fillna('Unknown')

# Encode the target as 0/1.
# NOTE(review): the 'Unknown' entry means rows with a missing diagnosis are
# labelled negative (0). Confirm this labelling policy is intended, as opposed
# to dropping unlabelled rows before modelling.
target_map = {'positivo': 1, 'negativo': 0, 'Unknown': 0}
diagnosis_encoded = df_processed['diagnosis'].map(target_map)

# Series.map yields NaN for any label absent from target_map; fail here with a
# message naming the offending labels rather than an opaque int-cast error.
unmapped_rows = diagnosis_encoded.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(
        df_processed.loc[unmapped_rows, 'diagnosis'].astype(str).unique()
    )
    raise ValueError(
        f"Unexpected 'diagnosis' labels {unexpected_labels}; "
        f"expected one of {list(target_map)}"
    )
df_processed['diagnosis'] = diagnosis_encoded.astype(int)

# Separate features from target, then one-hot encode the features
# (drop_first=True drops one level per encoded column).
X_categorical = df_processed.drop('diagnosis', axis=1)
y_numeric = df_processed['diagnosis']
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)
print("--- Data Preprocessing Complete ---")
print(f"Shape of numerical features (X): {X_numeric.shape}")

# ==============================================================================
# --- 2. Split Data AFTER Preprocessing ---
# ==============================================================================
# Hold out 20% of the rows as the test set. Stratifying on the target keeps the
# class proportions aligned between the two partitions, and the fixed seed
# makes the split repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y_numeric)
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y_numeric,
    **split_options,
)

print("\n--- Data Split ---")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
# y_train is 0/1, so its sum is the positive count and its mean the positive rate.
print(f"Original positive cases in training set: {y_train.sum()} ({y_train.mean()*100:.1f}%)")

# ==============================================================================
# --- Shared helpers for the SVM experiments ---
# ==============================================================================
def _build_svm_pipeline(sampler_step, sampler, **svc_params):
    """Build an imblearn pipeline: a resampler followed by a seeded SVC.

    Parameters
    ----------
    sampler_step : str
        Name of the resampling step inside the pipeline.
    sampler : imblearn sampler instance
        Resampler placed ahead of the classifier (e.g. SMOTE, RandomUnderSampler).
    **svc_params
        Extra keyword arguments forwarded to ``SVC`` (kernel, C, ...).

    Returns
    -------
    ImbPipeline
        Unfitted two-step pipeline ``[(sampler_step, sampler), ('classifier', SVC)]``.
    """
    # NOTE(review): no predict_proba call is visible in this section, and
    # probability=True makes SVC run an extra internal cross-validated
    # calibration during fit. Kept because cells outside this view may rely on it.
    return ImbPipeline([
        (sampler_step, sampler),
        ('classifier', SVC(probability=True, random_state=42, **svc_params)),
    ])


def _fit_and_report(pipeline, train_title, report_title, X_fit, y_fit, X_eval, y_eval):
    """Fit ``pipeline``, predict on the evaluation split, and print the metrics.

    Parameters
    ----------
    pipeline : ImbPipeline
        Unfitted pipeline; it is fitted in place on ``X_fit`` / ``y_fit``.
    train_title : str
        Text shown in the "Training ..." banner.
    report_title : str
        Text shown in the "... Performance" banner.
    X_fit, y_fit
        Training features and labels.
    X_eval, y_eval
        Held-out features and labels used for the confusion matrix and report.

    Returns
    -------
    Predicted labels for ``X_eval`` as returned by ``pipeline.predict``.
    """
    print(f"\n--- Training {train_title} ---")
    pipeline.fit(X_fit, y_fit)
    predictions = pipeline.predict(X_eval)

    print(f"\n--- {report_title} Performance ---")
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, predictions))
    print("\nClassification Report:")
    print(classification_report(y_eval, predictions))
    return predictions


# ==============================================================================
# --- 3. Experiment 1: SMOTE + SVM ---
# ==============================================================================
# NOTE(review): X_numeric comes from pd.get_dummies, so SMOTE's interpolated
# synthetic rows can carry fractional values in dummy columns. Consider a
# categorical-aware sampler (SMOTENC / SMOTEN) - confirm against the feature set.
print("\n--- Experiment 1: SMOTE + SVM ---")

# --- 3.1 SMOTE + SVM (Linear Kernel) ---
# C=1.0 is the default regularization parameter
pipeline_svm_linear_smote = _build_svm_pipeline(
    'smote', SMOTE(random_state=42), kernel='linear', C=1.0
)
y_pred_svm_linear_smote = _fit_and_report(
    pipeline_svm_linear_smote,
    "SVM (Linear Kernel) with SMOTE",
    "SVM (Linear Kernel, SMOTE)",
    X_train, y_train, X_test, y_test,
)

# --- 3.2 SMOTE + SVM (RBF Kernel) ---
# RBF is the default kernel for SVC
pipeline_svm_rbf_smote = _build_svm_pipeline(
    'smote', SMOTE(random_state=42), kernel='rbf'
)
y_pred_svm_rbf_smote = _fit_and_report(
    pipeline_svm_rbf_smote,
    "SVM (RBF Kernel) with SMOTE",
    "SVM (RBF Kernel, SMOTE)",
    X_train, y_train, X_test, y_test,
)


# ==============================================================================
# --- 4. Experiment 2: RandomUnderSampler + SVM ---
# ==============================================================================
print("\n--- Experiment 2: RandomUnderSampler + SVM ---")

# --- 4.1 RUS + SVM (Linear Kernel) ---
pipeline_svm_linear_rus = _build_svm_pipeline(
    'undersampler', RandomUnderSampler(random_state=42), kernel='linear', C=1.0
)
y_pred_svm_linear_rus = _fit_and_report(
    pipeline_svm_linear_rus,
    "SVM (Linear Kernel) with RandomUnderSampler",
    "SVM (Linear Kernel, RUS)",
    X_train, y_train, X_test, y_test,
)

# --- 4.2 RUS + SVM (RBF Kernel) ---
pipeline_svm_rbf_rus = _build_svm_pipeline(
    'undersampler', RandomUnderSampler(random_state=42), kernel='rbf'
)
y_pred_svm_rbf_rus = _fit_and_report(
    pipeline_svm_rbf_rus,
    "SVM (RBF Kernel) with RandomUnderSampler",
    "SVM (RBF Kernel, RUS)",
    X_train, y_train, X_test, y_test,
)

--- Loading and Preprocessing Data ---
--- Data Preprocessing Complete ---
Shape of numerical features (X): (456, 43)

--- Data Split ---
Training set shape: (364, 43)
Test set shape: (92, 43)
Original positive cases in training set: 109 (29.9%)

--- Experiment 1: SMOTE + SVM ---

--- Training SVM (Linear Kernel) with SMOTE ---

--- SVM (Linear Kernel, SMOTE) Performance ---
Confusion Matrix:
[[39 26]
 [16 11]]

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.60      0.65        65
           1       0.30      0.41      0.34        27

    accuracy                           0.54        92
   macro avg       0.50      0.50      0.50        92
weighted avg       0.59      0.54      0.56        92


--- Training SVM (RBF Kernel) with SMOTE ---

--- SVM (RBF Kernel, SMOTE) Performance ---
Confusion Matrix:
[[41 24]
 [14 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75    