In [22]:
# Import all the necessary Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [23]:
# Load the Dataset

df = pd.read_csv("data.csv")
print("Dataset shape:", df.shape)
print(df.head())

Dataset shape: (569, 33)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_wors

In [34]:
# NaN-proof Preprocessing
df_encoded = df.copy()

# Handling the categorical columns
for col in df_encoded.select_dtypes(include=['object']).columns:
    df_encoded[col] = df_encoded[col].fillna(df_encoded[col].mode()[0])
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])

# Force all columns to numeric
df_encoded = df_encoded.apply(pd.to_numeric, errors='coerce')

# Filling any remaining NaNs with column mean
df_encoded = df_encoded.fillna(df_encoded.mean())

# Final fallback: replacing any still-remaining NaNs with 0
if df_encoded.isnull().sum().sum() > 0:
    print("⚠ Still NaNs present! Filling with 0 as fallback.")
    df_encoded = df_encoded.fillna(0)

# Preparing Features (X) & Target (y)
X = df_encoded.drop('diagnosis', axis=1).values  # Change if target name differs
y = df_encoded['diagnosis'].values

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Double-check for NaNs
print("NaNs in X_train:", np.isnan(X_train).sum())
print("NaNs in X_test:", np.isnan(X_test).sum())


⚠ Still NaNs present! Filling with 0 as fallback.
NaNs in X_train: 0
NaNs in X_test: 0


In [35]:
# Linear Kernel SVM

linear_svm = SVC(kernel='linear', C=1)
linear_svm.fit(X_train, y_train)
y_pred_linear = linear_svm.predict(X_test)

print("\n--- Linear Kernel SVM ---")
print(confusion_matrix(y_test, y_pred_linear))
print(classification_report(y_test, y_pred_linear))
print("Accuracy:", accuracy_score(y_test, y_pred_linear))



--- Linear Kernel SVM ---
[[68  3]
 [ 2 41]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.96        71
           1       0.93      0.95      0.94        43

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114

Accuracy: 0.956140350877193


In [36]:
# RBF Kernel SVM

rbf_svm = SVC(kernel='rbf', C=1, gamma='scale')
rbf_svm.fit(X_train, y_train)
y_pred_rbf = rbf_svm.predict(X_test)

print("\n--- RBF Kernel SVM ---")
print(confusion_matrix(y_test, y_pred_rbf))
print(classification_report(y_test, y_pred_rbf))
print("Accuracy:", accuracy_score(y_test, y_pred_rbf))



--- RBF Kernel SVM ---
[[71  0]
 [ 2 41]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Accuracy: 0.9824561403508771


In [37]:
# Hyperparameter Tuning
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1, cv=5)
grid.fit(X_train, y_train)

print("\nBest Parameters:", grid.best_params_)
print("Best Estimator:", grid.best_estimator_)

# Evaluate tuned model
y_pred_best = grid.predict(X_test)
print("\n--- Tuned SVM Model ---")
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print("Accuracy:", accuracy_score(y_test, y_pred_best))


Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Best Estimator: SVC(C=0.1, gamma=1, kernel='linear')

--- Tuned SVM Model ---
[[71  0]
 [ 2 41]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Accuracy: 0.9824561403508771


In [38]:
# Cross-validation
scores = cross_val_score(grid.best_estimator_, X_scaled, y, cv=5)
print("\nCross-validation scores:", scores)
print("Mean CV Accuracy:", scores.mean())



Cross-validation scores: [0.97368421 0.97368421 0.98245614 0.96491228 0.98230088]
Mean CV Accuracy: 0.9754075454122031


In [39]:
# Decision Boundary Plot

if X.shape[1] == 2:
    def plot_decision_boundary(model, X, y, title):
        h = .02
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, alpha=0.3)
        plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', marker='o')
        plt.title(title)
        plt.show()

    plot_decision_boundary(linear_svm, X_scaled, y, "Linear SVM Decision Boundary")
    plot_decision_boundary(rbf_svm, X_scaled, y, "RBF SVM Decision Boundary")
