In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
import seaborn as sns

import matplotlib.pyplot as plt


In [2]:
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names

print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")
print(f"Classes: {target_names}")


Samples: 569, Features: 30
Classes: ['malignant' 'benign']


In [3]:
class CustomGaussianNB:
    def __init__(self):
        self.classes = None
        self.priors = None
        self.means = None
        self.vars = None
        
    def fit(self, X, y):
        # Get unique classes and their count
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        n_features = X.shape[1]
        n_samples = X.shape[0]
        
        # Initialize parameters
        self.priors = np.zeros(n_classes)
        self.means = np.zeros((n_classes, n_features))
        self.vars = np.zeros((n_classes, n_features))
        
        # Calculate mean, variance, and priors for each class
        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.priors[i] = X_c.shape[0] / n_samples
            self.means[i] = np.mean(X_c, axis=0)
            self.vars[i] = np.var(X_c, axis=0) + 1e-9  # Add small value to avoid division by zero
            
        return self
    
    def _calculate_likelihood(self, x, mean, var):
        # Gaussian probability density function
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-((x - mean) ** 2) / (2 * var))
    
    def _calculate_class_probability(self, X, class_idx):
        # Calculate probability of X belonging to given class
        likelihood = np.ones((X.shape[0]))
        for feature_idx in range(X.shape[1]):
            likelihood *= self._calculate_likelihood(
                X[:, feature_idx], 
                self.means[class_idx, feature_idx], 
                self.vars[class_idx, feature_idx]
            )
        return likelihood * self.priors[class_idx]
    
    def predict_proba(self, X):
        # Get probabilities for each class
        probs = np.zeros((X.shape[0], len(self.classes)))
        for i, c in enumerate(self.classes):
            probs[:, i] = self._calculate_class_probability(X, i)
        
        # Normalize probabilities
        probs_sum = np.sum(probs, axis=1, keepdims=True)
        return probs / probs_sum
    
    def predict(self, X):
        # Return class with highest probability
        return self.classes[np.argmax(self.predict_proba(X), axis=1)]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
custom_nb = CustomGaussianNB()
custom_start_time = time.time()
custom_nb.fit(X_train_scaled, y_train)
custom_train_time = time.time() - custom_start_time
print(f"Custom model training time: {custom_train_time:.6f} seconds")

Custom model training time: 0.000563 seconds


In [7]:
custom_predictions = custom_nb.predict(X_test_scaled)
custom_accuracy = accuracy_score(y_test, custom_predictions)
print(f"Custom model accuracy: {custom_accuracy:.4f}")

Custom model accuracy: 0.9649


In [8]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
custom_cv_scores = []

custom_cv_start_time = time.time()
for train_idx, test_idx in k_fold.split(X):
    X_fold_train, X_fold_test = X[train_idx], X[test_idx]
    y_fold_train, y_fold_test = y[train_idx], y[test_idx]
    
    # Scale the data
    X_fold_train_scaled = scaler.fit_transform(X_fold_train)
    X_fold_test_scaled = scaler.transform(X_fold_test)
    
    # Train and evaluate
    model = CustomGaussianNB()
    model.fit(X_fold_train_scaled, y_fold_train)
    predictions = model.predict(X_fold_test_scaled)
    custom_cv_scores.append(accuracy_score(y_fold_test, predictions))

custom_cv_time = time.time() - custom_cv_start_time
print(f"Custom model CV accuracy: {np.mean(custom_cv_scores):.4f} (±{np.std(custom_cv_scores):.4f})")
print(f"Custom model CV time: {custom_cv_time:.6f} seconds")

Custom model CV accuracy: 0.9367 (±0.0152)
Custom model CV time: 0.013715 seconds


In [9]:
sklearn_nb = GaussianNB()
sklearn_start_time = time.time()
sklearn_nb.fit(X_train_scaled, y_train)
sklearn_train_time = time.time() - sklearn_start_time
print(f"Scikit-learn training time: {sklearn_train_time:.6f} seconds")

Scikit-learn training time: 0.003001 seconds


In [10]:
sklearn_predictions = sklearn_nb.predict(X_test_scaled)
sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
print(f"Scikit-learn model accuracy: {sklearn_accuracy:.4f}")

Scikit-learn model accuracy: 0.9649


In [11]:
sklearn_cv_start_time = time.time()
sklearn_cv_scores = cross_val_score(GaussianNB(), X, y, cv=k_fold)
sklearn_cv_time = time.time() - sklearn_cv_start_time
print(f"Scikit-learn CV accuracy: {np.mean(sklearn_cv_scores):.4f} (±{np.std(sklearn_cv_scores):.4f})")
print(f"Scikit-learn CV time: {sklearn_cv_time:.6f} seconds")

Scikit-learn CV accuracy: 0.9385 (±0.0201)
Scikit-learn CV time: 0.012004 seconds


In [12]:
print(f"Training time: Custom is {custom_train_time/sklearn_train_time:.2f}x slower than scikit-learn")
print(f"CV time: Custom is {custom_cv_time/sklearn_cv_time:.2f}x slower than scikit-learn")
print(f"Accuracy difference: {custom_accuracy - sklearn_accuracy:.6f}")
print(f"CV accuracy difference: {np.mean(custom_cv_scores) - np.mean(sklearn_cv_scores):.6f}")

Training time: Custom is 0.19x slower than scikit-learn
CV time: Custom is 1.14x slower than scikit-learn
Accuracy difference: 0.000000
CV accuracy difference: -0.001739
