In [1]:
import numpy as np
import pandas as pd
import datetime
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

In [2]:
class GaussianNaiveBayes:
    def __init__(self):
        self.classes = None
        self.priors = {}
        self.mean = {}
        self.var = {}

    def fit(self, X, y):
        self.classes = np.unique(y)

        for cls in self.classes:
            X_cls = X[y == cls]
            self.priors[cls] = X_cls.shape[0] / X.shape[0]
            self.mean[cls] = X_cls.mean(axis=0)
            self.var[cls] = X_cls.var(axis=0)

    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = []
            for cls in self.classes:
                prior = np.log(self.priors[cls])
                likelihood = np.sum(np.log(norm.pdf(x, loc=self.mean[cls], scale=np.sqrt(self.var[cls])) + 1e-9))
                posterior = prior + likelihood
                posteriors.append(posterior)
            best_class = self.classes[np.argmax(posteriors)]
            predictions.append(best_class)
        return predictions

In [3]:
data = pd.read_csv('breast-cancer.csv')
data = data.drop(['id'], axis=1)
X = data.drop(['diagnosis'], axis=1)
y = data['diagnosis']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

In [4]:
def cross_validate(model, X, y, k=5):
    np.random.seed(42)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    
    fold_size = len(indices) // k
    scores = []
    
    for i in range(k):
        test_start = i * fold_size
        test_end = (i + 1) * fold_size

        test_idx = indices[test_start:test_end]
        train_idx = np.concatenate([indices[:test_start], indices[test_end:]])
        
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = np.mean(y_pred == y_test)
        scores.append(accuracy)
    
    return scores

In [5]:
gnb = GaussianNaiveBayes()
start_time = datetime.datetime.now()
scores = cross_validate(gnb, X.values, y, k=5)
end_time = datetime.datetime.now()
print(f"Mean accuracy: {np.mean(scores):.4f} (+-{np.std(scores):.4f}), time: {end_time - start_time}")

Mean accuracy: 0.9363 (+-0.0172), time: 0:00:00.107992


In [6]:
gnb = GaussianNB()
start_time = datetime.datetime.now()
scores = cross_val_score(gnb, X.values, y, cv=5)
end_time = datetime.datetime.now()
print(f"Mean accuracy: {np.mean(scores):.4f} (+-{np.std(scores):.4f}), time: {end_time - start_time}")

Mean accuracy: 0.9279 (+-0.0204), time: 0:00:00.017666
