In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from NaiveBayes import GaussianNBClassifier as CustomNBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
# Module-level LabelEncoder instance. No active statement in the visible cells
# calls it; the categorical features are one-hot encoded with pd.get_dummies.
# NOTE(review): possibly a leftover from an earlier approach - confirm before removing.
encoder = LabelEncoder()

In [24]:
# Read the passenger table from the notebook's working directory.
data = pd.read_csv('./titanic.csv')

target = "Survived"
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Label vector.
y = data[target]

# Feature matrix: one-hot encode the two categorical columns, then replace
# missing Age / Fare values with the median of the respective column.
# NOTE(review): the medians are computed on the full table before the
# train/test split below, so test rows contribute to the imputation values.
X = pd.get_dummies(data[features], columns=["Sex", "Embarked"])
X = X.assign(
    Age=X["Age"].fillna(X["Age"].median()),
    Fare=X["Fare"].fillna(X["Fare"].median()),
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Fit the project's GaussianNBClassifier (imported as CustomNBClassifier) on the
# training split and time only the fit() call.
custom_nb = CustomNBClassifier()

# time.perf_counter() is the clock with the highest available resolution for
# measuring short durations. time.time() is wall-clock time and can be too
# coarse on some platforms to resolve an interval of a few milliseconds.
custom_start_time = time.perf_counter()
custom_nb.fit(X_train, y_train)
custom_train_time = time.perf_counter() - custom_start_time
print(f"Custom model training time: {custom_train_time:.3f} seconds")

Custom model training time: 0.002 seconds


In [26]:
# Score the fitted custom classifier on the held-out test split.
custom_pred = custom_nb.predict(X_test)
custom_accuracy = accuracy_score(y_true=y_test, y_pred=custom_pred)
print(f"Custom model accuracy: {custom_accuracy:.3f}")

Custom model accuracy: 0.777


In [27]:
# Manual 5-fold cross-validation of the custom classifier. The shuffled, seeded
# splitter is stored in `k_fold`; the scikit-learn cross-validation cell passes
# the same object as its `cv` argument.
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
custom_cv_scores = []

# time.perf_counter() is the clock with the highest available resolution for
# measuring short durations. time.time() is wall-clock time and can be too
# coarse on some platforms to resolve an interval this short.
custom_cv_start_time = time.perf_counter()
for train_idx, test_idx in k_fold.split(X):
    # Positional indices from the splitter -> row subsets for this fold.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # A fresh model per fold so nothing learned on one fold carries over.
    model = CustomNBClassifier()
    model.fit(X_fold_train, y_fold_train)
    predictions = model.predict(X_fold_test)
    custom_cv_scores.append(accuracy_score(y_fold_test, predictions))

custom_cv_time = time.perf_counter() - custom_cv_start_time
print(f"Custom model CV accuracy: {np.mean(custom_cv_scores):.3f} (±{np.std(custom_cv_scores):.3f})")
print(f"Custom model CV time: {custom_cv_time:.3f} seconds")

Custom model CV accuracy: 0.789 (±0.024)
Custom model CV time: 0.023 seconds


In [28]:
# Fit scikit-learn's GaussianNB on the same training split and time only the
# fit() call, as the reference for the custom implementation.
sklearn_nb = GaussianNB()

# time.perf_counter() is the clock with the highest available resolution for
# measuring short durations. time.time() is wall-clock time and can be too
# coarse on some platforms to resolve an interval of a few milliseconds.
sklearn_start_time = time.perf_counter()
sklearn_nb.fit(X_train, y_train)
sklearn_train_time = time.perf_counter() - sklearn_start_time
print(f"Scikit-learn training time: {sklearn_train_time:.3f} seconds")

Scikit-learn training time: 0.002 seconds


In [29]:
# Score the fitted scikit-learn model on the same held-out test split.
sklearn_predictions = sklearn_nb.predict(X_test)
sklearn_accuracy = accuracy_score(y_true=y_test, y_pred=sklearn_predictions)
print(f"Scikit-learn model accuracy: {sklearn_accuracy:.3f}")

Scikit-learn model accuracy: 0.777


In [30]:
# Cross-validate scikit-learn's GaussianNB with the `k_fold` splitter. No
# `scoring` argument is passed, so cross_val_score uses the estimator's default scorer.
# time.perf_counter() is the clock with the highest available resolution for
# measuring short durations. time.time() is wall-clock time and can be too
# coarse on some platforms to resolve an interval this short.
sklearn_cv_start_time = time.perf_counter()
sklearn_cv_scores = cross_val_score(GaussianNB(), X, y, cv=k_fold)
sklearn_cv_time = time.perf_counter() - sklearn_cv_start_time
print(f"Scikit-learn CV accuracy: {np.mean(sklearn_cv_scores):.3f} (±{np.std(sklearn_cv_scores):.3f})")
print(f"Scikit-learn CV time: {sklearn_cv_time:.3f} seconds")

Scikit-learn CV accuracy: 0.789 (±0.024)
Scikit-learn CV time: 0.012 seconds


In [31]:
def _time_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is not positive.

    The measured intervals are only a few milliseconds, so a timer reading of
    exactly 0.0 is possible; dividing by it would raise ZeroDivisionError.
    """
    return numerator / denominator if denominator > 0 else float("nan")


# Ratios of custom time to scikit-learn time. A value below 1 means the custom
# model was actually faster, despite the "slower" wording in the message.
train_time_ratio = _time_ratio(custom_train_time, sklearn_train_time)
cv_time_ratio = _time_ratio(custom_cv_time, sklearn_cv_time)

print(f"Training time: Custom is {train_time_ratio:.3f}x slower than scikit-learn")
print(f"CV time: Custom is {cv_time_ratio:.3f}x slower than scikit-learn")
print(f"Accuracy difference: {custom_accuracy - sklearn_accuracy:.3f}")
print(f"CV accuracy difference: {np.mean(custom_cv_scores) - np.mean(sklearn_cv_scores):.3f}")

Training time: Custom is 0.858x slower than scikit-learn
CV time: Custom is 1.906x slower than scikit-learn
Accuracy difference: 0.000
CV accuracy difference: 0.000
