In [1]:
import copy
from time import time
import numpy as np
import pandas as pd
from NaiveBayesClassifier import NaiveBayesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('../data/iris.csv')
data['species'], _ = data['species'].factorize()
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
X = data.drop('species', axis=1)
y = data['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((120, 4), (120,), (30, 4), (30,))

In [4]:
custom_model = NaiveBayesClassifier()
start1 = time()
custom_model.fit(X_train, y_train)
end1 = time()

y_pred = custom_model.predict(X_test.to_numpy())
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [5]:
sklearn_model = GaussianNB()
start2 = time()
sklearn_model.fit(X_train, y_train)
end2 = time()

y_pred = sklearn_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [6]:
print(f'Время обучения кастомного алгоритма: {end1-start1:.3f} с')
print(f'Время обучения библиотечной реализации: {end2-start2:.3f} c')

Время обучения кастомного алгоритма: 0.01 с
Время обучения библиотечной реализации: 0.00 c


In [7]:
def cross_validate(model, X, y, n_folds=5):
    scores = []
    for n in range(n_folds):
        model_to_fit = copy.deepcopy(model)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        model_to_fit.fit(X_train, y_train)
        scores += [accuracy_score(y_test, model_to_fit.predict(X_test.to_numpy()))]
    return scores

In [8]:
n_folds = 20

custom_model = NaiveBayesClassifier()
sklearn_model = GaussianNB()

custom_scores = cross_validate(custom_model, X, y, n_folds=n_folds)
sklearn_scores = cross_validate(sklearn_model, X, y, n_folds=n_folds)

print(f"Средняя accuracy кастомного алгоритма на {n_folds} выборках: {np.mean(custom_scores):.3f}")
print(f"Средняя accuracy библиотечной реализации на {n_folds} выборках: {np.mean(sklearn_scores):.3f}")

Средняя accuracy кастомного алгоритма на 20 выборках: 0.966
Средняя accuracy библиотечной реализации на 20 выборках: 0.942


