# Задание 2

In [12]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the Iris data set: `iris.data` holds the features, `iris.target` the class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Append the labels as the last column so that features and labels are permuted together.
# column_stack turns a 1-D array into a column on its own, so no reshape is needed.
data = np.column_stack((X, y))

# Shuffle the rows with a seeded generator: an unseeded shuffle makes the later
# train/test split (and therefore the reported accuracy) differ on every run.
rng = np.random.default_rng(42)
data = rng.permutation(data)
print(data.shape)

(150, 5)


In [13]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Every column except the last is a feature; the last column is taken as the label.
X_train = train_data[:, :-1]
y_train = train_data[:, -1]
X_test = test_data[:, :-1]
y_test = test_data[:, -1]

# Sanity check: print the shape of each of the four arrays.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(120, 4)
(120,)
(30, 4)
(30,)


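1. На трейне для каждого класса считаем среднее и стандартное отклонение каждого признака, а также априорную вероятность класса.
2. На тесте для каждого объекта считаем логарифм (ненормированной) апостериорной вероятности каждого класса: $\log P(c) + \sum_j \log p(x_j \mid c)$.
3. Выбираем класс с максимальной апостериорной вероятностью.
4. Профит: сравниваем предсказания с истинными метками и считаем долю верных ответов (accuracy).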

In [21]:
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, independently per class, by a normal
    distribution whose mean and standard deviation are estimated in `fit`.
    `predict` returns the class with the largest log joint score
    ``log P(c) + sum_j log p(x_j | c)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data that
        is added to every per-class variance. It keeps the standard deviation
        strictly positive when a feature is constant within a class.

    Attributes (set by `fit`)
    -------------------------
    classes : ndarray of shape (n_classes,), sorted unique labels
    n_features : int
    means : ndarray of shape (n_classes, n_features)
    stds : ndarray of shape (n_classes, n_features), smoothed, always > 0
    priors : ndarray of shape (n_classes,), class frequencies in the training data
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, standard deviations and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self, so that calls can be chained.

        Raises
        ------
        ValueError
            If `X` is not a non-empty 2-D array or `y` does not hold exactly
            one label per row of `X`.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2 or X.size == 0:
            raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one label per row of X")

        self.classes = np.unique(y)
        self.n_features = X.shape[1]

        n_classes = len(self.classes)
        self.means = np.zeros((n_classes, self.n_features))
        self.stds = np.zeros((n_classes, self.n_features))
        self.priors = np.zeros(n_classes)

        # Additive variance smoothing, scaled by the data so it is unit-independent.
        epsilon = self.var_smoothing * X.var(axis=0).max()
        # Absolute floor for the fully degenerate case where every feature is constant
        # over the whole training set (epsilon is then 0 as well).
        min_variance = np.finfo(float).tiny

        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.means[i, :] = X_c.mean(axis=0)
            self.stds[i, :] = np.sqrt(np.maximum(X_c.var(axis=0) + epsilon, min_variance))
            self.priors[i] = len(X_c) / len(X)

        return self

    @staticmethod
    def gaussian_pdf(x, mean, std):
        """Density of the normal distribution N(mean, std**2) evaluated at `x`.

        Kept for callers that need the density itself; `predict` works with the
        log-density instead because the density underflows to 0 far from the mean.
        """
        exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
        return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

    @staticmethod
    def _gaussian_log_pdf(x, mean, std):
        """Log-density of N(mean, std**2) at `x`, computed without exponentiating."""
        return -0.5 * np.log(2 * np.pi) - np.log(std) - ((x - mean) ** 2) / (2 * std ** 2)

    def predict(self, X):
        """Predict a class label for every row of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,) with the dtype of `self.classes`.

        Raises
        ------
        ValueError
            If `X` is non-empty and is not 2-D with `self.n_features` columns.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            # Nothing to classify: return an empty array of the label dtype.
            return self.classes[:0]
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError(
                "X must have shape (n_samples, %d), got %r" % (self.n_features, X.shape)
            )

        # joint[i, j] = log P(class j) + sum over features of log p(x_i | class j).
        joint = np.empty((X.shape[0], len(self.classes)))
        for j in range(len(self.classes)):
            log_likelihood = self._gaussian_log_pdf(X, self.means[j], self.stds[j]).sum(axis=1)
            joint[:, j] = np.log(self.priors[j]) + log_likelihood

        return self.classes[np.argmax(joint, axis=1)]

    def score(self, X, y):
        """Return the accuracy: the fraction of rows of `X` whose prediction equals `y`."""
        y_pred = self.predict(X)
        return np.mean(y_pred == np.asarray(y))

# Fit the classifier on the training split.
clf = NaiveBayesClassifier()
clf.fit(X=X_train, y=y_train)

# Share of test samples whose predicted label matches the true one.
accuracy = clf.score(X=X_test, y=y_test)
print(accuracy)


0.8666666666666667
