In [2]:
!pip install scikit-learn



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import mnist
import sklearn.metrics
import sklearn.model_selection

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Stride for range()-based index loops in this notebook; 1 visits every index.
STEP                  = 1

In [None]:
# Geometry of the preview grid: at most ROWS * COLUMNS digits fit on one figure.
ROWS                  = 4
COLUMNS               = 4

def display_MNIST_digits(images_matrices, digits, image_number):
    """Show `image_number` randomly chosen digits on a ROWS x COLUMNS grid.

    Parameters
    ----------
    images_matrices : sequence of array-like
        Images; every drawn item is reshaped to 28 x 28 before display.
    digits : sequence
        Label of each image, indexed the same way as `images_matrices`.
    image_number : int
        How many distinct images to draw. Must be at least 1 and must not
        exceed either the grid capacity or the number of available images.

    Returns
    -------
    None
        The figure is shown as a side effect. When `image_number` is out of
        range a message is printed and nothing is drawn.
    """
    grid_capacity = ROWS * COLUMNS

    # Validate before any figure exists, so a rejected call does not leave an
    # empty figure behind. The range check also covers zero and negative
    # counts and counts larger than the data set (sampling is without
    # replacement).
    if not 1 <= image_number <= min(grid_capacity, len(images_matrices)):
        print("Неверно задано количество MNIST-цифр для их отображения")
        return

    # squeeze = False keeps `axes` a 2-D array even for a 1 x 1 grid.
    fig, axes = plt.subplots(ROWS, COLUMNS, figsize = (10, 10), dpi = 800, squeeze = False)
    axes = axes.ravel()

    indexes = np.random.choice(len(images_matrices), image_number, replace = False)
    for (i, index) in enumerate(indexes):
        axes[i].imshow(images_matrices[index].reshape(28, 28), cmap = "gray")
        axes[i].set_title(f"MNIST-цифра: {digits[index]}")
        axes[i].axis("off")

    # Hide grid cells that received no image, so no empty frames are drawn.
    for unused_axis in axes[image_number : ]:
        unused_axis.axis("off")

    plt.suptitle("MNIST-цифры")
    plt.show()

In [None]:
class my_PCA():
    """Principal component analysis based on the SVD of the scatter matrix.

    `fit` centres the data, decomposes `X_centered.T @ X_centered` and keeps
    the leading `n_components` eigenvectors; `transform` projects centred data
    onto those eigenvectors.
    """

    def __init__(self, n_components = None):
        """Create an unfitted model.

        Parameters
        ----------
        n_components : int or None
            Number of leading principal components to keep; `None` keeps all.

        Raises
        ------
        ValueError
            If `n_components` is given but is smaller than 1.
        """
        if n_components is not None and n_components < 1:
            raise ValueError("n_components must be None or a positive integer")

        self.__n_components                 = n_components
        self.__mean                         = None
        self.__eigvals                      = None
        self.__ratio_eigvals                = None
        self.__ratio_cumsum_eigvals         = None
        self.__eigvecs                      = None
        self.__eigvecs_choosen              = None

    def fit(self, X):
        """Estimate the mean, eigenvalues and eigenvectors from `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        my_PCA
            The fitted instance itself, to allow call chaining.
        """
        X = np.asarray(X)
        self.__mean = np.mean(X, axis = 0)
        X_centered = X - self.__mean

        # The scatter matrix is symmetric positive semi-definite, so its
        # singular values are its eigenvalues and the rows of V_T are the
        # corresponding eigenvectors.
        _, singular_values, V_T = np.linalg.svd(X_centered.T @ X_centered, full_matrices = False)
        eigvals = np.asarray(singular_values, dtype = np.float64)
        eigvecs = np.asarray(V_T.T, dtype = np.float64)   # columns are eigenvectors

        # Order components by decreasing eigenvalue.
        sorted_indexes                      = np.argsort(eigvals)[ : : -1]
        self.__eigvals                      = eigvals[sorted_indexes]
        self.__eigvecs                      = eigvecs[ : , sorted_indexes]

        if self.__n_components is not None:
            # Slicing clamps, so asking for more components than features
            # simply keeps all of them.
            self.__eigvecs_choosen          = self.__eigvecs[ : , : self.__n_components]
        else:
            self.__eigvecs_choosen          = self.__eigvecs

        self.__ratio_eigvals                = self.__eigvals / np.sum(self.__eigvals)
        self.__ratio_cumsum_eigvals         = np.cumsum(self.__ratio_eigvals)

        return self

    def transform(self, X):
        """Project `X` onto the selected principal components.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            Centred data multiplied by the matrix of kept eigenvectors.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.__mean is None or self.__eigvecs_choosen is None:
            raise RuntimeError("my_PCA is not fitted yet: call fit() before transform()")

        X_centered = np.asarray(X) - self.__mean
        return X_centered @ self.__eigvecs_choosen

    def fit_transform(self, X):
        """Fit the model on `X` and return the projection of the same `X`."""
        self.fit(X)
        return self.transform(X)

    def get_ratio_eigvals(self):
        """Return each eigenvalue divided by the sum of all eigenvalues
        (sorted in decreasing order); `None` before `fit`."""
        return self.__ratio_eigvals

    def get_ratio_cumsum_eigvals(self):
        """Return the cumulative sum of the eigenvalue ratios; `None` before `fit`."""
        return self.__ratio_cumsum_eigvals

In [None]:
class my_kNN:
    """Brute-force k-nearest-neighbours classifier with Euclidean distance.

    A query is assigned the most frequent label among its `n_neighbours`
    closest training samples. When several labels are equally frequent, the
    smallest label wins (`np.unique` returns labels sorted and `np.argmax`
    picks the first maximum).
    """

    def __init__(self, n_neighbours = 10):
        """Create an unfitted classifier.

        Parameters
        ----------
        n_neighbours : int
            Number of nearest training samples that vote for the label.

        Raises
        ------
        ValueError
            If `n_neighbours` is smaller than 1.
        """
        if n_neighbours < 1:
            raise ValueError("n_neighbours must be a positive integer")

        self.__n_neighbours                          = n_neighbours
        self.__X_train                               = None
        self.__y_train                               = None

    def fit(self, X, y):
        """Memorise the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        my_kNN
            The instance itself, to allow call chaining.

        Raises
        ------
        ValueError
            If `X` and `y` contain a different number of samples.
        """
        # Converting to arrays makes plain lists usable: `predict` indexes the
        # labels with an index array, which a Python list does not support.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.__X_train = X
        self.__y_train = y

        return self

    def predict(self, X):
        """Predict a label for every row of `X`.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray
            One predicted label per query row.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted yet.
        """
        if self.__X_train is None or self.__y_train is None:
            raise RuntimeError("my_kNN is not fitted yet: call fit() before predict()")

        predictions = []
        for x in np.asarray(X):
            distances                                    = self.__calc_euclidean_distance(self.__X_train, x)
            # Slicing clamps: with fewer training samples than n_neighbours,
            # all of them vote.
            k_nearest_indexes                            = np.argsort(distances)[ : self.__n_neighbours]
            k_nearest_labels                             = self.__y_train[k_nearest_indexes]
            unique_labels, unique_labels_number          = np.unique(k_nearest_labels, return_counts = True)
            most_common_label                            = unique_labels[np.argmax(unique_labels_number)]
            predictions.append(most_common_label)

        return np.array(predictions)

    def __calc_euclidean_distance(self, x1, x2):
        """Return the Euclidean distance from every row of `x1` to vector `x2`."""
        # Subtract in float64: with unsigned integer inputs (e.g. uint8 pixels)
        # a subtraction in the input dtype would wrap around silently.
        difference = np.subtract(x1, x2, dtype = np.float64)
        return np.sqrt(np.sum(difference ** 2, axis = 1))

In [None]:
# Load the three MNIST splits.
# Per the original author's note: `train` is a tuple whose first element holds
# the 28 x 28 pixel images (one 28 x 28 matrix per image) and whose second
# element holds the digits themselves, as plain numbers, that those images
# encode.
train, validation, test      = mnist.load_mnist()

# Flatten every image to one row and every label array to one dimension.
train_sample_count           = train[0].shape[0]
train_images                 = train[0].reshape(train_sample_count, -1).astype("int32")
train_labels                 = train[1].reshape(train[1].shape[0]).astype("int32")

# Report the shapes of the validation images and labels, then a blank line.
for part_index in (0, 1):
    print(validation[part_index].shape)
print()

# Preview 16 random training digits.
display_MNIST_digits(train_images, train_labels, 16)

In [None]:
# Number of leading components shown on the plots.
UPPER_BOUND           = 100

# Fit PCA with all components on the full training set.
my_pca                = my_PCA()
train_images_pca      = my_pca.fit_transform(train_images)

# Styling shared by both panels.
component_numbers     = range(1, UPPER_BOUND + 1)
curve_style           = dict(color = "darkgreen", linestyle = "-", linewidth = 2.0)
zero_axis_style       = dict(color = "black", linestyle = "-", linewidth = 2.5)
axis_label_style      = dict(color = "yellow", fontsize = 20)

# Eigenvalue (explained variance) figure.
# NOTE(review): 20 x 10 inches at dpi = 1000 is a 20000 x 10000 pixel canvas -
# confirm that this resolution is intended.
plt.figure(figsize = (20, 10), dpi = 1000, edgecolor = "blue", facecolor = "grey")

# Panel 1: share of each eigenvalue in the total versus the component number.
plt.subplot(1, 2, 1)
plt.plot(component_numbers, my_pca.get_ratio_eigvals()[ : UPPER_BOUND], **curve_style)

plt.axhline(y = 0.0, **zero_axis_style)
plt.axvline(x = 0.0, **zero_axis_style)

plt.xlabel("Число учтённых компонент", **axis_label_style)
plt.ylabel("Отношения характеристических чисел ко всей суммы", **axis_label_style)

plt.xticks(color = "white", fontsize = 18)
plt.yticks(color = "white", fontsize = 18)

plt.title(
    "Зависимость " + r"$\frac{\lambda_{i}}{\sum_{i}{\lambda_{i}}}$" + " от числа учтённых компонент",
    color = "darkblue",
    fontsize = 18,
)
plt.grid(True, color = "gray", linestyle = "-")

# Panel 2: cumulative eigenvalue share versus the number of components.
plt.subplot(1, 2, 2)
plt.plot(component_numbers, my_pca.get_ratio_cumsum_eigvals()[ : UPPER_BOUND], **curve_style)

# Horizontal reference levels for the cumulative share.
reference_levels = [
    (1.0,  "black",  "100%"),
    (0.95, "green",  "95%"),
    (0.90, "orange", "90%"),
    (0.85, "purple", "85%"),
    (0.80, "blue",   "80%"),
]
for (level, level_color, level_label) in reference_levels:
    plt.axhline(y = level, color = level_color, linestyle = "-", label = level_label)

plt.axhline(y = 0.0, **zero_axis_style)
plt.axvline(x = 0.0, **zero_axis_style)

plt.xlabel("Число учтённых компонент", **axis_label_style)
plt.ylabel("Кумулятивная сумма собственных значений", **axis_label_style)

plt.xticks(color = "white", fontsize = 16)
plt.yticks(color = "white", fontsize = 16)

plt.title("Зависимость кумулятивной суммы от числа учтённых компонент", color = "darkblue", fontsize = 18)
plt.legend()
plt.grid(True, color = "gray", linestyle = "-")

plt.show()

# Tabulate the cumulative share for a few component counts.
print("\nЗначение кумулятивной суммы в зависимости от числа учтённых компонент:")
cumulative_ratios = my_pca.get_ratio_cumsum_eigvals()
for n_comp in [2, 5, 10, 15, 20, 30, 50, 100]:
    cum_sum = cumulative_ratios[n_comp - 1]
    print(f"{n_comp:3d} компонент: {cum_sum:.3f}")

In [None]:
# Project the training images onto the first two principal components.
my_plane_pca               = my_PCA(n_components = 2)
train_images_plane_pca     = my_plane_pca.fit_transform(train_images)

# NOTE(review): 20 x 10 inches at dpi = 1000 is a 20000 x 10000 pixel canvas -
# confirm that this resolution is intended.
plt.figure(figsize = (20, 10), dpi = 1000, edgecolor = "blue", facecolor = "gray")

# One point per image, coloured by its digit label.
first_component            = train_images_plane_pca[ : , 0]
second_component           = train_images_plane_pca[ : , 1]
scatter = plt.scatter(first_component, second_component, c = train_labels, cmap = "tab10", s = 2)

plt.colorbar(scatter, label = "Класс MNIST-цифры")

axis_label_style           = dict(color = "yellow", fontsize = 20)
plt.xlabel("1-ая главная компонента (x')", **axis_label_style)
plt.ylabel("2-ая главная компонента (y')", **axis_label_style)

tick_style                 = dict(color = "white", fontsize = 16)
plt.xticks(**tick_style)
plt.yticks(**tick_style)

plt.title(
    "Распределение MNIST-цифр по классам в пространстве первых 2 главных компонент",
    color = "darkblue",
    fontsize = 24,
)
plt.show()

In [None]:
# Draw a random training-subset size; the subset is the first `sample_size` samples.
sample_sizes   = [100, 500, 1000, 2000, 5000, 10000, 20000, 30000, 40000, 50000]
sample_size    = np.random.choice(sample_sizes)

# Slicing clamps at the array length, so no explicit min() is needed.
sub_train_images             = train_images[ : sample_size]
sub_train_labels             = train_labels[ : sample_size]

# Flatten the validation split the same way as the training split.
validation_images            = validation[0].reshape(len(validation[0]), -1).astype("int32")
validation_labels            = validation[1].reshape(len(validation[1])).astype("int32")

# Draw a random validation-subset size and take that many leading samples.
validation_sample_sizes      = [100, 500, 1000, 2500, 5000]
validation_sample_size       = np.random.choice(validation_sample_sizes)

sub_validation_images        = validation_images[ : validation_sample_size]
sub_validation_labels        = validation_labels[ : validation_sample_size]

# Baseline: kNN on raw pixels, without PCA (fit returns the classifier itself).
knn_original                 = my_kNN(n_neighbours = 5).fit(sub_train_images, sub_train_labels)

y_pred_original              = knn_original.predict(sub_validation_images)
accuracy_original            = accuracy_score(sub_validation_labels, y_pred_original)

print(f"Точность алгоритма kNN на заданных данных: {accuracy_original:.4f}")

In [None]:
# Tune kNN on PCA-reduced data: try every (components, neighbours) pair on the grid.
n_components_list       = [2, 5, 10, 15, 20, 30, 50, 100, 200, 500, 1000]
n_neighbours_list       = [1, 2, 5, 10, 20, 30, 40, 50, 60]

# Best result seen so far, plus the full (components, neighbours, accuracy) log.
best_accuracy           = 0.0
best_params             = dict(n_component = None, n_neighbour = None)
results                 = []

print("Число компонент | Соседи | Точность")

for n_component in n_components_list:
    # PCA is fitted on the training subset only; validation data is only projected.
    pca = my_PCA(n_components = n_component)
    sub_train_images_pca      = pca.fit_transform(sub_train_images)
    sub_validation_images_pca = pca.transform(sub_validation_images)

    for n_neighbour in n_neighbours_list:
        # fit returns the classifier itself, so construction and fitting chain.
        knn = my_kNN(n_neighbours = n_neighbour).fit(sub_train_images_pca, sub_train_labels)

        y_pred   = knn.predict(sub_validation_images_pca)
        accuracy = accuracy_score(sub_validation_labels, y_pred)

        results.append((n_component, n_neighbour, accuracy))
        print(f"{n_component:15d} | {n_neighbour:6d} | {accuracy:.4f}")

        # Strict comparison: on equal accuracy the earlier pair is kept.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params.update(n_component = n_component, n_neighbour = n_neighbour)

print(f"\nЛучшая точность:     {best_accuracy:.4f}")
print(f"Лучшие параметры:      {best_params}")

In [None]:
# Accuracy figure: one panel per tuned hyper-parameter.
# NOTE(review): 20 x 10 inches at dpi = 1000 is a 20000 x 10000 pixel canvas -
# confirm that this resolution is intended.
plt.figure(figsize = (20, 10), dpi = 1000, facecolor = "grey", edgecolor = "blue")

# Accuracy keyed by (components, neighbours). Iterating in reverse makes the
# first logged entry win if a pair was ever logged more than once.
accuracy_lookup  = {(comp, k): acc for (comp, k, acc) in reversed(results)}

zero_axis_style  = dict(color = "black", linestyle = "-", linewidth = 2.5)
axis_label_style = dict(color = "yellow", fontsize = 20)
tick_style       = dict(color = "white", fontsize = 16)
title_style      = dict(color = "darkblue", fontsize = 24)

# Accuracy per component count, with the number of neighbours fixed at the best one.
best_neighbours_num = best_params["n_neighbour"]
accuracies_by_components = []

for n_component in n_components_list:
    accuracy = accuracy_lookup.get((n_component, best_neighbours_num), 0)
    accuracies_by_components.append(accuracy)

# Panel 1: accuracy versus the number of components.
plt.subplot(1, 2, 1)
plt.plot(n_components_list, accuracies_by_components, "bo-", linewidth = 2.0)
plt.axhline(
    y           = accuracy_original,
    color       = "r",
    linestyle   = "-",
    linewidth   = 2.0,
    label       = f"Без PCA: {accuracy_original:.3f}",
)
plt.axvline(x = 0.0, **zero_axis_style)
plt.axhline(y = 0.0, **zero_axis_style)

plt.xlabel("Число главных компонент", **axis_label_style)
plt.ylabel("Точность", **axis_label_style)

plt.xticks(**tick_style)
plt.yticks(**tick_style)

plt.title(f"Точность и число компонент (n_neighbour = {best_neighbours_num})", **title_style)
plt.legend()
plt.grid(True)

# Accuracy per neighbour count, with the number of components fixed at the best one.
best_n_component = best_params["n_component"]
accuracies_by_neighbours = []
for n_neighbour in n_neighbours_list:
    accuracy = accuracy_lookup.get((best_n_component, n_neighbour), 0)
    accuracies_by_neighbours.append(accuracy)

# Panel 2: accuracy versus the number of neighbours.
plt.subplot(1, 2, 2)
plt.plot(n_neighbours_list, accuracies_by_neighbours, "ro-", linewidth = 2.0)
plt.axvline(x = 0.0, **zero_axis_style)
plt.axhline(y = 0.0, **zero_axis_style)

plt.xlabel("Число соседей (k)", **axis_label_style)
plt.ylabel("Точность", **axis_label_style)

plt.xticks(**tick_style)
plt.yticks(**tick_style)

plt.title(f"Точность и число соседей (n_components = {best_n_component})", **title_style)
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
print("Обучение финальной модели")

# Fit PCA on the full training set with the best number of components found
# by the grid search.
pca_final = my_PCA(
    n_components = best_params["n_component"],
)
train_images_final = pca_final.fit_transform(train_images)

# Build the evaluation set from the held-out `test` split (not from `train`)
# and only project it with the PCA fitted on the training data.
# NOTE(review): assumes `test` has the same (images, labels) layout as `train`
# and `validation` - confirm against mnist.load_mnist().
test_images          = test[0].reshape(test[0].shape[0], -1).astype("int32")
test_labels          = test[1].reshape(test[1].shape[0]).astype("int32")
test_images_final    = pca_final.transform(test_images)

# Train kNN with the best number of neighbours on the projected TRAINING
# images, so features and labels come from the same split.
knn_final = my_kNN(
    n_neighbours = best_params["n_neighbour"],
)
knn_final.fit(train_images_final, train_labels)

# Predictions on the held-out test set.
y_pred_final        = knn_final.predict(test_images_final)
accuracy_final      = accuracy_score(test_labels, y_pred_final)

print(f"Точность на финальной тестовой выборке: {accuracy_final:.4f}")