In [7]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score
from sklearn.metrics import roc_curve, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="ticks")

In [8]:
data = pd.read_csv('C:\\Users\\icosane\\Documents\\univ\\archive (2)\\apple_quality.csv')

In [9]:
data.dtypes

A_id             int64
Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity        float64
Quality         object
dtype: object

In [10]:
data.isnull().sum()

A_id           0
Size           0
Weight         0
Sweetness      0
Crunchiness    0
Juiciness      0
Ripeness       0
Acidity        0
Quality        0
dtype: int64

In [11]:
data.head()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good
1,1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good
2,2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3,3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good
4,4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good


In [12]:
target = []
for i in data['Quality']:
  if i == 'good':
    target.append(0)
  if i == 'bad':
    target.append(1)

#target = np.array(target)
data['target'] = target

In [13]:
data.head()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality,target
0,0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good,0
1,1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good,0
2,2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad,1
3,3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good,0
4,4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good,0


In [14]:
# Значения целевого признака
np.unique(data.target)

array([0, 1], dtype=int64)

In [15]:
#data = pd.DataFrame(data)

In [16]:
q = data[['Size','Weight','Sweetness','Crunchiness','Juiciness','Ripeness','Acidity']]

In [17]:
q.iloc[:5].to_numpy()

dd = q.iloc[:].to_numpy()

Разделение выборки на обучающую и тестовую

In [19]:
apple_X_train, apple_X_test, apple_y_train, apple_y_test = train_test_split(
dd, data.target, test_size=0.3, random_state=1)

In [20]:
# Размер обучающей выборки
apple_X_train.shape, apple_y_train.shape

((2800, 7), (2800,))

In [21]:
# Размер тестовой выборки
apple_X_test.shape, apple_y_test.shape

((1200, 7), (1200,))

In [22]:
from operator import itemgetter
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import math
from enum import Enum
class PredictionType(Enum):
    CLASSIFICATION = 1
    REGRESSION = 2

In [None]:
class SimpleKNN:
    
    def fit(self, X_train: np.matrix, y_train: np.ndarray):
        #Сохраняем параметры в переменных класса
        self._X_train = X_train
        self._y_train = y_train
          
    def eucl_dist(self, p: np.ndarray, q: np.ndarray) -> float:
        return math.sqrt(sum([(pi - qi) ** 2 for pi, qi in zip (p, q)]))
            
            
    def predict_for_single_object(self, K: int, \
                prediction_type: PredictionType, \
                X_o: np.ndarray, \
                verbose = True) -> np.ndarray:
        
        neighbors_list = []
        
        for i in range(self._X_train.shape[0]):
            
            data_train_current_x = [x for x in self._X_train[['x1','x2']].iloc[i]]
           
            data_train_current_y = self._y_train[i]
            
            dist = self.eucl_dist(X_o, data_train_current_x)
           
            temp_res = (data_train_current_y, dist, data_train_current_x)
            neighbors_list.append(temp_res)
         
        neighbors_list_sorted = sorted(neighbors_list, key=itemgetter(1))
        if verbose:
            print()
            print('**************************************')
            print('Проверяемая точка: ', X_o)
            print('**************************************')
            print('Вывод отсортированного списка соседей:')    
            dist_list = []
            for cur_y, cur_dist, temp_x_1_2 in neighbors_list_sorted:
                temp_x1, temp_x2 = temp_x_1_2
                print('X1={0}, X2={1}, y={2}, расстояние={3:.2f}'.format(temp_x1, temp_x2, cur_y, cur_dist))
                dist_list.append(cur_dist)
            print()
            print('Вывод расстояния для отсортированного списка соседей:')    
            plt.plot(dist_list)
            plt.show()
        # Оставим только K ближайших соседей
        K_neighbors_list_sorted = neighbors_list_sorted[:K]
        if verbose:
            print('Вывод К ближайших соседей:')
            x1_list = []
            x2_list = []
            for cur_y, cur_dist, temp_x_1_2 in K_neighbors_list_sorted:
                temp_x1, temp_x2 = temp_x_1_2
                x1_list.append(temp_x1)
                x2_list.append(temp_x2)
                print('X1={0}, X2={1}, y={2}, расстояние={3:.2f}'.format(temp_x1, temp_x2, cur_y, cur_dist))
            print()
            print('Визуализация К ближайших соседей:')
            plt.plot(self._X_train['x1'], self._X_train['x2'], 'b.', \
                     x1_list, x2_list,  'g*', \
                    [X_o[0]], [X_o[1]], 'ro')
            plt.show()   
        
        if prediction_type == PredictionType.REGRESSION: 
            arr = np.array([x for x,_,_ in K_neighbors_list_sorted])
            return np.mean(arr)          
        elif prediction_type == PredictionType.CLASSIFICATION:
            k_y_list = [y for y,_,_ in K_neighbors_list_sorted]
            
            k_y_list_grouped_temp = np.unique(k_y_list, return_counts=True)
            k_y_list_grouped = [[key, cnt] for key, cnt in zip(k_y_list_grouped_temp[0], k_y_list_grouped_temp[1])]
            # сортируем по количеству по убыванию
            k_y_list_grouped_sorted = sorted(k_y_list_grouped, key=itemgetter(1), reverse=True)
            if verbose:
                print('Классы, соответствующие К ближайшим соседям:')
                for i in k_y_list_grouped_sorted:
                    print('класс={0}, количество элементов={1}'.format(i[0], i[1]))
            return k_y_list_grouped_sorted[0][0]
        else:
            raise Exception('Неизвестный тип предсказания')
                   
    
    def predict(self, K: int, \
                prediction_type: PredictionType, \
                X_test: np.matrix, 
                verbose = True) -> np.ndarray:
        # Перебираем все точки тестовой выборки
        test_data_temp = []
        for i in range(X_test.shape[0]):
            # получаем текущую точку
            data_test_current_x = [x for x in X_test.iloc[i]]
            test_data_temp.append(data_test_current_x)       
        return [self.predict_for_single_object(K=K, \
                prediction_type=prediction_type, \
                X_o=i, verbose=verbose) for i in test_data_temp]
                      
            


In [27]:
np.unique(apple_y_train)

array([0, 1], dtype=int64)

In [28]:
np.unique(apple_y_test)

array([0, 1], dtype=int64)

In [30]:
def class_proportions(array: np.ndarray) -> Dict[int, Tuple[int, float]]:
    labels, counts = np.unique(array, return_counts=True)

    counts_perc = counts/array.size

    res = dict()
    for label, count2 in zip(labels, zip(counts, counts_perc)):
        res[label] = count2
    return res
def print_class_proportions(array: np.ndarray):
    proportions = class_proportions(array)
    if len(proportions)>0:
        print('Метка \t Количество \t Процент встречаемости')
    for i in proportions:
        val, val_perc = proportions[i]
        val_perc_100 = round(val_perc * 100, 2)
        print('{} \t {} \t \t {}%'.format(i, val, val_perc_100))

In [32]:
# В исходной выборке нет явного дисбаланса классов для целевого признака
print_class_proportions(data.target)

Метка 	 Количество 	 Процент встречаемости
0 	 2004 	 	 50.1%
1 	 1996 	 	 49.9%


In [33]:
# Для обучающей выборки
print_class_proportions(apple_y_train)

Метка 	 Количество 	 Процент встречаемости
0 	 1422 	 	 50.79%
1 	 1378 	 	 49.21%


In [34]:
# Для тестовой выборки
print_class_proportions(apple_y_test)

Метка 	 Количество 	 Процент встречаемости
0 	 582 	 	 48.5%
1 	 618 	 	 51.5%


Использование классов scikit-learn

In [40]:
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

Классификация

In [41]:
# 2 ближайших соседа
cl1_1 = KNeighborsClassifier(n_neighbors=2)
cl1_1.fit(apple_X_train, apple_y_train)
target1_1 = cl1_1.predict(apple_X_test)
len(target1_1), target1_1

(1200, array([1, 0, 0, ..., 0, 0, 1], dtype=int64))

In [42]:
# 10 ближайших соседей
cl1_2 = KNeighborsClassifier(n_neighbors=10)
cl1_2.fit(apple_X_train, apple_y_train)
target1_2 = cl1_2.predict(apple_X_test)
len(target1_2), target1_2

(1200, array([1, 0, 1, ..., 0, 0, 1], dtype=int64))

Регрессия

In [43]:
KNeighborsRegressorObj = KNeighborsRegressor()
KNeighborsRegressorObj

In [44]:
KNeighborsRegressorObj.fit(apple_X_train, apple_y_train)
KNeighborsRegressorObj.predict(apple_y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[1 0 1 ... 0 0 1].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.