In [9]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import warnings
import pandas as pd
import random

# Dataset

In [10]:
df = pd.read_csv('breast-cancer-wisconsin.data')
df.replace('?', -999999, inplace=True)
df.drop(['id'], axis=1, inplace=True)
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)

test_size = 0.4
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
train_data = full_data[:-int(test_size*len(full_data))]
test_data = full_data[-int(test_size*len(full_data)):]

for i in train_data:
    train_set[i[-1]].append(i[:-1])

for i in test_data:
    test_set[i[-1]].append(i[:-1])

# Define KNN model

In [11]:
def k_nearest_neighboors(data, predict, k=3):
    if len(data) >= 3:
        warnings.warn('K is set to a value less than total voting groups!')

    distances = []
    
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    
    votes = [i[1] for i in sorted(distances)[:k]]
    #print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    confidence = Counter(votes).most_common(1)[0][1] / k
    
    return vote_result, confidence

# Train model and evaluate error

In [12]:

correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        
        vote, confidence = k_nearest_neighboors(train_set, data, k=5)
        
        if group == vote:
            correct += 1
        total += 1


print('Accuracy:', correct / total)

Accuracy: 0.953405017921147


# Compare with KNN from scikit-learn

## Clean and process data

In [13]:
from sklearn import preprocessing, neighbors
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier  # Добавлен импорт

In [14]:
df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,MitosesNormal_nucleoli,Diagnose
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [15]:
X = df.drop(['Diagnose'], axis=1)
y = df['Diagnose']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True)

# Create and fit model

In [16]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Scoring model

In [17]:
scoring = ['accuracy', 'precision_macro', 'recall_macro']
cv_results = cross_validate(model, X_test, y_test, cv=5, scoring=scoring, return_train_score=True)

print("Test Accuracy:", cv_results['test_accuracy'])
print("Mean Test Accuracy: {:.4f}".format(cv_results['test_accuracy'].mean()))
print("Train Accuracy:", cv_results['train_accuracy'])

Test Accuracy: [0.96428571 0.96428571 1.         0.92857143 1.        ]
Mean Test Accuracy: 0.9714
Train Accuracy: [0.98214286 0.98214286 0.97321429 0.98214286 0.98214286]
