# What is classification?

Classification is a prediction problem of approximating a mapping function (f) from input variables (**X**) to **discrete** output variables (**y**). In classification, we have a predefined set of classes to which we intend to map our features.

In [None]:
#import necessary libraries
import random
import warnings
import numpy as np
import pandas as pd
from matplotlib import style
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import preprocessing, neighbors
from sklearn.model_selection import train_test_split

## KNN from scratch
The principle behind KNN is computing distance between points in order to classify them on the basis of their closeness.

One common distance metric is the Euclidean distance. Given two points *a* and *b* in *N* dimensions, the Euclidean distance between them is given by:

$\sqrt{\sum_{i=1}^{N} (a_i-b_i)^2}$


In [None]:
# Worked example: Euclidean distance between two 2-D points
a, b = [1, 3], [2, 5]

# square the per-coordinate differences, add them up, then take the root
squared_diffs = [(a_i - b_i) ** 2 for a_i, b_i in zip(a, b)]
euc_d = np.sqrt(sum(squared_diffs))
euc_d

In [None]:
# Toy training set: each key is a class label and each value is the list of
# 2-D points belonging to that class. The first letter of each key ('g', 'r')
# is used below as the matplotlib colour code for that class.
dataset = {'green' : [[1, 2], [2, 3], [3, 1]], 'red' : [[6, 5], [7, 7], [8, 6]]}
# Unlabelled point that is classified with KNN further down
new_features = [5, 7]

# Plain loops rather than a nested list comprehension: the plotting calls are
# made purely for their side effects, so there is no list worth building.
for group, points in dataset.items():
    for px, py in points:
        plt.scatter(px, py, s=100, color=group[0])
# Uncomment to overlay the unlabelled point in blue:
#plt.scatter(new_features[0], new_features[1], color='b', s=100 )
plt.show()

In [None]:
# let's perform a simple knn classification on the data

def k_nearest_neighbors(data, test_data, k=3):
    """Classify ``test_data`` by majority vote among its ``k`` nearest points.

    Parameters
    ----------
    data : dict
        Maps each group label to a list of feature vectors in that group.
    test_data : sequence of numbers
        Feature vector to classify; must have the same length as the
        feature vectors stored in ``data``.
    k : int, optional
        Number of closest points that take part in the vote (default 3).

    Returns
    -------
    The group label that occurs most often among the ``k`` closest points.

    Warns
    -----
    UserWarning
        When the number of groups in ``data`` is greater than or equal to
        ``k``.

    Raises
    ------
    IndexError
        When there are no votes to count (``data`` holds no points, or the
        slice of ``k`` neighbours is empty).
    """
    if len(data) >= k:
        warnings.warn('k is set to a value less than total voting groups!')

    # The query point is the same for every comparison, so convert it once
    # instead of rebuilding the array inside the inner loop.
    query = np.array(test_data)

    distances = []
    # compute euclidean distance between the test point and all data points in the dataset
    for group in data:
        for features in data[group]:
            euc_d = np.linalg.norm(np.array(features) - query)
            distances.append([euc_d, group])

    # Sorting the [distance, group] pairs orders them by distance first
    # (ties fall back to comparing the group labels); keep the k closest.
    votes = [group for _, group in sorted(distances)[:k]]
    # most_common(1) yields [(label, count)]; return just the label
    vote_result = Counter(votes).most_common(1)[0][0]

    return vote_result

# Classify the unlabelled point against the toy dataset and show the winning label
result = k_nearest_neighbors(data=dataset, test_data=new_features, k=3)
print(result)

In [None]:
# Read the Wisconsin breast cancer data into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer='data/breast-cancer-wisconsin.data')
df.head()

In [None]:
# Missing entries are marked with '?'; swap them for an arbitrary extreme
# sentinel value (-99999) instead of discarding the rows.
df.replace(to_replace='?', value=-99999, inplace=True)

# Alternative: drop incomplete rows instead
#df.dropna(inplace=True)

In [None]:
# Drop the 'id' column, which is treated here as carrying no useful signal.
# The column is named via `columns=` because passing the axis positionally
# (`drop(['id'], 1)`) raises a TypeError on pandas >= 2.0.
df.drop(columns=['id'], inplace=True)
df.head()

In [None]:
# Cast every column to float (some values were read as strings) and turn the
# frame into a plain list of rows. One assignment is enough: only the last
# expression of a cell is rendered, so an intermediate preview would be lost.
full_data = df.astype(float).values.tolist()
full_data[:10]

In [None]:
# Shuffle the rows in place so the train/test split does not follow file order.
# A dedicated generator with a fixed seed makes the shuffle (and therefore the
# reported accuracy) reproducible on a fresh kernel, without reseeding the
# global `random` state. 42 matches the `random_state` used for the
# scikit-learn split.
random.Random(42).shuffle(full_data)

In [None]:
test_size = 0.2

# create dictionaries for train and test data with classes as the keys
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}

# Split data into train and test data on an explicit index. A negative slice
# (`full_data[:-n]` / `full_data[-n:]`) misbehaves when n rounds down to 0:
# `[:-0]` is empty and `[-0:]` is the whole list, which would leave nothing to
# train on.
n_test = int(test_size * len(full_data))
split_index = len(full_data) - n_test
train_data = full_data[:split_index]
test_data = full_data[split_index:]

# The last element of each row is the class label; the rest are the features.
for row in train_data:
    train_set[row[-1]].append(row[:-1])

for row in test_data:
    test_set[row[-1]].append(row[:-1])

In [None]:
# Score the from-scratch classifier: predict every held-out sample from the
# training set and count how often the vote matches the true group.

correct = 0
total = 0

for group, samples in test_set.items():
    predictions = [k_nearest_neighbors(train_set, sample, k=5) for sample in samples]
    correct += sum(1 for vote in predictions if group == vote)
    total += len(predictions)

accuracy = correct / total
print('Accuracy: ', accuracy)
        

In [None]:
# split the data into features, X, and targets, y.
# The label column is named via `columns=` because passing the axis
# positionally (`drop(['class'], 1)`) raises a TypeError on pandas >= 2.0.
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

# split data into training and validation data (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build scikit-learn's k-nearest-neighbours classifier with its default
# settings and fit it on the training split (fit() returns the estimator).
clf = neighbors.KNeighborsClassifier().fit(X_train, y_train)

# Score the fitted model on the held-out split and report the accuracy
accuracy = clf.score(X_test, y_test)
print('Accuracy: ', accuracy)