## Lecture 6

In [1]:
#Load libraries
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from scipy import stats


### Determining Computer Brand in Lost and Found

In [2]:
#Column labels for the table; the CSV is read with header=None, so the labels come from this list
names = [
    'Keyboard_Light',
    'Edges',
    'Number_of_Ports',
    'Number_of_Hinges',
    'Brand',
]

#Load one dataset (uncomment one of the alternatives below to load a different file instead)
#data_frame = pd.read_csv('./LapTop_Classification.csv', header=None, names=names)
#data_frame = pd.read_csv('./LapTop_Classification_Dell_Apple_Other.csv', header=None, names=names)
data_frame = pd.read_csv(
    './LapTop_Classification_PCvsMac.csv',
    header=None,
    names=names,
)

#Display the loaded table
data_frame

Unnamed: 0,Keyboard_Light,Edges,Number_of_Ports,Number_of_Hinges,Brand
0,W,S,6,1,PC
1,W,R,8,2,PC
2,W,R,6,1,Mac
3,W,R,6,1,PC
4,W,R,8,1,Mac
5,W,R,7,2,PC
6,NW,S,10,1,PC
7,W,R,9,1,PC
8,W,R,8,1,PC
9,W,R,7,1,PC


### Process the data and normalize between 0 and 1:

In [3]:
#Encode the categorical columns as integers and rescale the numeric columns.
#Each step below produces the same values if this cell is executed more than once.

#Brand: keep the readable label as a categorical column and store its integer code in a new column
data_frame["Brand"] = data_frame["Brand"].astype('category')
data_frame["Brand_Encoding"] = data_frame["Brand"].cat.codes

#Keyboard_Light: replace each category with its integer code
data_frame["Keyboard_Light"] = data_frame["Keyboard_Light"].astype('category').cat.codes

#Edges: replace each category with its integer code
data_frame["Edges"] = data_frame["Edges"].astype('category').cat.codes

#Number_of_Hinges: shift so the smallest value becomes 0.
#Subtracting the column minimum instead of a hard-coded 1 gives the same values
#whenever the smallest hinge count is 1, and it does not keep lowering the column
#every time the cell is re-run.
hinges = data_frame["Number_of_Hinges"]
data_frame["Number_of_Hinges"] = hinges - hinges.min()

#Number_of_Ports: min-max scale to the range 0 to 1
ports = data_frame["Number_of_Ports"]
data_frame["Number_of_Ports"] = (ports - ports.min())/(ports.max() - ports.min())

#Keyboard_Light: divide the codes by the largest code so the values lie between 0 and 1
data_frame["Keyboard_Light"] = data_frame["Keyboard_Light"]/np.max(data_frame["Keyboard_Light"])

#Keep the labels and their integer encoding as plain NumPy arrays
class_label = np.array(data_frame['Brand'])
class_encoding = np.array(data_frame['Brand_Encoding'])

#Show data_frame
data_frame

Unnamed: 0,Keyboard_Light,Edges,Number_of_Ports,Number_of_Hinges,Brand,Brand_Encoding
0,1.0,1,0.333333,0,PC,1
1,1.0,0,0.555556,1,PC,1
2,1.0,0,0.333333,0,Mac,0
3,1.0,0,0.333333,0,PC,1
4,1.0,0,0.555556,0,Mac,0
5,1.0,0,0.444444,1,PC,1
6,0.5,1,0.777778,0,PC,1
7,1.0,0,0.666667,0,PC,1
8,1.0,0,0.555556,0,PC,1
9,1.0,0,0.444444,0,PC,1


In [4]:
#Functions for KNN

def predict(X_train, Y_train, X_test, k):
    """Predict the label of a single observation with k-nearest neighbors.

    Parameters
    ----------
    X_train : 2-D NumPy array with one training observation per row.
    Y_train : array of training labels, indexed by the same row numbers as X_train.
    X_test  : feature values of the single observation to classify.
    k       : number of nearest neighbors that vote; must satisfy 1 <= k <= len(X_train).

    Returns
    -------
    The most common label among the k nearest training rows, exactly as returned
    by ``stats.mode`` (the array shape of the result follows the shape of the
    Y_train entries and the conventions of the installed SciPy version).

    Raises
    ------
    ValueError if k is smaller than 1 or larger than the number of training rows.
    """
    n_train = len(X_train)
    if not 1 <= k <= n_train:
        raise ValueError(
            'k must be between 1 and the number of training samples '
            '(%d), got %r' % (n_train, k)
        )

    #Euclidean distance from the observation to every training row at once
    distances = np.sqrt(np.sum(np.square(X_train - X_test), axis=1))

    #Row indices ordered from closest to farthest; the stable sort keeps rows with
    #equal distance in index order, which is how the sorted [distance, index] pairs behaved
    nearest = np.argsort(distances, kind='stable')[:k]

    #Make a list of the k neighbors' targets
    k_closest = [Y_train[index] for index in nearest]

    #Return most common out of k_closest
    prediction, _ = stats.mode(k_closest)
    return prediction


def kNearestNeighbor(X_train, Y_train, X_test, k, Y_test=None):
    """Classify every row of X_test with k-nearest neighbors.

    Parameters
    ----------
    X_train : 2-D NumPy array with one training observation per row.
    Y_train : array of training labels, indexed by the same row numbers as X_train.
    X_test  : 2-D NumPy array with one observation to classify per row.
    k       : number of nearest neighbors that vote for each prediction.
    Y_test  : optional 2-D array of true labels for X_test; its first column is
              returned as ``true_labels``. When omitted, the notebook-level
              variable ``Y_test`` is used, so existing four-argument calls keep working.

    Returns
    -------
    predictions : 1-D NumPy array with one predicted label per row of X_test.
    true_labels : first column of Y_test.

    Raises
    ------
    NameError if Y_test is not passed and no notebook-level ``Y_test`` exists.
    """
    if Y_test is None:
        #Backward compatibility: fall back to the notebook-level Y_test
        if 'Y_test' not in globals():
            raise NameError(
                "Y_test was not passed and no global 'Y_test' is defined; "
                "pass the true labels as the Y_test argument"
            )
        Y_test = globals()['Y_test']

    #One prediction per test observation
    predictions = [
        predict(X_train, Y_train, X_test[i, :], k)
        for i in range(len(X_test))
    ]

    #Flatten to one value per test row. Reshaping by count (instead of squeezing a
    #fixed axis) works whatever number of dimensions each individual prediction has.
    predictions = np.asarray(predictions).reshape(len(predictions))
    true_labels = Y_test[:, 0]

    return predictions, true_labels

#### The code below is what you will change to modify your algorithm:

In [5]:
#_________________________Set up data_________________________________
#Features used by the classifier. To use all the features from the data instead, set:
#feature_columns = ['Keyboard_Light', 'Edges', 'Number_of_Ports', 'Number_of_Hinges']
feature_columns = ['Keyboard_Light']

computer_data_X = np.array(data_frame[feature_columns])
computer_data_Y = np.array(data_frame[['Brand_Encoding']])

#Hold out 33% of the rows for testing; the remaining rows are used for training
X_train, X_test, Y_train, Y_test = train_test_split(
    computer_data_X,
    computer_data_Y,
    test_size = 0.33,
    random_state = 42,
)


#____________________________Run KNN_________________________________
#Number of neighbors that vote on each prediction
k = 1
predictions, true_labels = kNearestNeighbor(X_train, Y_train, X_test, k)

#Score the predictions against the true labels
confusion = confusion_matrix(true_labels, predictions)
accuracy = accuracy_score(true_labels, predictions)

#Report predictions, labels, confusion matrix and accuracy
print('Predictions:\n', predictions)
print('\nTrue Labels:\n', true_labels)
print('\nConfusion Matrix:\n', confusion)
print('\nThe accuracy of your classifier is: ', accuracy*100, '%')

Predictions:
 [1 1 1 1 1 1 1 1 1 1 1 1 1]

True Labels:
 [1 0 0 1 1 1 1 0 1 1 1 1 1]

Confusion Matrix:
 [[ 0  3]
 [ 0 10]]

The accuracy of your classifier is:  76.92307692307693 %


### Exercise 1

#### Getting familiar with KNN code 1

With your team, change k in the provided code to obtain the best accuracy. If you have questions about the code, ask a TA. Now is the time to ask questions to understand how to use the code. Once you are satisfied with your results, please provide them to a TA. Time: 10 minutes

### Exercise 2

#### Getting familiar with KNN code 2

With your team, change the iris_data_x data to have only one feature. Change the feature and k in the provided code to obtain the best accuracy. If you have questions about the code, ask a TA. Once you are satisfied with your results, please provide them to a TA. Time: 20 minutes