In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [20]:
df = pd.read_excel('Data.xlsx')
df = df.fillna(0)
X = df.iloc[:, [1, 4]].values
y = df.iloc[:, 5].values

In [21]:
X

array([[44.0, 'M'],
       [27.0, 'M'],
       [30.0, 'F'],
       [38.0, 'M'],
       [40.0, 'F'],
       [35.0, 'F'],
       [0.0, 'F'],
       [48.0, 'M'],
       [50.0, 'M'],
       [37.0, 'M']], dtype=object)

In [23]:
y

array([72000., 48000., 54000., 61000.,     0., 58000., 52000., 79000.,
       83000., 67000.])

### One-Hot Encode

In [24]:
labelencoder = LabelEncoder()
X[: , 0] = labelencoder.fit_transform(X[ : , 0])
onehotencoder = OneHotEncoder(categories='auto')
X = onehotencoder.fit_transform(X).toarray()

In [25]:
X

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.]])

### Train Test Split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [27]:
X_train

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.]])

In [28]:
X_test

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.]])

In [29]:
y_train

array([67000., 48000., 52000., 79000., 61000., 72000., 58000.])

In [46]:
y_test

array([54000., 83000.,     0.])

### Scaling Features By taking z-scores

In [32]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [33]:
X_train

array([[-0.40824829, -0.40824829,  0.        , -0.40824829,  2.44948974,
        -0.40824829,  0.        , -0.40824829, -0.40824829,  0.        ,
        -0.63245553,  0.63245553],
       [-0.40824829,  2.44948974,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  0.        , -0.40824829, -0.40824829,  0.        ,
        -0.63245553,  0.63245553],
       [ 2.44948974, -0.40824829,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  0.        , -0.40824829, -0.40824829,  0.        ,
         1.58113883, -1.58113883],
       [-0.40824829, -0.40824829,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  0.        , -0.40824829,  2.44948974,  0.        ,
        -0.63245553,  0.63245553],
       [-0.40824829, -0.40824829,  0.        , -0.40824829, -0.40824829,
         2.44948974,  0.        , -0.40824829, -0.40824829,  0.        ,
        -0.63245553,  0.63245553],
       [-0.40824829, -0.40824829,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  

In [34]:
X_test

array([[-0.40824829, -0.40824829,  1.        , -0.40824829, -0.40824829,
        -0.40824829,  0.        , -0.40824829, -0.40824829,  0.        ,
         1.58113883, -1.58113883],
       [-0.40824829, -0.40824829,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  0.        , -0.40824829, -0.40824829,  1.        ,
        -0.63245553,  0.63245553],
       [-0.40824829, -0.40824829,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  1.        , -0.40824829, -0.40824829,  0.        ,
         1.58113883, -1.58113883]])

### Applying K-Nearest Neighbours

In [35]:
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Predicting The Result

In [36]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

In [38]:
X_test

array([[-0.40824829, -0.40824829,  1.        , -0.40824829, -0.40824829,
        -0.40824829,  0.        , -0.40824829, -0.40824829,  0.        ,
         1.58113883, -1.58113883],
       [-0.40824829, -0.40824829,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  0.        , -0.40824829, -0.40824829,  1.        ,
        -0.63245553,  0.63245553],
       [-0.40824829, -0.40824829,  0.        , -0.40824829, -0.40824829,
        -0.40824829,  1.        , -0.40824829, -0.40824829,  0.        ,
         1.58113883, -1.58113883]])

In [37]:
y_pred

array([52000., 48000., 52000.])

In [16]:
cm

array([[0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0]], dtype=int64)