In [2]:
# import libraries
from statistics import mode

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [30]:
help(mode)

Help on function mode in module statistics:

mode(data)
    Return the most common data point from discrete or nominal data.
    
    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:
    
        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3
    
    This also works with nominal (non-numeric) data:
    
        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'
    
    If there are multiple modes with same frequency, return the first one
    encountered:
    
        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'
    
    If *data* is empty, ``mode``, raises StatisticsError.



In [112]:
# Synthetic data set: 50 rows of random integer features plus a binary label.
# A seeded generator is used so that Restart & Run All rebuilds exactly the
# same frame (and therefore the same train/test split and metrics) every time.
rng = np.random.default_rng(42)
df1 = pd.DataFrame({
    'age': rng.integers(18, 30, 50),            # 18..29 (upper bound is exclusive)
    'salary': rng.integers(15000, 20000, 50),   # 15000..19999
    'label': rng.integers(0, 2, 50),            # 0 or 1
})

In [113]:
df1

Unnamed: 0,age,salary,label
0,18,16198,0
1,23,19946,0
2,19,15485,0
3,19,15556,1
4,29,17971,1
5,28,19662,0
6,23,19002,0
7,22,15845,1
8,22,15808,0
9,26,17422,0


In [114]:
# Hold out the final 20 rows for testing; every earlier row is used for training.
# Both frames still carry the 'label' column alongside the features.
x_train = df1.head(-20)
x_test = df1.tail(20)

In [115]:
x_train

Unnamed: 0,age,salary,label
0,18,16198,0
1,23,19946,0
2,19,15485,0
3,19,15556,1
4,29,17971,1
5,28,19662,0
6,23,19002,0
7,22,15845,1
8,22,15808,0
9,26,17422,0


In [116]:
x_test

Unnamed: 0,age,salary,label
30,27,16133,0
31,18,19592,1
32,22,17460,0
33,26,15244,0
34,23,16092,0
35,19,17847,0
36,23,15427,1
37,19,18672,1
38,18,16794,1
39,21,17234,1


In [117]:
def get_distance(p, t): # Euclidean
  """Return the Euclidean distance between two points.

  Parameters
  ----------
  p, t : array-like of numbers
      Coordinates of the two points. Lists, tuples and NumPy arrays are
      accepted; the shapes must be equal or broadcastable.

  Returns
  -------
  numpy.float64
      sqrt(sum((p - t) ** 2)), summed over every element.
  """
  # Convert to float first: plain Python lists do not support `-`, and
  # unsigned / narrow integer arrays would wrap around on subtraction or
  # overflow when squared, silently giving a wrong distance.
  diff = np.asarray(p, dtype=float) - np.asarray(t, dtype=float)
  return np.sqrt(np.sum(diff ** 2))

In [118]:
# Quick sanity check of get_distance: the two points differ by 3 along each
# axis, so the expected result is sqrt(3**2 + 3**2) = sqrt(18) ~= 4.2426.
a, b = np.array([5, 2]), np.array([2, 5])
get_distance(a, b)

4.242640687119285

In [119]:
print(x_train['label'].values)

[0 0 0 1 1 0 0 1 0 0 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 1]


In [120]:
# Training labels as a plain NumPy array so they can be indexed by position.
y_train = x_train['label'].to_numpy()

In [121]:
y_train

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1])

In [122]:
y_train[[2, 6, 9,12]]

array([0, 0, 0, 1])

In [137]:
def get_predicted_output(test_data, k=3, train_features=None, train_labels=None, verbose=True):
  """Classify one sample by majority vote among its k nearest training rows.

  Parameters
  ----------
  test_data : array-like, shape (n_features,)
      Feature values of the sample to classify.
  k : int, default 3
      Number of nearest neighbours that vote. Must be at least 1; if it
      exceeds the number of training rows, every row votes.
  train_features : array-like, shape (n_samples, n_features), optional
      Training feature matrix. Defaults to the notebook-level ``x_train``
      with its 'label' column dropped.
  train_labels : array-like, shape (n_samples,), optional
      Training labels aligned with ``train_features``. Defaults to the
      notebook-level ``y_train``.
  verbose : bool, default True
      Print the positions of the selected neighbours (on by default so that
      existing calls produce the same output as before).

  Returns
  -------
  The most common label among the k nearest neighbours. On a tie,
  ``statistics.mode`` (Python 3.8+) returns the first tied label it
  encounters, i.e. the one belonging to the closest neighbour.

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1.
  """
  if k < 1:
    # A negative k would otherwise slice "all but the last |k|" neighbours.
    raise ValueError("k must be a positive integer")

  # Fall back to the notebook globals only when no training data is passed in.
  if train_features is None:
    train_features = x_train.drop('label', axis=1).values
  if train_labels is None:
    train_labels = y_train
  train_features = np.asarray(train_features, dtype=float)
  train_labels = np.asarray(train_labels)
  sample = np.asarray(test_data, dtype=float)

  # Euclidean distance from the sample to every training row in one shot.
  # NOTE: raw feature values are used, so a large-magnitude column dominates
  # the distance; scale the features beforehand if that is not intended.
  distances = np.sqrt(np.sum((train_features - sample) ** 2, axis=1))
  k_nearest_neighbors = np.argsort(distances)[:k]
  if verbose:
    print( k_nearest_neighbors)
  labels = train_labels[k_nearest_neighbors]
  return mode(labels)

NameError: ignored

In [138]:
# Predict a label for every held-out row using the default number of neighbours.
y_pred = [
  get_predicted_output(test)
  for test in x_test.drop('label', axis=1).values
]

[0 7 8]
[16  5 27]
[ 9 21 15]
[23 24 18]
[0 7 8]
[26  4 28]
[ 2 24 12]
[11  6 19]
[17 14 10]
[15 21  9]
[ 7  8 20]
[17 14 10]
[16  5 13]
[26  4 28]
[ 9 21 15]
[13 16  5]
[22 28 29]
[13  6 16]
[0 7 8]
[ 6 11 13]


In [125]:
y_pred

[0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0]

In [126]:
# Actual labels of the held-out rows, kept as a Series named 'label'.
y_act = x_test.loc[:, 'label']

In [127]:
print(y_act.values)

[0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 0 0 1 0]


In [128]:
pd.crosstab(y_act, np.array(y_pred))

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5
1,5,5


In [129]:
# Confusion matrix of the current predictions: rows are the actual labels,
# columns are the predicted labels.
confusion_matrix(y_act, y_pred)

array([[5, 5],
       [5, 5]])

In [130]:
confusion_matrix(y_act, y_pred).T

array([[5, 5],
       [5, 5]])

In [131]:
accuracy_score(y_act, y_pred)

0.5

In [95]:
print(classification_report(y_act, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.36      0.35        11
           1       0.12      0.11      0.12         9

    accuracy                           0.25        20
   macro avg       0.23      0.24      0.23        20
weighted avg       0.24      0.25      0.24        20



In [135]:
# Repeat the evaluation with k=5 neighbours; note that this overwrites y_pred.
y_pred = [
  get_predicted_output(test, 5)
  for test in x_test.drop('label', axis=1).values
]
accuracy_score(y_act, y_pred)

0.55

In [None]:
confusion_matrix(y_act, y_pred).T

array([[4, 3],
       [4, 9]])

In [None]:
confusion_matrix(y_pred, y_act)

array([[4, 3],
       [4, 9]])

In [None]:
print(classification_report(y_act, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.50      0.53         8
           1       0.69      0.75      0.72        12

    accuracy                           0.65        20
   macro avg       0.63      0.62      0.63        20
weighted avg       0.64      0.65      0.65        20

