In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import os 

from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv(os.getcwd() +"/df_all.csv")
df

Unnamed: 0,peak_amplitude,integral,phase_angle,cycle_number,rise_time,pulse_width,type
0,11.474359,9243,204,29966,0,0,pos
1,17.516900,4035,335,31027,0,1,neg
2,7.479021,-3559,235,23657,0,0,neg
3,24.262821,46927,296,30273,0,0,neg
4,9.280886,-20127,113,33473,0,0,neg
...,...,...,...,...,...,...,...
4274320,0.967366,779,33,2599,0,0,pos
4274321,4.389277,2591,166,9821,0,401,neg
4274322,22.870047,21574,161,25780,0,0,neg
4274323,5.526224,6580,346,24110,2,4,neg


In [21]:
test = df[:1000000]

In [43]:
test

Unnamed: 0,peak_amplitude,integral,phase_angle,cycle_number,rise_time,type
0,11.474359,9243,204,29966,0,pos
1,17.516900,4035,335,31027,0,neg
2,7.479021,-3559,235,23657,0,neg
3,24.262821,46927,296,30273,0,neg
4,9.280886,-20127,113,33473,0,neg
...,...,...,...,...,...,...
999995,13.301282,-7934,59,58426,0,neg
999996,1.533217,-2341,313,51314,0,neg
999997,7.030886,-12974,88,29703,0,neg
999998,5.861305,-12369,21,39828,0,pos


In [24]:
le = preprocessing.LabelEncoder()

X = test.iloc[:,:-1]
y = le.fit_transform(test.iloc[:,-1])


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [26]:
n = int(math.sqrt(X.shape[0])) + 1 if (int(math.sqrt(X.shape[0])) % 2 == 0) else int(math.sqrt(X.shape[0]))
n

1001

## Supervised KNN

In [27]:
# instantiate model
knn = KNeighborsClassifier(n_neighbors = n , metric='euclidean', p=2)
# fit model
knn.fit(X_train,y_train)

In [None]:
y_pred =  knn.predict(X_test)
y_pred

In [17]:
cm= confusion_matrix(y_test,y_pred)
print(cm)
print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[164310   2660]
 [ 21675  11355]]
0.48272930173238393
0.878325


## Dropping Columns Iteratively

In [42]:
for i in range(6):
    test = df[:1000000]
    test = test.drop(test.columns[i], axis=1)
    le = preprocessing.LabelEncoder()

    X = test.iloc[:,:-1]
    y = le.fit_transform(test.iloc[:,-1])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    n = int(math.sqrt(X.shape[0])) + 1 if (int(math.sqrt(X.shape[0])) % 2 == 0) else int(math.sqrt(X.shape[0]))
    
    # instantiate model
    knn = KNeighborsClassifier(n_neighbors = n , metric='euclidean', p=2)
    # fit model
    knn.fit(X_train,y_train)
    y_pred =  knn.predict(X_test)
    cm= confusion_matrix(y_test,y_pred)
    print(cm)
    print(f1_score(y_test,y_pred))
    print(accuracy_score(y_test,y_pred))

[[162152   5127]
 [ 19960  12761]]
0.5042976545673693
0.874565
[[163823   3001]
 [ 24792   8384]]
0.3762931711586365
0.861035
[[162075   5020]
 [ 20016  12889]]
0.5073011374817963
0.87482
[[166932    191]
 [ 32673    204]]
0.012262563116133688
0.83568
[[162101   4940]
 [ 20067  12892]]
0.5076489929318186
0.874965
[[161686   5125]
 [ 20150  13039]]
0.5078184331976711
0.873625


## Supervised KNN without 1st column 

In [7]:
# instantiate model
knn = KNeighborsClassifier(n_neighbors = n , metric='euclidean', p=2)
# fit model
knn.fit(X_train,y_train)

In [8]:
y_pred =  knn.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [9]:
cm= confusion_matrix(y_test,y_pred)
print(cm)
print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[161675   5156]
 [ 20103  13066]]
0.5084937051234651
0.873705


## Unsupervised KNN

In [None]:
# create model
nbrs = NearestNeighbors(n_neighbors = n)
# fit model
nbrs.fit(test.iloc[:, :-1])

In [None]:
# distances and indexes of k-neaighbors from model outputs
distances, indexes = nbrs.kneighbors(test.iloc[:, :-1])
# plot
plt.figure(figsize=(15, 7))
plt.plot(distances.mean(axis =1))

In [None]:
distances = pd.DataFrame(distances)
distances_mean = distances.mean(axis =1)
distances_mean

In [None]:
distances_mean.describe()

In [None]:
threshold = 1000
outlier_index = np.where(distances_mean > threshold)
outlier_index

In [None]:
outlier_values = test.iloc[outlier_index]
outlier_values