This notebook creates and trains a KNN model which deals with count-based encoded data.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Possible critical alarm types
critical_alarm_types = [7,15,16,21,33,56,68,95,1000,1001]
df = pd.DataFrame()

for i in critical_alarm_types:
    # Replace the link according to the data you want to read (All, Unique Samples, Random Samples)
    type_df = pd.read_csv("../Data/Train/Random Samples/Count-based/" + str(i) + "_countbased.csv")
    df = pd.concat([df, type_df], ignore_index=True)

X = df.drop(columns=['y']).values
y = df['y'].values

If you wish to perform an upsampling using SMOTE, run the following cell. If you do not wish to perform an upsampling, bypass the following cell and run the next one.

In [16]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the dataset
sm = SMOTE(k_neighbors=4, random_state=42)
X, y = sm.fit_resample(X, y)

In [2]:
# Train-test-split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
# Creating model

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=15, weights='distance', metric='manhattan')
knn.fit(X_train, y_train)

In [8]:
# Saving model

import joblib

joblib.dump(knn, 'knn_unique_countbased.joblib')

['knn_unique_countbased.joblib']

In [4]:
# Prediction with test data

y_pred = knn.predict(X_test)

KeyboardInterrupt: 

In [7]:
# Printing classification report

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           7       0.89      0.94      0.91      4351
          15       0.46      0.22      0.30        96
          16       0.61      0.56      0.58        71
          21       0.10      0.03      0.05        29
          33       0.17      0.06      0.09      2746
          56       0.08      0.02      0.03      1290
          68       0.70      0.88      0.78     63539
          95       0.00      0.00      0.00        50
        1000       0.55      0.35      0.43       427
        1001       0.46      0.24      0.32     28218

    accuracy                           0.66    100817
   macro avg       0.40      0.33      0.35    100817
weighted avg       0.62      0.66      0.62    100817



In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred, labels=knn.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=knn.classes_)
disp.plot()
plt.show()