In [4]:
# Packages importieren
import pickle
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Datenabrufen
X_train = pd.read_csv("../../Resampling/X_train.csv", index_col=0)
X_test = pd.read_csv("../../Resampling/X_test.csv", index_col=0)
X_val = pd.read_csv("../../Resampling/X_val.csv", index_col=0)

X_train_stm = pd.read_csv("../../Resampling/X_train_stm.csv", index_col=0)
X_test_stm = pd.read_csv("../../Resampling/X_test_stm.csv", index_col=0)
X_val_stm = pd.read_csv("../../Resampling/X_val_stm.csv", index_col=0)

y_train = pd.read_csv("../../Resampling/y_train.csv", index_col=0)
y_train_stm = pd.read_csv("../../Resampling/y_train_stm.csv", index_col=0)

y_test = pd.read_csv("../../Resampling/y_test.csv", index_col=0)
y_val = pd.read_csv("../../Resampling/y_val.csv", index_col=0)

print("Alle Datensätze wurden importiert.")

Alle Datensätze wurden importiert.


## E_MT_KNN_1: kNN mit resampeltem Datensatz

In [6]:
knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [50, 150, 300, 450],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1,2],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [30, 50, 100]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid = RandomizedSearchCV(knn, param_grid, cv=cv, scoring='f1', verbose=10, n_jobs=-1, random_state=42)

grid.fit(X_train_stm, np.ravel(y_train_stm))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


0,1,2
,estimator,KNeighborsClassifier()
,param_distributions,"{'algorithm': ['auto', 'ball_tree', ...], 'leaf_size': [30, 50, ...], 'metric': ['euclidean', 'manhattan', ...], 'n_neighbors': [50, 150, ...], ...}"
,n_iter,10
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,10
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_neighbors,50
,weights,'distance'
,algorithm,'ball_tree'
,leaf_size,100
,p,2
,metric,'manhattan'
,metric_params,
,n_jobs,


In [7]:
print("Best Params: ", grid.best_params_)

Best Params:  {'weights': 'distance', 'p': 2, 'n_neighbors': 50, 'metric': 'manhattan', 'leaf_size': 100, 'algorithm': 'ball_tree'}


In [3]:
knn_model = KNeighborsClassifier(
    n_neighbors=50,
    weights='distance',
    metric='manhattan',
    leaf_size=100,
    p=2,
    algorithm='ball_tree')
knn_model.fit(X_train_stm, np.ravel(y_train_stm))

0,1,2
,n_neighbors,50
,weights,'distance'
,algorithm,'ball_tree'
,leaf_size,100
,p,2
,metric,'manhattan'
,metric_params,
,n_jobs,


In [10]:
y_pred = knn_model.predict(X_val_stm)

print(classification_report(np.ravel(y_val), y_pred))
print(confusion_matrix(np.ravel(y_val), y_pred))

              precision    recall  f1-score   support

           0       0.94      0.69      0.79     21833
           1       0.28      0.75      0.40      3535

    accuracy                           0.69     25368
   macro avg       0.61      0.72      0.60     25368
weighted avg       0.85      0.69      0.74     25368

[[14963  6870]
 [  897  2638]]


In [5]:
y_pred_test = knn_model.predict(X_test_stm)
y_proba = knn_model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(np.ravel(y_test), y_proba)
print(f"AUC-ROC: {auc_score:.4f}")

print(classification_report(np.ravel(y_test), y_pred_test))
print(confusion_matrix(np.ravel(y_test), y_pred_test))

AUC-ROC: 0.7882
              precision    recall  f1-score   support

           0       0.94      0.68      0.79     43667
           1       0.28      0.75      0.40      7069

    accuracy                           0.69     50736
   macro avg       0.61      0.72      0.60     50736
weighted avg       0.85      0.69      0.74     50736

[[29799 13868]
 [ 1781  5288]]


## E_MT_KNN_2: kNN mit originärem Datensatz

In [6]:
knn_model2 = KNeighborsClassifier(
    n_neighbors=50,
    weights='distance',
    metric='manhattan',
    leaf_size=100,
    p=2,
    algorithm='ball_tree')
knn_model2.fit(X_train, np.ravel(y_train))

0,1,2
,n_neighbors,50
,weights,'distance'
,algorithm,'ball_tree'
,leaf_size,100
,p,2
,metric,'manhattan'
,metric_params,
,n_jobs,


In [13]:
y_pred2 = knn_model2.predict(X_val)

print(classification_report(np.ravel(y_val), y_pred2))
print(confusion_matrix(np.ravel(y_val), y_pred2))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92     21833
           1       0.52      0.10      0.17      3535

    accuracy                           0.86     25368
   macro avg       0.69      0.54      0.55     25368
weighted avg       0.82      0.86      0.82     25368

[[21490   343]
 [ 3166   369]]


In [7]:
y_pred2_test = knn_model2.predict(X_test)
y_proba2 = knn_model2.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(np.ravel(y_test), y_proba2)
print(f"AUC-ROC: {auc_score:.4f}")

print(classification_report(np.ravel(y_test), y_pred2_test))
print(confusion_matrix(np.ravel(y_test), y_pred2_test))

AUC-ROC: 0.7913
              precision    recall  f1-score   support

           0       0.87      0.98      0.92     43667
           1       0.51      0.10      0.17      7069

    accuracy                           0.86     50736
   macro avg       0.69      0.54      0.55     50736
weighted avg       0.82      0.86      0.82     50736

[[42965   702]
 [ 6330   739]]


In [4]:
joblib.dump(knn_model, 'knn.pkl')

['knn.pkl']