This notebook evaluates k-NN models for k from 1 to 25 (with Euclidean and Manhattan distance) and writes their accuracy and log loss to a .csv file.
From this .csv, a plot is created that shows the impact of the parameter k.

In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Read the encoded training and test sets from data/tmp into DataFrames.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/tmp/encoded_train.csv', 'data/tmp/encoded_test.csv')
)

# Preview the first rows of the training frame
df.head()


Unnamed: 0,Day,Month,Hour,Night,Holiday,Block,lat,long,Category,DayOfWeek-Monday,...,StreetType-LN,StreetType-DR,StreetType-CT,StreetType-WAY,StreetType-RW,StreetType-PZ,Season-Winter,Season-Fall,Season-Spring,Season-Summer
0,29,11,0,1,0,0,0.584478,-1.557336,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,1,0,0
1,1,6,8,0,0,1,0.911468,0.775401,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,0,1
2,27,4,6,0,0,1,-2.045603,0.570183,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,1,0
3,1,4,15,0,0,32,1.510611,-0.150875,ASSAULT,0,...,0,0,0,0,0,0,0,0,1,0
4,25,7,16,0,0,0,0.718501,0.532851,ASSAULT,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Define features (X) and target (y).
# Besides the target 'Category', the frames carry an 'Unnamed: 0' column (visible in the
# df.head() output), which looks like the row index that was written into the CSV.
# It is not a real feature and would distort the k-NN distance computation, so it is
# excluded as well. errors='ignore' keeps the drop working if a file lacks that column.
non_feature_columns = ['Category', 'Unnamed: 0']

X = df.drop(columns=non_feature_columns, errors='ignore')  # training features
y = df['Category']                                         # training target

x_test = test_df.drop(columns=non_feature_columns, errors='ignore')  # test features
y_test = test_df['Category']                                         # test target


In [4]:
# Hyper-parameter search space for the k-NN classifier
param_grid = dict(
    n_neighbors=list(range(1, 26)),     # k = 1 .. 25
    metric=['euclidean', 'manhattan'],  # distance metric
)

# Expand the grid into a list holding one dict per parameter combination
param_list = list(ParameterGrid(param_grid))

In [5]:
# Fit one k-NN model per parameter combination and evaluate it on the test set.
# Every entry of `results` is one flat record: score, log_loss, n_neighbors, metric.
results = []

# Seed the global `random` state once; the call is loop-invariant.
random.seed(42)

for case_nr, params in enumerate(param_list, start=1):
    print("")
    print(f"Case {case_nr} / {len(param_list)}")
    print(f"Testing parameters: {params}")

    model = KNeighborsClassifier(n_jobs=8, **params)
    model.fit(X, y)

    print("Model fitted")

    # Evaluate the model on the test set: accuracy
    y_pred = model.predict(x_test)
    score = accuracy_score(y_test, y_pred)

    print("Prediction done")

    # Class probabilities; the columns follow the order of model.classes_
    y_pred_proba = model.predict_proba(x_test)

    print("Predict Proba done")

    # Passing labels=model.classes_ tells log_loss which class each probability column
    # belongs to. All columns are kept (rows still sum to 1), even if some class the
    # model knows does not occur in y_test.
    results.append({
        "score": score,
        "log_loss": log_loss(y_test, y_pred_proba, labels=model.classes_),
        "n_neighbors": params["n_neighbors"],
        "metric": params["metric"],
    })

# The result table is bound to `df` because the following cell exports `df`.
# Inside the loop `df` is no longer overwritten, so the training frame stays intact
# until this point.
df = pd.DataFrame(results, columns=["score", "log_loss", "n_neighbors", "metric"])

df.head()



Case 1 / 2
Testing parameters: {'metric': 'euclidean', 'n_neighbors': 1}
Model fitted
Prediction done
Predict Proba done

Case 2 / 2
Testing parameters: {'metric': 'manhattan', 'n_neighbors': 1}
Model fitted
Prediction done


In [None]:
# Persist the results so the plotting section can read them back.
# Without a path, to_csv() only returns the CSV text and writes no file.
# index=False avoids storing the row index as an extra unnamed column.
df.to_csv("knn_result.csv", index=False)

### Plotting

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the stored evaluation results for plotting
df = pd.read_csv(filepath_or_buffer="knn_result.csv")

In [None]:
# Split the results by distance metric.
# .copy() makes each subset an independent frame, so later column assignments do not
# write into a filtered slice of `df` (which raises pandas' SettingWithCopyWarning).
manhattan_df = df[df["metric"] == "manhattan"].copy()
euclidean_df = df[df["metric"] == "euclidean"].copy()

In [None]:
# Convert accuracy from a fraction to percent.
# assign() returns a new frame instead of setting a column on a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
# Note: re-running this cell scales the score again.
manhattan_df = manhattan_df.assign(score=manhattan_df["score"] * 100)
euclidean_df = euclidean_df.assign(score=euclidean_df["score"] * 100)

In [None]:
# Preview the first five Manhattan-metric results
manhattan_df.head(5)

In [None]:
# Preview the first five Euclidean-metric results
euclidean_df.head(5)

In [None]:
# Plot accuracy (in %) and log loss against n_neighbors for both distance metrics.
# All four curves share one y-axis, as described by the y-axis label.
fig, ax = plt.subplots(figsize=(10, 6))

# (frame, column, marker, legend label, colour) for each curve, in drawing order
curves = [
    (manhattan_df, 'score', 's', 'Manhattan Accuracy', 'blue'),
    (euclidean_df, 'score', 's', 'Euclidean Accuracy', 'green'),
    (manhattan_df, 'log_loss', 'o', 'Manhattan Log Loss', 'red'),
    (euclidean_df, 'log_loss', 'o', 'Euclidean Log Loss', 'black'),
]
for frame, column, marker, label, color in curves:
    ax.plot(frame['n_neighbors'], frame[column], marker=marker, label=label, color=color)

# Adding titles and labels
ax.set_title('Accuracy and Log Loss vs. n_neighbors')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('Accuracy in % / Log Loss')
# Show legend
ax.legend()
# Show grid
ax.grid(True)
# Save before showing the figure
fig.savefig('knn_plot.png', dpi=150, bbox_inches='tight')
# Display the plot
plt.show()