In [9]:
import random
from pathlib import Path

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [2]:

# Read the encoded train and test CSVs into DataFrames.
train, test = map(
    pd.read_csv,
    ('data/tmp/encoded_train.csv', 'data/tmp/encoded_test.csv'),
)

# Show the first rows of the training frame as the cell output.
train.head()


Unnamed: 0,Day,Month,Hour,Minute,Night,Holiday,Block,lat,long,Category,...,StreetType-PL,StreetType-TR,StreetType-OTHER,StreetType-ST,StreetType-RW,StreetType-DR,Season-Summer,Season-Spring,Season-Fall,Season-Winter
0,29,11,0,17,1,0,0,0.584478,-1.557336,OTHER OFFENSES,...,0,0,0,0,0,0,0,0,1,0
1,1,6,8,0,0,0,1,0.911468,0.775401,OTHER OFFENSES,...,0,0,0,1,0,0,1,0,0,0
2,27,4,6,29,0,0,1,-2.045603,0.570183,OTHER OFFENSES,...,0,0,0,1,0,0,0,1,0,0
3,1,4,15,49,0,0,32,1.510611,-0.150875,ASSAULT,...,0,0,0,1,0,0,0,1,0,0
4,25,7,16,19,0,0,0,0.718501,0.532851,ASSAULT,...,0,0,0,0,0,0,1,0,0,0


In [3]:
# Split each frame into a feature matrix (X_*) and the 'Category' target (y_*).
#
# 'Unnamed: 0' is the name pandas gives to a CSV column with an empty header;
# in the preview of `train` it holds 0, 1, 2, ... which looks like a row index
# written out when the CSV was saved rather than a real feature. It is dropped
# here so it is not used as a k-NN feature; since the other numeric columns
# shown are small, such a counter would presumably dominate the distance.
# errors='ignore' keeps this cell working if a frame does not have that column.
NON_FEATURE_COLUMNS = ['Category', 'Unnamed: 0']

X_train = train.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_train = train['Category']
X_test = test.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_test = test['Category']

In [4]:
# Search space for the grid search: every odd neighbour count from 3 to 19,
# crossed with two distance metrics.
param_grid = {
    'n_neighbors': list(range(3, 20, 2)),
    'metric': ['euclidean', 'manhattan'],
}

In [5]:
# Seed Python's built-in RNG, then build a 5-fold, accuracy-scored grid search
# around a default k-NN classifier using the search space in `param_grid`.
# NOTE(review): scikit-learn takes its randomness from `random_state` / NumPy
# rather than Python's `random`, so this seed likely has no effect on the
# search -- confirm before relying on it for reproducibility.
random.seed(42)
knn = KNeighborsClassifier()
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=8,
    verbose=1,
)

In [6]:
# Run the cross-validated search over every candidate on the training split.
grid.fit(X_train, y=y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [8]:

# Summarise the grid search, then evaluate the best model on the test set.

# cv_results_ is a dict of per-candidate arrays; printing it raw floods the
# cell output, so show one ranked row per candidate instead.
cv_summary = (
    pd.DataFrame(grid.cv_results_)
    .loc[:, ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
)
print("Grid Search Results:")
print(cv_summary.to_string(index=False))

best_model = grid.best_estimator_
print(f"Best Model's Params: {grid.best_params_}")
print(f"Best Model's Score: {grid.best_score_}")

# Accuracy of the best estimator on the held-out test split.
# NOTE(review): the saved run of this cell ended in a KeyboardInterrupt before
# this value was printed, so scoring the full test set is presumably slow.
test_accuracy = best_model.score(X_test, y_test)
print(f"Best Model's Test Accuracy: {test_accuracy}")

Best Model's Params: {'metric': 'manhattan', 'n_neighbors': 19}
Best Model's Score: 0.2405837494755505


KeyboardInterrupt: 

In [5]:
# For every (metric, k) pair: fit k-NN on the training split and export its
# test-set label predictions and class probabilities to CSV.
PRED_DIR = Path('data/tmp/knn_pred')
PROBA_DIR = Path('data/tmp/knn_proba')

# Create the output folders up front so a missing directory cannot make
# to_csv fail after a long fit/predict has already been paid for.
for out_dir in (PRED_DIR, PROBA_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' is the
# Euclidean distance, so its files are expected to duplicate the 'euclidean'
# ones. The metric is kept so the same set of files is still written.
for metric in ['euclidean', 'manhattan', 'minkowski']:
    for k in tqdm([1, 3], desc=f'K Progress - {metric}'):
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric, n_jobs=16)
        knn.fit(X_train, y_train)

        # Run the neighbour search once. With the default uniform weights the
        # predicted label is the class with the highest vote share, and argmax
        # resolves ties to the first (lowest-sorted) class, so this yields the
        # same labels as a separate knn.predict(X_test) call.
        prob_predictions = knn.predict_proba(X_test)
        y_pred = knn.classes_[prob_predictions.argmax(axis=1)]

        # export predictions to csv
        pd.DataFrame(y_pred).to_csv(PRED_DIR / f'knn_{k}_{metric}.csv', index=False)
        pd.DataFrame(prob_predictions, columns=knn.classes_).to_csv(
            PROBA_DIR / f'knn_probas_{k}_{metric}.csv', index=False
        )

K Progress - euclidean: 100%|██████████| 2/2 [04:58<00:00, 149.48s/it]
K Progress - manhattan: 100%|██████████| 2/2 [33:05<00:00, 992.72s/it]
K Progress - minkowski: 100%|██████████| 2/2 [05:09<00:00, 154.66s/it]


In [6]:
# Configure the 25-nearest-neighbour classifier (Manhattan distance, 8 jobs).
knn = KNeighborsClassifier(
    metric='manhattan',
    n_neighbors=25,
    n_jobs=8,
)

In [7]:
# Fit the configured classifier on the training features and labels.
knn.fit(X_train, y=y_train)

In [10]:
# Persist the `knn` estimator to disk with joblib.
# The target folder is created first so the dump cannot fail on a missing
# directory. joblib files are pickles: only load them from a trusted source.
MODEL_PATH = 'models/knn/knn.pkl'
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(knn, MODEL_PATH)

['models/knn/knn.pkl']