In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import neighbors

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data

In [3]:
train = pd.read_csv("location_train.csv")
test = pd.read_csv("location_test.csv")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 448 entries, ID to 446
dtypes: int64(448)
memory usage: 13.7 MB


In [5]:
train.head()

Unnamed: 0,ID,class,1,2,3,4,5,6,7,8,...,437,438,439,440,441,442,443,444,445,446
0,0,11,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,8,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Columns: 447 entries, ID to 446
dtypes: int64(447)
memory usage: 3.4 MB


In [7]:
test.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,437,438,439,440,441,442,443,444,445,446
0,4000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,4001,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,4002,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4003,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
X_train = train.drop(["ID", "class"], axis=1)
y_train = train["class"]

X_test = test.drop(["ID"], axis=1)

# Pre-processing

In [9]:
# not necessary

# Exploratory data analysis

In [10]:
X_train.isnull().sum()

1      0
2      0
3      0
4      0
5      0
      ..
442    0
443    0
444    0
445    0
446    0
Length: 446, dtype: int64

# Model selection

In [11]:
# Grid Search

cv = 10         # number of folds
verbose = 1     # information shown during training

## KNN

In [12]:
parameters = {
"n_neighbors":[1, 5, 10, 20], 
"weights":["uniform", "distance"],
"metric":["euclidean", "manhattan", "chebyshev", "minkowski", "wminkowski", "seuclidean", "mahalanobis"]}
model = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=cv, verbose=verbose, scoring="f1_weighted")
model.fit(X_train, y_train)

results = pd.DataFrame(model.cv_results_)
results= results[["param_n_neighbors", "param_weights", "param_metric", "mean_test_score"]]    
results.sort_values(["mean_test_score"], ascending=False).head(10)

Fitting 10 folds for each of 56 candidates, totalling 560 fits


Unnamed: 0,param_n_neighbors,param_weights,param_metric,mean_test_score
31,20,distance,minkowski,0.455965
7,20,distance,euclidean,0.455965
15,20,distance,manhattan,0.455965
6,20,uniform,euclidean,0.438488
30,20,uniform,minkowski,0.438488
14,20,uniform,manhattan,0.438488
5,10,distance,euclidean,0.430624
29,10,distance,minkowski,0.430624
13,10,distance,manhattan,0.430624
28,10,uniform,minkowski,0.401235


# Final model

In [13]:
best_model = model.best_estimator_
best_model.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=20, weights='distance')

In [14]:
predictions = pd.DataFrame(test["ID"])
predictions["class"] = best_model.predict(X_test)

In [15]:
predictions.to_csv("submission.csv", index=False)