In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn import svm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data set

In [3]:
train = pd.read_csv("location_train.csv")
test = pd.read_csv("location_test.csv")
train.head()

Unnamed: 0,ID,class,1,2,3,4,5,6,7,8,...,437,438,439,440,441,442,443,444,445,446
0,0,11,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,8,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 448 entries, ID to 446
dtypes: int64(448)
memory usage: 13.7 MB


In [5]:
X = train.drop(["ID", "class"], axis=1)
y = train["class"]

X_test = test.drop(["ID"], axis=1)

In [6]:
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,437,438,439,440,441,442,443,444,445,446
0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
y.head()

0    11
1     3
2     9
3     8
4     3
Name: class, dtype: int64

# Exploration

## Overview

In [8]:
X.isnull().sum().sum()

0

In [9]:
# Appearence of each class
y.value_counts().sort_index()

1     132
2     141
3     124
4     135
5      80
6     150
7      90
8     250
9     114
10    173
11    149
12    142
13    103
14     99
15    171
16     83
17    144
18    103
19    146
20    211
21    181
22     98
23    128
24    133
25    100
26    119
27    126
28    127
29    113
30    135
Name: class, dtype: int64

# Model selection

In [10]:
cv = 10
verbose = 0

## KNN

In [11]:
parameters = {"n_neighbors":[1, 5, 10, 20, 30, 40, 50], "weights":["uniform", "distance"]}
knn = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=cv, verbose=verbose)
knn.fit(X, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 5, 10, 20, 30, 40, 50],
                         'weights': ['uniform', 'distance']})

In [12]:
knn_results = pd.DataFrame(knn.cv_results_)
knn_results= knn_results[["param_n_neighbors", "param_weights", "mean_test_score"]]
knn_results.sort_values(["mean_test_score"], ascending=False)

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score
13,50,distance,0.50675
11,40,distance,0.5055
9,30,distance,0.498
12,50,uniform,0.49525
10,40,uniform,0.49275
7,20,distance,0.48825
8,30,uniform,0.48725
6,20,uniform,0.476
5,10,distance,0.45975
4,10,uniform,0.4375


## Decision Tree

In [13]:
parameters = {"n_estimators":[100, 200, 300, 400, 500], "criterion":["gini", "entropy"], "max_depth":["None", 10, 20]}
dt = GridSearchCV(RandomForestClassifier(), parameters, cv=cv, verbose=verbose)
dt.fit(X, y)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': ['None', 10, 20],
                         'n_estimators': [100, 200, 300, 400, 500]})

In [14]:
dt_results = pd.DataFrame(dt.cv_results_)
dt_results = dt_results[["param_n_estimators", "param_criterion", "param_max_depth", "mean_test_score"]]
dt_results.sort_values(["mean_test_score"], ascending=False)

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score
12,300,gini,20.0,0.5705
28,400,entropy,20.0,0.5695
29,500,entropy,20.0,0.5695
13,400,gini,20.0,0.566
14,500,gini,20.0,0.56475
27,300,entropy,20.0,0.5635
26,200,entropy,20.0,0.55875
11,200,gini,20.0,0.55475
10,100,gini,20.0,0.53875
24,500,entropy,10.0,0.53175


## Random Forest

In [15]:
parameters = {"n_estimators":[100, 200, 300, 400, 500], "criterion":["gini", "entropy"], "max_depth":["None", 10, 20]}
rf = GridSearchCV(RandomForestClassifier(), parameters, cv=cv, verbose=verbose)
rf.fit(X, y)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': ['None', 10, 20],
                         'n_estimators': [100, 200, 300, 400, 500]})

In [16]:
rf_results = pd.DataFrame(rf.cv_results_)
rf_results = rf_results[["param_n_estimators", "param_criterion", "param_max_depth", "mean_test_score"]]
rf_results.sort_values(["mean_test_score"], ascending=False)

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score
29,500,entropy,20.0,0.5705
14,500,gini,20.0,0.5695
12,300,gini,20.0,0.5675
28,400,entropy,20.0,0.56525
11,200,gini,20.0,0.55875
27,300,entropy,20.0,0.5585
13,400,gini,20.0,0.55825
26,200,entropy,20.0,0.5495
10,100,gini,20.0,0.5445
24,500,entropy,10.0,0.536


## SVM

In [17]:
parameters = {"kernel":["sigmoid", "rbf"], "C":[1, 5, 10, 20], "gamma":["scale", "auto"]}
svm = GridSearchCV(svm.SVC(), parameters, cv=cv, verbose=verbose)
svm.fit(X, y)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [1, 5, 10, 20], 'gamma': ['scale', 'auto'],
                         'kernel': ['sigmoid', 'rbf']})

In [18]:
svm_results = pd.DataFrame(svm.cv_results_)
svm_results = svm_results[["param_C", "param_kernel", "param_gamma", "mean_test_score"]]
svm_results.sort_values(["mean_test_score"], ascending=False)

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score
5,5,rbf,scale,0.7135
9,10,rbf,scale,0.7135
13,20,rbf,scale,0.7135
11,10,rbf,auto,0.70625
15,20,rbf,auto,0.70625
7,5,rbf,auto,0.70475
10,10,sigmoid,auto,0.70475
0,1,sigmoid,scale,0.70425
14,20,sigmoid,auto,0.7015
6,5,sigmoid,auto,0.698


# Final model

In [22]:
model = svm.SVC(C=5, kernel="rbf", gamma="scale")
model.fit(X, y)

SVC(C=5)

In [23]:
predictions = pd.DataFrame(test["ID"])
predictions["class"] = model.predict(X_test)

In [24]:
predictions.to_csv("submission.csv", index=False)