In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Data

In [5]:
train = pd.read_csv("location_train.csv")
test = pd.read_csv("location_test.csv")

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 448 entries, ID to 446
dtypes: int64(448)
memory usage: 13.7 MB


In [7]:
train.head()

Unnamed: 0,ID,class,1,2,3,4,5,6,7,8,...,437,438,439,440,441,442,443,444,445,446
0,0,11,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,8,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Columns: 447 entries, ID to 446
dtypes: int64(447)
memory usage: 3.4 MB


In [9]:
test.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,437,438,439,440,441,442,443,444,445,446
0,4000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,4001,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,4002,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4003,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [10]:
X_train = train.drop(["ID", "class"], axis=1)
y_train = train["class"]

X_test = test.drop(["ID"], axis=1)

# Model selection

In [11]:
# Grid Search

cv = 10         # number of folds
verbose = 3     # information shown during training

## Decision Tree

In [12]:
parameters = {
    "criterion":["gini", "entropy"],
    "splitter":["best", "random"], 
    "max_depth":["None", 100, 500, 1000],
    "max_features":["auto", "sqrt", "log2"]}    
model = GridSearchCV(DecisionTreeClassifier(), parameters, cv=cv, verbose=verbose, scoring="accuracy")
model.fit(X_train, y_train)

results = pd.DataFrame(model.cv_results_)
results = results[["param_criterion", "param_splitter", "param_max_depth", "param_max_features", "mean_test_score"]]
results.sort_values(["mean_test_score"], ascending=False).head(10)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
[CV 1/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 2/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 3/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 4/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 5/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 6/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 7/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 8/10] END criterion=gini, max_depth=None, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 9/10] END criterion=gini, max_depth=None, max_feature

Unnamed: 0,param_criterion,param_splitter,param_max_depth,param_max_features,mean_test_score
15,gini,random,500,sqrt,0.23075
9,gini,random,100,sqrt,0.22425
14,gini,best,500,sqrt,0.22275
6,gini,best,100,auto,0.21925
7,gini,random,100,auto,0.21825
20,gini,best,1000,sqrt,0.21825
19,gini,random,1000,auto,0.21475
21,gini,random,1000,sqrt,0.21475
13,gini,random,500,auto,0.21425
8,gini,best,100,sqrt,0.21025


# Final model

In [24]:
best_model = model.best_estimator_
best_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=50, max_features='sqrt', splitter='random')

In [25]:
predictions = pd.DataFrame(test["ID"])
predictions["class"] = best_model.predict(X_test)

In [26]:
predictions.to_csv("submission.csv", index=False)