In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import svm

In [81]:
import warnings
warnings.filterwarnings("ignore")

# Data

In [82]:
# Read the training and test CSVs (in that order) from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("location_train.csv", "location_test.csv")
)

In [83]:
# Structural summary of the training frame: row count, column range, dtypes, memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 448 entries, ID to 446
dtypes: int64(448)
memory usage: 13.7 MB


In [84]:
# Preview the first five training rows: ID, class label, then the numbered feature columns.
train.head()

Unnamed: 0,ID,class,1,2,3,4,5,6,7,8,...,437,438,439,440,441,442,443,444,445,446
0,0,11,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,8,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [85]:
# Structural summary of the test frame: row count, column range, dtypes, memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Columns: 447 entries, ID to 446
dtypes: int64(447)
memory usage: 3.4 MB


In [86]:
# Preview the first five test rows: ID followed by the numbered feature columns.
test.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,437,438,439,440,441,442,443,444,445,446
0,4000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,4001,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,4002,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4003,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [87]:
# Target vector: the "class" column of the training frame.
y_train = train["class"]

# Feature matrices: everything except "ID" (and, for training, the "class" label).
X_train = train.drop(columns=["ID", "class"])
X_test = test.drop(columns=["ID"])

# Model selection

In [88]:
# Grid Search settings
#   cv      -> number of folds
#   verbose -> information shown during training
cv, verbose = 5, 1

## SVM

In [89]:
# Exhaustive search over SVC hyper-parameters, scored by weighted F1 under `cv`-fold
# cross-validation. Every combination of the lists below is one candidate.
parameters = {
    "C": [0.1, 1, 10, 25, 50],
    "kernel": ["rbf", "sigmoid"],
    "gamma": ["scale", "auto", 0.1, 0.01, 0.001],
    "class_weight": ["balanced"],
}
model = GridSearchCV(
    svm.SVC(),
    parameters,
    cv=cv,
    verbose=verbose,
    scoring="f1_weighted",
    n_jobs=-1,  # candidate fits are independent, so run them on all available cores
)
model.fit(X_train, y_train)

# Summarise the search: one column per tuned hyper-parameter plus the mean CV score.
# The column list is derived from `parameters` so it cannot drift from the grid.
results = pd.DataFrame(model.cv_results_)
results = results[[f"param_{name}" for name in parameters] + ["mean_test_score"]]
results.sort_values(["mean_test_score"], ascending=False).head(10)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Unnamed: 0,param_C,param_kernel,param_gamma,param_class_weight,mean_test_score
46,50,rbf,0.01,balanced,0.701018
26,10,rbf,0.01,balanced,0.701018
36,25,rbf,0.01,balanced,0.701018
16,1,rbf,0.01,balanced,0.699168
28,10,rbf,0.001,balanced,0.698833
30,25,rbf,scale,balanced,0.697599
40,50,rbf,scale,balanced,0.697599
20,10,rbf,scale,balanced,0.697599
23,10,sigmoid,auto,balanced,0.696726
10,1,rbf,scale,balanced,0.693648


# Final model

In [90]:
# GridSearchCV was created with the default refit=True, so `best_estimator_` has
# already been retrained on the full training set with the winning hyper-parameters.
# Fitting it again would only repeat that work, so the estimator is used as-is.
best_model = model.best_estimator_
best_model

SVC(C=10, class_weight='balanced', gamma=0.01)

In [91]:
# Submission frame: the test IDs alongside the class predicted for each row.
predictions = pd.DataFrame(
    {
        "ID": test["ID"],
        "class": best_model.predict(X_test),
    }
)

In [92]:
# Write the predictions to disk; index=False keeps the DataFrame's row index out of the CSV.
predictions.to_csv("submission.csv", index=False)