In [1]:
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier

In [2]:
RANDOM_SEED = 257
CV = 10
MODELS = [
    LinearSVC(),
    GaussianNB(),
    LogisticRegression(multi_class="multinomial"),
    LogisticRegression(multi_class="ovr"),
    RidgeClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(max_depth=5),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=5)),
    lgb.LGBMClassifier(),
    RandomForestClassifier(),
    CatBoostClassifier(logging_level="Silent"),
    GradientBoostingClassifier(),
    xgb.XGBClassifier(objective='multi:softprob', metric='logloss')
]

In [3]:
train = pd.read_csv("data/train.csv")

In [4]:
X, y = train[['latitude', 'longitude']], train['label']

## Cross validation of models

In [5]:
kf = KFold(n_splits=CV, shuffle=True, random_state=3)

In [10]:
%%capture output
scores = []
for model in MODELS:
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, ], X.iloc[test_index, ]
        y_train, y_test = y.iloc[train_index, ], y.iloc[test_index, ]
        model_name = model.__class__.__name__

        # model fitting
        model.fit(X_train, y_train)

        #predictions
        y_pred = model.predict(X_test)

        score = f1_score(y_test, y_pred, average="macro")
        scores.append((model_name, score))


Learning rate set to 0.081118
0:	learn: 1.0669644	total: 997us	remaining: 997ms
1:	learn: 1.0413190	total: 2.35ms	remaining: 1.17s
2:	learn: 1.0202576	total: 3.33ms	remaining: 1.11s
3:	learn: 0.9988116	total: 4.33ms	remaining: 1.08s
4:	learn: 0.9794805	total: 5.41ms	remaining: 1.07s
5:	learn: 0.9621327	total: 6.25ms	remaining: 1.03s
6:	learn: 0.9483044	total: 7.04ms	remaining: 999ms
7:	learn: 0.9366570	total: 7.98ms	remaining: 990ms
8:	learn: 0.9267382	total: 8.85ms	remaining: 974ms
9:	learn: 0.9169474	total: 9.67ms	remaining: 957ms
10:	learn: 0.9062855	total: 10.5ms	remaining: 948ms
11:	learn: 0.8966573	total: 11.5ms	remaining: 950ms
12:	learn: 0.8883683	total: 12.4ms	remaining: 942ms
13:	learn: 0.8819800	total: 13.3ms	remaining: 938ms
14:	learn: 0.8766169	total: 14.3ms	remaining: 937ms
15:	learn: 0.8710003	total: 15.2ms	remaining: 934ms
16:	learn: 0.8654528	total: 16.1ms	remaining: 930ms
17:	learn: 0.8609480	total: 16.9ms	remaining: 923ms
18:	learn: 0.8567690	total: 17.9ms	remaining:

In [7]:
scores_df = pd.DataFrame(scores, columns=["Model", "F1_score"])

In [8]:
pd.concat([scores_df.groupby("Model").mean(), scores_df.groupby("Model").std()], axis=1)

Unnamed: 0_level_0,F1_score,F1_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
AdaBoostClassifier,0.536309,0.063123
CatBoostClassifier,0.566972,0.070085
DecisionTreeClassifier,0.498554,0.060411
GaussianNB,0.345088,0.047561
GradientBoostingClassifier,0.50503,0.06687
KNeighborsClassifier,0.595256,0.054783
LGBMClassifier,0.577145,0.045409
LinearSVC,0.217658,0.081998
LogisticRegression,0.291441,0.041737
RandomForestClassifier,0.589675,0.058953


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED, stratify=y)

In [9]:
knm = KNeighborsClassifier()

In [10]:
knm.fit(X_train, y_train)

In [11]:
knm_preds = knm.predict(X_test)

In [12]:
f1_score(y_test, knm_preds, average="macro")

0.5550902345469408

In [13]:
knm.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [15]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [16]:
rfc_preds = rfc.predict(X_test)

In [17]:
f1_score(y_test, rfc_preds, average="macro")

0.6665508446321864

In [18]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}