In [2]:
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier

In [3]:
RANDOM_SEED = 257
CV = 10
MODELS = [
    LinearSVC(),
    GaussianNB(),
    LogisticRegression(multi_class="multinomial"),
    LogisticRegression(multi_class="ovr"),
    RidgeClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(max_depth=5),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=5)),
    lgb.LGBMClassifier(),
    RandomForestClassifier(),
    CatBoostClassifier(),
    GradientBoostingClassifier(),
    xgb.XGBClassifier(objective='multi:softprob', metric='logloss')
]

In [5]:
train = pd.read_csv("data/train.csv")

In [6]:
X, y = train[['latitude', 'longitude']], train['label']

## Cross validation of models

In [11]:
kf = KFold(n_splits=CV, shuffle=True, random_state=3)

In [14]:
scores = []
for model in MODELS:
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, ], X.iloc[test_index, ]
        y_train, y_test = y.iloc[train_index, ], y.iloc[test_index, ]
        model_name = model.__class__.__name__

        # model fitting
        model.fit(X_train, y_train)

        #predictions
        y_pred = model.predict(X_test)

        score = f1_score(y_test, y_pred, average="macro")
        scores.append((model_name, score))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.080894
0:	learn: 1.0697857	total: 2.48ms	remaining: 2.48s
1:	learn: 1.0417684	total: 5.94ms	remaining: 2.96s
2:	learn: 1.0210125	total: 8.4ms	remaining: 2.79s
3:	learn: 1.0012885	total: 10.8ms	remaining: 2.69s
4:	learn: 0.9808205	total: 15.3ms	remaining: 3.05s
5:	learn: 0.9635293	total: 18.4ms	remaining: 3.04s
6:	learn: 0.9466687	total: 21.4ms	remaining: 3.03s
7:	learn: 0.9357391	total: 23.9ms	remaining: 2.97s
8:	learn: 0.9237817	total: 26.3ms	remaining: 2.9s
9:	learn: 0.9149934	total: 29.1ms	remaining: 2.88s
10:	learn: 0.9057442	total: 31.7ms	remaining: 2.85s
11:	learn: 0.8972013	total: 34.1ms	remaining: 2.81s
12:	learn: 0.8889850	total: 37.3ms	remaining: 2.83s
13:	learn: 0.8826797	total: 39.5ms	remaining: 2.78s
14:	learn: 0.8765202	total: 41.9ms	remaining: 2.75s
15:	learn: 0.8706800	total: 44.3ms	remaining: 2.73s
16:	learn: 0.8650349	total: 47.1ms	remaining: 2.72s
17:	learn: 0.8601827	total: 49.9ms	remaining: 2.72s
18:	learn: 0.8562040	total: 52.2ms	remaining: 

In [17]:
scores_df = pd.DataFrame(scores, columns=["Model", "F1_score"])

In [22]:
pd.concat([scores_df.groupby("Model").mean(), scores_df.groupby("Model").std()], axis=1)

Unnamed: 0_level_0,F1_score,F1_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
AdaBoostClassifier,0.534684,0.044207
CatBoostClassifier,0.580843,0.049868
DecisionTreeClassifier,0.489996,0.052803
GaussianNB,0.343847,0.035538
GradientBoostingClassifier,0.50885,0.061181
KNeighborsClassifier,0.594214,0.040882
LGBMClassifier,0.587058,0.046596
LinearSVC,0.246187,0.094688
LogisticRegression,0.29447,0.029968
RandomForestClassifier,0.590371,0.043542


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED, stratify=y)

In [9]:
knm = KNeighborsClassifier()

In [10]:
knm.fit(X_train, y_train)

In [11]:
knm_preds = knm.predict(X_test)

In [12]:
f1_score(y_test, knm_preds, average="macro")

0.5550902345469408

In [13]:
knm.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [15]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [16]:
rfc_preds = rfc.predict(X_test)

In [17]:
f1_score(y_test, rfc_preds, average="macro")

0.6665508446321864

In [18]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}