In [83]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score,roc_curve, auc
from sklearn.model_selection import GridSearchCV


In [68]:
df = pd.read_csv('citibike_data/citibike_2018.csv')

### Test of dropping tripduration:

In [69]:
df = df.drop('tripduration', axis=1)

In [70]:
df_small = df.sample(15000)

In [71]:
df_small.head()

Unnamed: 0,start station id,usertype,birth year,gender,start_month,start_day_of_week,start_hour,ave_temp,precip,snow_depth,stop_nhbr
1753873,382.0,1,1974,1,12,3,19,35.0,0.0,0,NoHo
618234,495.0,1,1966,1,6,2,17,64.5,0.0,0,Financial District
527857,3308.0,1,2000,2,5,3,6,60.0,0.25,0,Greenwich Village
1323561,3235.0,1,1963,1,9,2,9,75.5,0.21,0,Clinton
499956,505.0,1,1986,1,5,3,9,73.0,0.0,0,Lower East Side


In [72]:
labels = df_small.stop_nhbr
labels_removed_df = df_small.drop('stop_nhbr', axis=1)

In [73]:
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(labels_removed_df.astype(float))

In [74]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, stratify=labels )

In [75]:
clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=2, min_child_weight=4, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [76]:
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=2, min_child_weight=4, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [77]:
training_preds = clf.predict(X_train)

In [78]:
val_preds = clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 34.94%
Validation accuracy: 5.893%


In [58]:
param_grid = {
    "learning_rate": [0.5, 0.7, 0.1],
    'max_depth': [2, 3, 4],
    'min_child_weight': [4, 5],
    'n_estimators': [300, 400, 500],
}

In [59]:
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
eval_metric = ["auc", 'error']
eval_set = [(X_train, y_train), (X_test, y_test)]
grid_clf.fit(X_train, y_train, eval_set= eval_set, early_stopping_rounds=5)
best_parameters = grid_clf.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

training_preds = grid_clf.predict(X_train)
val_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))



[0]	validation_0-merror:0.920356	validation_1-merror:0.930133
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.916622	validation_1-merror:0.930133
[2]	validation_0-merror:0.913778	validation_1-merror:0.931733
[3]	validation_0-merror:0.911378	validation_1-merror:0.9304
[4]	validation_0-merror:0.909156	validation_1-merror:0.929867
[5]	validation_0-merror:0.909244	validation_1-merror:0.928
[6]	validation_0-merror:0.907733	validation_1-merror:0.9296
[7]	validation_0-merror:0.905333	validation_1-merror:0.9296
[8]	validation_0-merror:0.903289	validation_1-merror:0.927733
[9]	validation_0-merror:0.901778	validation_1-merror:0.928267
[10]	validation_0-merror:0.902578	validation_1-merror:0.927733
[11]	validation_0-merror:0.8984	validation_1-merror:0.926933
[12]	validation_0-merror:0.897244	validation_1-merror:0.927733
[13]	validation_0-merror:0.896978	valida

[0]	validation_0-merror:0.925067	validation_1-merror:0.9392
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.918844	validation_1-merror:0.933067
[2]	validation_0-merror:0.915378	validation_1-merror:0.930667
[3]	validation_0-merror:0.911822	validation_1-merror:0.9296
[4]	validation_0-merror:0.9096	validation_1-merror:0.931467
[5]	validation_0-merror:0.907644	validation_1-merror:0.931467
[6]	validation_0-merror:0.907289	validation_1-merror:0.932533
[7]	validation_0-merror:0.903022	validation_1-merror:0.929333
[8]	validation_0-merror:0.902044	validation_1-merror:0.930133
[9]	validation_0-merror:0.897867	validation_1-merror:0.932
[10]	validation_0-merror:0.896178	validation_1-merror:0.932
[11]	validation_0-merror:0.895022	validation_1-merror:0.9312
[12]	validation_0-merror:0.893067	validation_1-merror:0.930933
Stopping. Best iteration:
[7]	validation_0-

[0]	validation_0-merror:0.924978	validation_1-merror:0.940533
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.918311	validation_1-merror:0.9336
[2]	validation_0-merror:0.914667	validation_1-merror:0.9312
[3]	validation_0-merror:0.911556	validation_1-merror:0.930667
[4]	validation_0-merror:0.909956	validation_1-merror:0.934133
[5]	validation_0-merror:0.909867	validation_1-merror:0.9352
[6]	validation_0-merror:0.908178	validation_1-merror:0.933867
[7]	validation_0-merror:0.904	validation_1-merror:0.930933
[8]	validation_0-merror:0.900533	validation_1-merror:0.9312
Stopping. Best iteration:
[3]	validation_0-merror:0.911556	validation_1-merror:0.930667

[0]	validation_0-merror:0.914044	validation_1-merror:0.938133
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't 

[8]	validation_0-merror:0.870667	validation_1-merror:0.939467
[9]	validation_0-merror:0.8656	validation_1-merror:0.9368
[10]	validation_0-merror:0.864533	validation_1-merror:0.937067
[11]	validation_0-merror:0.863644	validation_1-merror:0.936
[12]	validation_0-merror:0.859644	validation_1-merror:0.933867
[13]	validation_0-merror:0.857156	validation_1-merror:0.936267
[14]	validation_0-merror:0.854222	validation_1-merror:0.9368
[15]	validation_0-merror:0.850933	validation_1-merror:0.936267
[16]	validation_0-merror:0.846933	validation_1-merror:0.937333
[17]	validation_0-merror:0.842844	validation_1-merror:0.936267
Stopping. Best iteration:
[12]	validation_0-merror:0.859644	validation_1-merror:0.933867

[0]	validation_0-merror:0.915556	validation_1-merror:0.933067
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.905511	validation_1-merror:0.930133
[2]	v

[6]	validation_0-merror:0.883556	validation_1-merror:0.930133
[7]	validation_0-merror:0.878311	validation_1-merror:0.932533
Stopping. Best iteration:
[2]	validation_0-merror:0.901422	validation_1-merror:0.9264

[0]	validation_0-merror:0.904178	validation_1-merror:0.930667
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.897067	validation_1-merror:0.935467
[2]	validation_0-merror:0.888178	validation_1-merror:0.935467
[3]	validation_0-merror:0.881156	validation_1-merror:0.934667
[4]	validation_0-merror:0.875556	validation_1-merror:0.9368
[5]	validation_0-merror:0.867644	validation_1-merror:0.933333
Stopping. Best iteration:
[0]	validation_0-merror:0.904178	validation_1-merror:0.930667

[0]	validation_0-merror:0.912178	validation_1-merror:0.938133
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.900533	validation_1-merror:0.937333
[2]	validation_0-merror:0.894222	validation_1-merror:0.934933
[3]	validation_0-merror:0.885689	validation_1-merror:0.9352
[4]	validation_0-merror:0.878044	validation_1-merror:0.9344
[5]	validation_0-merror:0.871378	validation_1-merror:0.930933
Stopping. Best iteration:
[0]	validation_0-merror:0.909067	validation_1-merror:0.927733

[0]	validation_0-merror:0.911467	validation_1-merror:0.933867
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.897511	validation_1-merror:0.9368
[2]	validation_0-merror:0.891644	validation_1-merror:0.938933
[3]	validation_0-merror:0.884533	validation_1-merror:0.939733
[4]	validation_0-merror:0.877333	validation_1-merror:0.938667
[5]	validation_0-merror:0.870133	validation_1-merror:0.937067
Stoppin

[9]	validation_0-merror:0.897067	validation_1-merror:0.932533
[10]	validation_0-merror:0.8952	validation_1-merror:0.930933
[11]	validation_0-merror:0.892356	validation_1-merror:0.934933
[12]	validation_0-merror:0.890311	validation_1-merror:0.933333
Stopping. Best iteration:
[7]	validation_0-merror:0.9016	validation_1-merror:0.927467

[0]	validation_0-merror:0.920356	validation_1-merror:0.930133
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.915822	validation_1-merror:0.930133
[2]	validation_0-merror:0.913867	validation_1-merror:0.929067
[3]	validation_0-merror:0.912178	validation_1-merror:0.928533
[4]	validation_0-merror:0.910311	validation_1-merror:0.9304
[5]	validation_0-merror:0.907467	validation_1-merror:0.9336
[6]	validation_0-merror:0.905867	validation_1-merror:0.931467
[7]	validation_0-merror:0.902756	validation_1-merror:0.9328
[8]	validati

[7]	validation_0-merror:0.904889	validation_1-merror:0.9336
Stopping. Best iteration:
[2]	validation_0-merror:0.914667	validation_1-merror:0.926667

[0]	validation_0-merror:0.925067	validation_1-merror:0.930933
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.920711	validation_1-merror:0.935467
[2]	validation_0-merror:0.9184	validation_1-merror:0.934667
[3]	validation_0-merror:0.914578	validation_1-merror:0.936533
[4]	validation_0-merror:0.912711	validation_1-merror:0.936267
[5]	validation_0-merror:0.907289	validation_1-merror:0.933867
Stopping. Best iteration:
[0]	validation_0-merror:0.925067	validation_1-merror:0.930933

[0]	validation_0-merror:0.924978	validation_1-merror:0.940533
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	va

[7]	validation_0-merror:0.876889	validation_1-merror:0.934133
[8]	validation_0-merror:0.873422	validation_1-merror:0.937867
[9]	validation_0-merror:0.867911	validation_1-merror:0.9416
[10]	validation_0-merror:0.864178	validation_1-merror:0.940267
[11]	validation_0-merror:0.860444	validation_1-merror:0.9408
[12]	validation_0-merror:0.856178	validation_1-merror:0.9376
Stopping. Best iteration:
[7]	validation_0-merror:0.876889	validation_1-merror:0.934133

[0]	validation_0-merror:0.917422	validation_1-merror:0.942933
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.910667	validation_1-merror:0.940533
[2]	validation_0-merror:0.905333	validation_1-merror:0.9408
[3]	validation_0-merror:0.897511	validation_1-merror:0.939467
[4]	validation_0-merror:0.891822	validation_1-merror:0.9384
[5]	validation_0-merror:0.888178	validation_1-merror:0.940533
[6]	validati

[1]	validation_0-merror:0.906578	validation_1-merror:0.930933
[2]	validation_0-merror:0.901244	validation_1-merror:0.9296
[3]	validation_0-merror:0.897689	validation_1-merror:0.930133
[4]	validation_0-merror:0.892533	validation_1-merror:0.930133
[5]	validation_0-merror:0.888089	validation_1-merror:0.931733
[6]	validation_0-merror:0.883289	validation_1-merror:0.931467
[7]	validation_0-merror:0.876533	validation_1-merror:0.933333
Stopping. Best iteration:
[2]	validation_0-merror:0.901244	validation_1-merror:0.9296

[0]	validation_0-merror:0.904178	validation_1-merror:0.930667
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.895111	validation_1-merror:0.9336
[2]	validation_0-merror:0.886044	validation_1-merror:0.9328
[3]	validation_0-merror:0.877511	validation_1-merror:0.9296
[4]	validation_0-merror:0.8712	validation_1-merror:0.9296
[5]	validation_0-me

[3]	validation_0-merror:0.8792	validation_1-merror:0.932267
[4]	validation_0-merror:0.873067	validation_1-merror:0.9312
[5]	validation_0-merror:0.861511	validation_1-merror:0.930933
Stopping. Best iteration:
[0]	validation_0-merror:0.909067	validation_1-merror:0.927733

[0]	validation_0-merror:0.911467	validation_1-merror:0.933867
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.899733	validation_1-merror:0.9376
[2]	validation_0-merror:0.891911	validation_1-merror:0.9352
[3]	validation_0-merror:0.882578	validation_1-merror:0.936
[4]	validation_0-merror:0.876978	validation_1-merror:0.940533
[5]	validation_0-merror:0.868711	validation_1-merror:0.9392
Stopping. Best iteration:
[0]	validation_0-merror:0.911467	validation_1-merror:0.933867

[0]	validation_0-merror:0.909333	validation_1-merror:0.939733
Multiple eval metrics have been passed: 'validation_1

[2]	validation_0-merror:0.919022	validation_1-merror:0.928533
[3]	validation_0-merror:0.914756	validation_1-merror:0.928
[4]	validation_0-merror:0.913778	validation_1-merror:0.9256
[5]	validation_0-merror:0.914222	validation_1-merror:0.9272
[6]	validation_0-merror:0.912	validation_1-merror:0.9296
[7]	validation_0-merror:0.911556	validation_1-merror:0.9296
[8]	validation_0-merror:0.910044	validation_1-merror:0.928267
[9]	validation_0-merror:0.909333	validation_1-merror:0.928
Stopping. Best iteration:
[4]	validation_0-merror:0.913778	validation_1-merror:0.9256

[0]	validation_0-merror:0.920356	validation_1-merror:0.930133
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.915911	validation_1-merror:0.930667
[2]	validation_0-merror:0.916711	validation_1-merror:0.932267
[3]	validation_0-merror:0.914311	validation_1-merror:0.928267
[4]	validation_0-merror:

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.921689	validation_1-merror:0.931733
[2]	validation_0-merror:0.920889	validation_1-merror:0.9336
[3]	validation_0-merror:0.918044	validation_1-merror:0.929867
[4]	validation_0-merror:0.917422	validation_1-merror:0.930133
[5]	validation_0-merror:0.918578	validation_1-merror:0.931733
[6]	validation_0-merror:0.916	validation_1-merror:0.9328
[7]	validation_0-merror:0.914667	validation_1-merror:0.9312
[8]	validation_0-merror:0.914311	validation_1-merror:0.930133
Stopping. Best iteration:
[3]	validation_0-merror:0.918044	validation_1-merror:0.929867

[0]	validation_0-merror:0.924978	validation_1-merror:0.940533
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.921689	validation_1-merror:0.930933
[2]	validation_0-merror:0.919378	validation_1-merror:0.928533
[3]	valida

[6]	validation_0-merror:0.903022	validation_1-merror:0.933333
[7]	validation_0-merror:0.901067	validation_1-merror:0.936533
Stopping. Best iteration:
[2]	validation_0-merror:0.908622	validation_1-merror:0.931467

[0]	validation_0-merror:0.915556	validation_1-merror:0.933067
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.907022	validation_1-merror:0.928
[2]	validation_0-merror:0.907289	validation_1-merror:0.931467
[3]	validation_0-merror:0.905156	validation_1-merror:0.9304
[4]	validation_0-merror:0.900533	validation_1-merror:0.9304
[5]	validation_0-merror:0.899467	validation_1-merror:0.9296
[6]	validation_0-merror:0.897333	validation_1-merror:0.929067
Stopping. Best iteration:
[1]	validation_0-merror:0.907022	validation_1-merror:0.928

[0]	validation_0-merror:0.914044	validation_1-merror:0.938133
Multiple eval metrics have been passed: 'validation_

[5]	validation_0-merror:0.902933	validation_1-merror:0.929867
[6]	validation_0-merror:0.902133	validation_1-merror:0.932267
[7]	validation_0-merror:0.901333	validation_1-merror:0.932267
Stopping. Best iteration:
[2]	validation_0-merror:0.908622	validation_1-merror:0.927467

[0]	validation_0-merror:0.917511	validation_1-merror:0.9384
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.913422	validation_1-merror:0.935467
[2]	validation_0-merror:0.910578	validation_1-merror:0.932533
[3]	validation_0-merror:0.908356	validation_1-merror:0.930667
[4]	validation_0-merror:0.905156	validation_1-merror:0.9352
[5]	validation_0-merror:0.903111	validation_1-merror:0.932533
[6]	validation_0-merror:0.903733	validation_1-merror:0.930667
[7]	validation_0-merror:0.901156	validation_1-merror:0.9352
[8]	validation_0-merror:0.899111	validation_1-merror:0.9328
Stopping. Bes

[4]	validation_0-merror:0.894844	validation_1-merror:0.934667
[5]	validation_0-merror:0.893422	validation_1-merror:0.9296
[6]	validation_0-merror:0.891022	validation_1-merror:0.931733
[7]	validation_0-merror:0.888622	validation_1-merror:0.933067
[8]	validation_0-merror:0.883911	validation_1-merror:0.9352
[9]	validation_0-merror:0.8816	validation_1-merror:0.936267
[10]	validation_0-merror:0.880444	validation_1-merror:0.938933
Stopping. Best iteration:
[5]	validation_0-merror:0.893422	validation_1-merror:0.9296

[0]	validation_0-merror:0.909244	validation_1-merror:0.939467
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.901333	validation_1-merror:0.930933
[2]	validation_0-merror:0.899111	validation_1-merror:0.9304
[3]	validation_0-merror:0.894844	validation_1-merror:0.9288
[4]	validation_0-merror:0.892356	validation_1-merror:0.925333
[5]	validation_0

[5]	validation_0-merror:0.892533	validation_1-merror:0.933333
[6]	validation_0-merror:0.888089	validation_1-merror:0.932
[7]	validation_0-merror:0.884178	validation_1-merror:0.933067
[8]	validation_0-merror:0.885689	validation_1-merror:0.932533
[9]	validation_0-merror:0.884178	validation_1-merror:0.932
[10]	validation_0-merror:0.883733	validation_1-merror:0.9328
[11]	validation_0-merror:0.879111	validation_1-merror:0.9328
Stopping. Best iteration:
[6]	validation_0-merror:0.888089	validation_1-merror:0.932

[0]	validation_0-merror:0.909333	validation_1-merror:0.939733
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.901156	validation_1-merror:0.930933
[2]	validation_0-merror:0.898044	validation_1-merror:0.929333
[3]	validation_0-merror:0.895111	validation_1-merror:0.932
[4]	validation_0-merror:0.890844	validation_1-merror:0.928
[5]	validation_0-merro

In [84]:
acc = accuracy_score(y_test, val_preds) * 100

In [85]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, val_preds)

ValueError: multiclass format is not supported

In [86]:
roc_auc = auc(false_positive_rate, true_positive_rate)

NameError: name 'false_positive_rate' is not defined

In [None]:
print('Accuracy is: {0}'.format(acc))
print('\nAUC is: {0}'.format(round(roc_auc, 2)))
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)