In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import precision_recall_curve, auc, accuracy_score


In [2]:
# Load the engineered-feature table from the relative path 'fe.csv'.
df = pd.read_csv(filepath_or_buffer='fe.csv')

In [3]:
# Keep every column whose name does not start with 'label_'; `col` is reused
# by later cells to slice features and target by position.
col = [name for name in df.columns if not name.startswith('label_')]
df = df[col]

In [4]:
# Features: every kept column except the first and the last.
# Target: the last kept column.
# NOTE(review): this relies on column order in fe.csv — presumably the first
# column is an identifier and the last is the label; confirm.
df_features = df[col[1:-1]]
df_label = df[col[-1]]

# Hold out 33% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_label,
    test_size=0.33,
    random_state=42,
)


In [5]:

# Wrap both splits in DMatrix containers for the native xgb.train / predict API.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)


In [6]:
# Booster settings for a two-class soft-probability model.
param = {
    'max_depth': 3,                 # maximum depth of each tree
    'eta': 0.3,                     # learning rate (step-size shrinkage) per boosting round
    'silent': 1,                    # quiet logging
    'objective': 'multi:softprob',  # emit one probability per class
    'num_class': 2,                 # number of classes in this dataset
}

# Fit the booster for 100 boosting rounds on the training split.
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=100)

In [7]:
# Area under the precision-recall curve on the held-out split.
precision, recall, _ = precision_recall_curve(
    y_test,
    bst.predict(dtest)[:, 1],  # second column of the two-class probability output
)
area = auc(recall, precision)
print(area)


0.5248535661729962


In [54]:
# Rank features by their 'gain' importance, largest first.
x = bst.get_score(importance_type='gain')
sorted_x = sorted(
    x.items(),
    key=lambda name_and_gain: name_and_gain[1],
    reverse=True,
)
sorted_x

[('second_median', 64.93195273583332),
 ('second_std', 56.44265646774999),
 ('gyro_x_std', 41.3779144),
 ('second_mean', 36.497460610000005),
 ('acceleration_y_mad', 29.97953502),
 ('Speed_mean', 26.944353692500002),
 ('acceleration_z_mad', 24.350096469999997),
 ('gyro_z_abs_mean', 21.5712919),
 ('Speed_mean_90_sec', 21.000571745000002),
 ('second_max', 18.17402018375),
 ('Bearing_std', 17.797105655000003),
 ('Bearing_mad', 13.966251934000002),
 ('acceleration_x_max', 13.1982975),
 ('second_mad_120_sec', 12.5750313),
 ('second_mean_10_sec', 12.38619185),
 ('acceleration_y_std', 12.070799551666667),
 ('gyro_y_min', 11.972249),
 ('Speed_skew', 11.382991999285712),
 ('acceleration_z_std', 10.773324185555555),
 ('gyro_z_mad', 10.41396285),
 ('Speed_mean_120_sec', 10.238440113333334),
 ('second_mad_15_sec', 9.935660528333331),
 ('second_skew', 9.606435853461537),
 ('Speed_max', 9.564736553214285),
 ('acceleration_z_skew_15_sec', 8.30029297),
 ('Speed_sum', 8.261687015),
 ('Speed_max_60_sec'

In [57]:
# Tree-shape / regularisation settings shared with the sklearn-style classifier.
params = {'max_depth': 3, 'gamma': 0.3, 'colsample_bytree': 0.2}

# 250 trees at a 0.05 learning rate, trained with 4 parallel jobs.
clf = xgb.XGBClassifier(n_estimators=250, learning_rate=0.05, n_jobs=4, **params)


In [60]:
# Fit the sklearn-style classifier on the training split; the result of fit() is kept as `model`.
model = clf.fit(X=X_train, y=y_train)

In [62]:
# Display per-class probabilities for the held-out rows (one row per sample, one column per class).
model.predict_proba(X_test)

array([[0.9206424 , 0.07935762],
       [0.7482774 , 0.25172257],
       [0.67150146, 0.32849854],
       ...,
       [0.6817523 , 0.3182477 ],
       [0.8937309 , 0.10626913],
       [0.7500552 , 0.24994482]], dtype=float32)

In [7]:
import numpy as np
import pandas as pd

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, RidgeCV

In [17]:
def scorer(truth, predictions):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    truth : true labels, passed to ``precision_recall_curve`` as-is.
    predictions : predicted scores for the same samples.

    Returns
    -------
    The value of ``auc(recall, precision)`` for the resulting curve.
    """
    precision_pts, recall_pts, _thresholds = precision_recall_curve(truth, predictions)
    return auc(recall_pts, precision_pts)

# Shared 5-fold shuffled splitter with a fixed seed; used as the default
# `folds` argument of train_model.
n_fold = 5
folds = KFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=11,
)

def train_model(X,
                y,
                params=None,
                folds=folds,
                model_type='lgb',
                plot_feature_importance=False,
                model=None):
    """Cross-validate an XGBoost booster and score every fold with ``scorer``.

    For each split produced by ``folds.split(X)`` a booster is trained on the
    training rows, used to predict the validation rows, and the predictions
    are scored with ``scorer`` (area under the precision-recall curve). The
    prediction/label counts and the score of every fold are printed.

    Parameters
    ----------
    X : feature table indexed positionally via ``.iloc``
        (presumably a pandas DataFrame).
    y : target aligned row-for-row with ``X``, also indexed via ``.iloc``.
    params : dict, optional
        Booster parameters forwarded to ``xgb.train``. When ``None`` the
        built-in defaults below are used. Previously this argument was
        overwritten on every fold and therefore ignored.
    folds : splitter exposing ``split(X)``; defaults to the module-level
        ``folds`` object as it existed when this function was defined.
    model_type, plot_feature_importance, model :
        Currently unused; kept so existing call sites stay valid.

    Returns
    -------
    list
        One ``scorer`` value per fold, in fold order.
    """
    if params is None:
        # NOTE(review): 'boosting' looks like a LightGBM key (XGBoost's own
        # key is 'booster'), and no 'objective' is set -- the captured
        # training log reports rmse, not a classification metric. Confirm
        # that this is intended before relying on these defaults.
        params = {
         'max_depth': 5,
         "boosting": "gbdt",
         'reg_alpha': 1.1302650970728192,
         'reg_lambda': 0.3603427518866501
         }

    fold_scores = []
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        train_data = xgb.DMatrix(data=X_train, label=y_train)
        valid_data = xgb.DMatrix(data=X_valid, label=y_valid)

        # The last entry of the watchlist drives early stopping.
        watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]

        # NOTE(review): a single boosting round is requested, so each fold
        # trains exactly one tree; raise num_boost_round for a real run.
        booster = xgb.train(dtrain=train_data,
                            num_boost_round=1,
                            evals=watchlist,
                            early_stopping_rounds=1,
                            verbose_eval=1,
                            params=params)
        y_pred_valid = booster.predict(valid_data, ntree_limit=booster.best_ntree_limit)
        print(len(y_pred_valid), len(y_valid))

        fold_score = scorer(y_valid, y_pred_valid)
        print(fold_n, fold_score)
        fold_scores.append(fold_score)

    return fold_scores

# Run the cross-validation loop over the full feature table and label column.
train_model(X=df_features, y=df_label)

[17:12:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[0]	train-rmse:0.450151	valid_data-rmse:0.453116
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 1 rounds.
4000 4000
0 0.49619014008836
[17:12:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[0]	train-rmse:0.450056	valid_data-rmse:0.453088
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 1 rounds.
4000 4000
1 0.49677173197510843
[17:12:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[0]	train-rmse:0.451365	valid_data-rmse:0.451681
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 1 ro

In [83]:
# `score` is never assigned in this notebook, so the cell fails on a fresh
# kernel. The value it displayed is the hold-out PR-AUC computed earlier as
# `area`, so that name is shown instead.
area

0.5248535661729962

In [28]:
# Narrow view: the booking identifier plus two mean-acceleration features.
a = df[[
    'bookingID',
    'acceleration_x_mean',
    'acceleration_z_mean',
]]

In [30]:
# Display the three-column selection built in the previous cell.
a

Unnamed: 0,bookingID,acceleration_x_mean,acceleration_z_mean
0,0,-0.711264,-1.619658
1,1,-0.525406,-2.198999
2,2,0.306786,0.139347
3,4,-0.365117,-2.613639
4,6,0.490616,2.355059
5,7,1.327679,2.139714
6,8,-1.197008,1.707760
7,10,0.954849,-1.576419
8,11,0.564240,0.615137
9,13,0.993263,4.421685
