In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform,randint
from sklearn.model_selection import RandomizedSearchCV

In [2]:
from sklearn.model_selection import train_test_split
import category_encoders

In [151]:
features = pd.read_csv('features_13.csv')

In [152]:
kaggle = pd.read_csv('trocafone_kaggle_test.csv')
labels = pd.read_csv('labels_training_set.csv')

In [153]:
training = labels.set_index('person').join(features.set_index('person'))
train_labels = training['label']
train_data = training.drop('label',axis=1)

In [154]:
predict_data = kaggle.set_index('person').join(features.set_index('person'))

In [155]:
encoder = category_encoders.TargetEncoder()
train_data_enc = encoder.fit_transform(np.array(train_data),np.array(train_labels))
predict_data_enc = encoder.transform(np.array(predict_data))

## Random Search

In [156]:
xtrain,xtest,ytrain,ytest = train_test_split(train_data_enc,np.array(train_labels),train_size=0.75,stratify=np.array(train_labels))



In [157]:
params={
    'n_estimators':randint(150,500),
    'learning_rate':uniform(0.01,0.3),
    'subsample':uniform(0.3,0.7),
    'min_child_weight':randint(5,20),
    'max_depth':randint(6,20),
    'gamma':randint(0,10),
    'colsample_bytree':uniform(0.4,0.6),
    'subsample':uniform(0,1),
    'max_delta_step':randint(0,5)
}
grid = RandomizedSearchCV(xgb.XGBClassifier(n_jobs=-1),
                          param_distributions=params,
                          scoring='roc_auc',
                          cv=2,
                          verbose=1,
                          n_iter=25)

In [158]:
grid.fit(xtrain,ytrain)

Fitting 2 folds for each of 25 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.4min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=25, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C48568B198>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C48568B6A0>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C4859B5A58>, 'min_...5898>, 'max_delta_step': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C4859B5390>},
          pre_dispatch='2*n_jobs', random_state=None, 

In [159]:
grid.score(xtest,ytest)

0.8719630182296395

In [160]:
grid.best_params_

{'colsample_bytree': 0.7476502755257962,
 'gamma': 1,
 'learning_rate': 0.015344184565596445,
 'max_delta_step': 0,
 'max_depth': 11,
 'min_child_weight': 11,
 'n_estimators': 324,
 'subsample': 0.7718583635196341}

## Pruebo con otro split

In [161]:
xg_classifier = grid.best_estimator_

In [174]:
xtr,xte,ytr,yte = train_test_split(train_data_enc,np.array(train_labels),stratify=np.array(train_labels))

In [175]:
xg_classifier.fit(xtr,ytr)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7476502755257962, gamma=1,
       learning_rate=0.015344184565596445, max_delta_step=0, max_depth=11,
       min_child_weight=11, missing=None, n_estimators=324, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7718583635196341)

In [176]:
roc_auc_score(yte,xg_classifier.predict_proba(xte)[:,1])

0.8663245380599626

## Submit

In [168]:
xg_classifier.fit(train_data_enc,train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7476502755257962, gamma=1,
       learning_rate=0.015344184565596445, max_delta_step=0, max_depth=11,
       min_child_weight=11, missing=None, n_estimators=324, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7718583635196341)

In [169]:
proba_loco = xg_classifier.predict_proba(predict_data_enc)[:,1]

In [170]:
submit = kaggle.set_index('person')

In [171]:
submit['label'] = proba_loco

In [172]:
submit.to_csv('submit.csv')

In [173]:
proba_loco.mean()

0.053208914