In [123]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform,randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [3]:
from sklearn.model_selection import train_test_split
import category_encoders

In [4]:
features = pd.read_csv('features_13.csv')

In [5]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38829 entries, 0 to 38828
Columns: 119 entries, person to ultima_visita_seconds
dtypes: bool(57), float64(10), int64(38), object(14)
memory usage: 20.5+ MB


In [6]:
kaggle = pd.read_csv('trocafone_kaggle_test.csv')
labels = pd.read_csv('labels_training_set.csv')

In [7]:
training = labels.set_index('person').join(features.set_index('person'))
train_labels = training['label']
train_data = training.drop('label',axis=1)

In [8]:
predict_data = kaggle.set_index('person').join(features.set_index('person'))

In [11]:
encoder = category_encoders.TargetEncoder()
train_data_enc = encoder.fit_transform(np.array(train_data),np.array(train_labels))
predict_data_enc = encoder.transform(np.array(predict_data))

## Random Search

In [12]:
xtrain,xtest,ytrain,ytest = train_test_split(train_data_enc,np.array(train_labels),train_size=0.75,stratify=np.array(train_labels))



In [42]:
params={
    'n_estimators':randint(150,500),
    'learning_rate':uniform(0.01,0.3),
    'subsample':uniform(0.3,0.7),
    'min_child_weight':randint(2,20),
    'max_depth':randint(3,15),
    'gamma':randint(0,10),
    'colsample_bytree':uniform(0.4,0.6),
    'subsample':uniform(0,1),
    'max_delta_step':randint(0,5),
    'scale_pos_weight':uniform(0.1,2),
    
}
grid = RandomizedSearchCV(xgb.XGBClassifier(n_jobs=-1,booster='dart'),
                          param_distributions=params,
                          scoring='roc_auc',
                          cv=2,
                          verbose=1,
                          n_iter=25)

In [43]:
grid.fit(xtrain,ytrain)

Fitting 2 folds for each of 25 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  7.2min finished


RandomizedSearchCV(cv=2, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=25, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000233E1C8E588>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000233E1C8E6D8>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000233E1C908D0>, 'min_...38>, 'scale_pos_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000233E1C909E8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
       

In [44]:
grid.score(xtest,ytest)

0.8703804889280511

In [46]:
grid.best_params_

{'colsample_bytree': 0.6970524442540915,
 'gamma': 9,
 'learning_rate': 0.15247016289317203,
 'max_delta_step': 1,
 'max_depth': 5,
 'min_child_weight': 3,
 'n_estimators': 246,
 'scale_pos_weight': 1.9522454207022248,
 'subsample': 0.9182821643357728}

## Pruebo con otro split

In [73]:
xg_classifier = grid.best_estimator_

In [74]:
xtr,xte,ytr,yte = train_test_split(train_data_enc,np.array(train_labels),train_size=0.5,stratify=np.array(train_labels))



In [75]:
xg_classifier.fit(xtr,ytr)

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=0.6970524442540915, gamma=9,
       learning_rate=0.15247016289317203, max_delta_step=1, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=246, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.9522454207022248,
       seed=None, silent=True, subsample=0.9182821643357728)

In [76]:
roc_auc_score(ytr,xg_classifier.predict_proba(xtr)[:,1])

0.97894020144675

In [77]:
roc_auc_score(yte,xg_classifier.predict_proba(xte)[:,1])

0.8622861925501459

## Hay overfitting, necesito reducirlo

In [153]:
params_ = {'colsample_bytree': 0.6,
 'gamma': 7,
 'learning_rate': 0.02,
 'max_delta_step': 1,
 'max_depth': 3,
 'min_child_weight': 6,
 'n_estimators': 246,
 'scale_pos_weight': 1.9522454207022248,
 'subsample': 0.4}
xgb_classifier = xgb.XGBClassifier(**params_)

In [154]:
skf = StratifiedKFold(n_splits=10,shuffle=True)

In [155]:
for tr,te in skf.split(train_data_enc,np.array(train_labels)):
    xtr = train_data_enc.iloc[tr,:]
    xte = train_data_enc.iloc[te,:]
    ytr = train_labels[tr]
    yte = train_labels[te]
    xgb_classifier.fit(xtr,ytr)
    print ('Train score: '+ str(roc_auc_score(ytr,xgb_classifier.predict_proba(xtr)[:,1])))
    print ('Test score: '+ str(roc_auc_score(yte,xgb_classifier.predict_proba(xte)[:,1])))
    print (' ')

Train score: 0.8943488345709993
Test score: 0.8568108371331178
 
Train score: 0.8918027689275428
Test score: 0.8816846695294169
 
Train score: 0.8915488116082279
Test score: 0.8842024879366063
 
Train score: 0.8938464897713153
Test score: 0.8544147593961663
 
Train score: 0.892086227937421
Test score: 0.8709734572070825
 
Train score: 0.8928788058329031
Test score: 0.8451006012822926
 
Train score: 0.890118758209892
Test score: 0.8819139158647724
 
Train score: 0.8910396738608248
Test score: 0.8755688927768611
 
Train score: 0.8906629294274919
Test score: 0.8817865724694652
 
Train score: 0.8909757099954885
Test score: 0.877047183496296
 


In [157]:
xgb_classifier.fit(train_data_enc,train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=7, learning_rate=0.02, max_delta_step=1,
       max_depth=3, min_child_weight=6, missing=None, n_estimators=246,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.9522454207022248,
       seed=None, silent=True, subsample=0.4)

In [158]:
roc_auc_score(train_labels,xgb_classifier.predict_proba(train_data_enc)[:,1])

0.8907591174692726

## Submit

In [168]:
xgb_classifier.fit(train_data_enc,train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7476502755257962, gamma=1,
       learning_rate=0.015344184565596445, max_delta_step=0, max_depth=11,
       min_child_weight=11, missing=None, n_estimators=324, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7718583635196341)

In [169]:
proba_loco = xgb_classifier.predict_proba(predict_data_enc)[:,1]

In [170]:
submit = kaggle.set_index('person')

In [171]:
submit['label'] = proba_loco

In [172]:
submit.to_csv('submit.csv')

In [173]:
proba_loco.mean()

0.053208914