In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform,randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [4]:
from sklearn.model_selection import train_test_split
import category_encoders

In [7]:
features = pd.read_csv('features_14_reducido.csv')

In [8]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38829 entries, 0 to 38828
Data columns (total 53 columns):
person                                   38829 non-null object
mayor_evento                             38829 non-null object
dispos                                   38829 non-null object
primer_visita                            38829 non-null object
ultima_visita                            38829 non-null object
cant_eventos                             38829 non-null int64
permanencia                              38829 non-null int64
hace_cuanto_visito                       38829 non-null int64
eventos_por_dia                          38829 non-null float64
channel                                  38829 non-null object
region                                   38829 non-null object
personas_mismo_prod_visto                38829 non-null int64
personas_mismo_mayor_evento              38829 non-null int64
personas_mismo_disp                      38829 non-null int64
personas_mismo

In [9]:
kaggle = pd.read_csv('trocafone_kaggle_test.csv')
labels = pd.read_csv('labels_training_set.csv')

In [10]:
training = labels.set_index('person').join(features.set_index('person'))
train_labels = training['label']
train_data = training.drop('label',axis=1)

In [11]:
predict_data = kaggle.set_index('person').join(features.set_index('person'))

In [12]:
encoder = category_encoders.TargetEncoder()
train_data_enc = encoder.fit_transform(np.array(train_data),np.array(train_labels))
predict_data_enc = encoder.transform(np.array(predict_data))

## Random Search

In [13]:
xtrain,xtest,ytrain,ytest = train_test_split(train_data_enc,np.array(train_labels),train_size=0.75,stratify=np.array(train_labels))



In [14]:
params={
    'n_estimators':randint(150,500),
    'learning_rate':uniform(0.01,0.3),
    'subsample':uniform(0.3,0.7),
    'min_child_weight':randint(2,20),
    'max_depth':randint(3,15),
    'gamma':randint(0,10),
    'colsample_bytree':uniform(0.4,0.6),
    'subsample':uniform(0,1),
    'max_delta_step':randint(0,5),
    'scale_pos_weight':uniform(0.1,2),
    
}
grid = RandomizedSearchCV(xgb.XGBClassifier(n_jobs=-1,booster='dart'),
                          param_distributions=params,
                          scoring='roc_auc',
                          cv=2,
                          verbose=1,
                          n_iter=25)

In [15]:
grid.fit(xtrain,ytrain)

Fitting 2 folds for each of 25 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.5min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=25, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B5C5DBDCF8>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B5C5DBDEB8>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B5C8AA5E48>, 'min_...C0>, 'scale_pos_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B5C8AA5F28>},
          pre_dispatch='2*n_jobs', random_state=None, re

In [16]:
grid.score(xtest,ytest)

0.8700112025717207

In [17]:
grid.best_params_

{'colsample_bytree': 0.4855223331690547,
 'gamma': 9,
 'learning_rate': 0.08077390687637791,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 7,
 'n_estimators': 295,
 'scale_pos_weight': 1.1084436167834333,
 'subsample': 0.8286364800042514}

## Pruebo con otro split

In [18]:
xg_classifier = grid.best_estimator_

In [19]:
xtr,xte,ytr,yte = train_test_split(train_data_enc,np.array(train_labels),train_size=0.5,stratify=np.array(train_labels))



In [20]:
xg_classifier.fit(xtr,ytr)

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=0.4855223331690547, gamma=9,
       learning_rate=0.08077390687637791, max_delta_step=0, max_depth=3,
       min_child_weight=7, missing=None, n_estimators=295, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.1084436167834333,
       seed=None, silent=True, subsample=0.8286364800042514)

In [21]:
roc_auc_score(ytr,xg_classifier.predict_proba(xtr)[:,1])

0.9028607298403792

In [22]:
roc_auc_score(yte,xg_classifier.predict_proba(xte)[:,1])

0.8669830371119913

## Hay overfitting, necesito reducirlo

In [30]:
params_ = {'colsample_bytree': 0.6,
 'gamma': 7,
 'learning_rate': 0.02,
 'max_delta_step': 1,
 'max_depth': 3,
 'min_child_weight': 6,
 'n_estimators': 246,
 'scale_pos_weight': 1.9522454207022248,
 'subsample': 0.4}
xgb_classifier = xgb.XGBClassifier(**params_)

In [31]:
skf = StratifiedKFold(n_splits=10,shuffle=True)

In [32]:
for tr,te in skf.split(train_data_enc,np.array(train_labels)):
    xtr = train_data_enc.iloc[tr,:]
    xte = train_data_enc.iloc[te,:]
    ytr = train_labels[tr]
    yte = train_labels[te]
    xgb_classifier.fit(xtr,ytr)
    print ('Train score: '+ str(roc_auc_score(ytr,xgb_classifier.predict_proba(xtr)[:,1])))
    print ('Test score: '+ str(roc_auc_score(yte,xgb_classifier.predict_proba(xte)[:,1])))
    print (' ')

Train score: 0.8854678801397995
Test score: 0.8985844880251449
 
Train score: 0.8902928300112489
Test score: 0.8465016158307141
 
Train score: 0.8879034374448996
Test score: 0.8691287794944441
 
Train score: 0.8885609176360919
Test score: 0.8574970118199123
 
Train score: 0.889172010997958
Test score: 0.8515342110799827
 
Train score: 0.8866208709992344
Test score: 0.8862712746520204
 
Train score: 0.8891007350240843
Test score: 0.8465567453242827
 
Train score: 0.8858697056063097
Test score: 0.9054502973191447
 
Train score: 0.8867440492762312
Test score: 0.8760062896563943
 
Train score: 0.8909252427790878
Test score: 0.8357131783804136
 


In [33]:
xgb_classifier.fit(train_data_enc,train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=7, learning_rate=0.02, max_delta_step=1,
       max_depth=3, min_child_weight=6, missing=None, n_estimators=246,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.9522454207022248,
       seed=None, silent=True, subsample=0.4)

In [34]:
roc_auc_score(train_labels,xgb_classifier.predict_proba(train_data_enc)[:,1])

0.8872870782250191

## Submit

In [35]:
xgb_classifier.fit(train_data_enc,train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=7, learning_rate=0.02, max_delta_step=1,
       max_depth=3, min_child_weight=6, missing=None, n_estimators=246,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.9522454207022248,
       seed=None, silent=True, subsample=0.4)

In [36]:
proba_loco = xgb_classifier.predict_proba(predict_data_enc)[:,1]

In [37]:
submit = kaggle.set_index('person')

In [38]:
submit['label'] = proba_loco

In [39]:
submit.to_csv('submitXG.csv')

In [40]:
proba_loco.mean()

0.08979232