In [14]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib
# NOTE(review): nothing in the visible cells uses `cross_validation` (they import
# from `sklearn.model_selection` instead), and the module was removed from
# scikit-learn in 0.20, so this line raises ImportError on newer installs --
# confirm it is unused further down and drop it.
from sklearn import cross_validation
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve

import warnings
# Silences every warning for the rest of the kernel session. This also hides
# deprecation and pandas copy warnings triggered by later cells.
warnings.filterwarnings("ignore")

In [15]:
# Read the training table and swap every -999999 cell for 2.
# NOTE(review): presumably -999999 is a missing-value sentinel -- confirm.
data = pd.read_csv("train.csv").replace(to_replace=-999999, value=2)

# Labels come from the TARGET column; the inputs are every column but the last.
# NOTE(review): this assumes TARGET is the final column of train.csv -- confirm.
y = data["TARGET"]
X = data.iloc[:, :-1]

In [16]:
# The 30 column names kept as model inputs (order is preserved downstream).
features = [
    'var15',
    'var38',
    'saldo_var30',
    'saldo_medio_var5_hace2',
    'saldo_var37',
    'num_var45_ult3',
    'saldo_medio_var5_hace3',
    'num_var22_hace3',
    'num_var22_ult1',
    'imp_trans_var37_ult1',
    'num_meses_var39_vig_ult3',
    'saldo_medio_var5_ult3',
    'imp_ent_var16_ult1',
    'saldo_medio_var5_ult1',
    'imp_var43_emit_ult1',
    'num_var22_ult3',
    'num_var45_hace3',
    'num_var45_hace2',
    'saldo_medio_var12_hace2',
    'num_var45_ult1',
    'saldo_var31',
    'saldo_medio_var12_ult3',
    'imp_op_var41_comer_ult3',
    'imp_op_var39_efect_ult1',
    'imp_op_var39_ult1',
    'imp_op_var41_efect_ult1',
    'num_var37',
    'num_var43_recib_ult1',
    'num_op_var41_hace2',
    'num_ent_var16_ult1',
]

# Keep only the selected columns. The explicit copy makes X an independent
# frame, so adding columns to it later does not write into a slice of `data`'s
# selection (which pandas would flag with SettingWithCopyWarning).
X = X[features].copy()


In [17]:
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

# Scale each column (axis=0) to unit norm, then append the first two principal
# components as extra features.
# Reading from X[features] rather than X keeps the cell idempotent: on a re-run
# X already contains PCA1/PCA2, and those must not be fed back into the PCA.
X_normalized = normalize(X[features], axis=0)
# Fixed seed so the projection is reproducible if a randomized SVD solver is used.
pca = PCA(n_components=2, random_state=1)
# NOTE(review): the PCA is fitted on all rows before the train/val/test split in
# the next cell, so the held-out rows influence these two features.
X_pca = pca.fit_transform(X_normalized)
X['PCA1'] = X_pca[:, 0]
X['PCA2'] = X_pca[:, 1]

In [18]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Second cut: carve 20% of the remaining rows out as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=1,
)

# Sanity check: partition shapes, then the number of positive labels in each.
print(*(frame.shape for frame in (X_train, X_test, X_val)))
print(*(labels.sum() for labels in (y_train, y_test, y_val)))

(48652, 32) (15204, 32) (12164, 32)
1909 603 496


In [19]:
clf = xgb.XGBClassifier()

# AUC is tracked on both the training and the validation split; the last entry
# of eval_set (the validation split) is the one that drives early stopping.
clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=50,
    eval_metric="auc",
    eval_set=[(X_train, y_train), (X_val, y_val)],
)

# `best_iteration` is a 0-based round index while `ntree_limit` is a tree COUNT,
# so passing best_iteration drops the best tree (and a value of 0 means "use all
# trees"). `best_ntree_limit` is the matching count set by early stopping.
# NOTE: this score is computed on the training split itself.
print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train, ntree_limit=clf.best_ntree_limit)[:,1]))

[0]	validation_0-auc:0.801656	validation_1-auc:0.802016
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.810507	validation_1-auc:0.8097
[2]	validation_0-auc:0.815832	validation_1-auc:0.817919
[3]	validation_0-auc:0.81711	validation_1-auc:0.817191
[4]	validation_0-auc:0.818338	validation_1-auc:0.816824
[5]	validation_0-auc:0.819803	validation_1-auc:0.818152
[6]	validation_0-auc:0.819725	validation_1-auc:0.817721
[7]	validation_0-auc:0.819488	validation_1-auc:0.817388
[8]	validation_0-auc:0.820409	validation_1-auc:0.81899
[9]	validation_0-auc:0.820576	validation_1-auc:0.818594
[10]	validation_0-auc:0.820702	validation_1-auc:0.817805
[11]	validation_0-auc:0.821672	validation_1-auc:0.817567
[12]	validation_0-auc:0.821817	validation_1-auc:0.81733
[13]	validation_0-auc:0.822821	validation_1-auc:0.817479
[14]	validation_0-auc:0.823396	validation_1-auc:0.818051
[15

In [None]:
# Positive-class probabilities on the held-out test split, computed once and
# reused for both the AUC score and the ROC curve.
y_proba = clf.predict_proba(X_test)[:,1]
print("Roc AUC:", roc_auc_score(y_test, y_proba, average='macro'))
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr)
# Chance-level reference diagonal. The previous `plt.plot(1)` drew a single
# marker-less point, which renders nothing visible.
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.show()

In [30]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Grid-search the number of boosting rounds with 10-fold stratified CV, scored by ROC AUC.
model = xgb.XGBClassifier()
n_estimators = range(50, 200, 50)  # candidates: 50, 100, 150

param_grid = dict(n_estimators=n_estimators)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=4, cv=kfold)
# Tune on the training split only. Fitting the search on X_test/y_test would
# select hyper-parameters using the held-out data and invalidate any test score.
grid_result = grid_search.fit(X_train, y_train)

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Candidate values sampled by the randomized search.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# NOTE: this rebinds `clf`, replacing the fitted model from the earlier
# early-stopping cell with an unfitted estimator.
clf = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic', silent=True, nthread=1)

folds = 3        # CV folds per candidate
param_comb = 5   # number of parameter settings sampled

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Pass the splitter itself rather than `skf.split(X_train, y_train)`: the
# generator is one-shot and yields indices bound to one specific dataset, while
# the splitter produces the same seeded folds from whatever is given to fit().
random_search = RandomizedSearchCV(clf, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4,
                                   cv=skf, verbose=3, random_state=1001 )

# Here we go
random_search.fit(X_train, y_train)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0 
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0 
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0 
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1.5, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0, score=0.844636, total=  55.1s
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1.5, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0, score=0.820051, total=  55.5s
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1.5, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0, score=0.833174, total=  55.8s
[CV] subsample=0.8, min_child_weight=5, max_depth=5, gamma=1, colsample_bytree=0.8 
[

Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.0_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.6.0_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python3/3.6.0_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()


KeyboardInterrupt: 