In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import warnings
from matplotlib import pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

In [3]:
df_train = pd.read_excel('./data/train_clientes.xlsx', index_col='ID_CORRELATIVO')
df_test = pd.read_excel('./data/test_clientes.xlsx', index_col='ID_CORRELATIVO')

## Data Clientes

In [4]:
x_train = df_train.drop(['ATTRITION'], axis=1)
y_train = df_train['ATTRITION']
x_test = df_test

In [5]:
# Join train and test df to aplly feature engineering
df = pd.concat([x_train, x_test])
print df.shape

(100000, 51)


In [6]:
# Split columns by type
num_cols = df.select_dtypes(exclude=['datetime', 'object']).columns.tolist()
cat_cols = [col for col in df.columns if col not in num_cols]
cat_cols

[u'RANG_INGRESO',
 u'FLAG_LIMA_PROVINCIA',
 u'RANG_SDO_PASIVO_MENOS0',
 u'RANG_NRO_PRODUCTOS_MENOS0']

In [7]:
df[cat_cols[1]] = df[cat_cols[1]].map(df[cat_cols[1]].value_counts()/df[cat_cols[1]].value_counts().sum())
df[cat_cols[1]].head()

ID_CORRELATIVO
35653    0.626166
66575    0.373834
56800    0.373834
8410     0.373834
6853     0.626166
Name: FLAG_LIMA_PROVINCIA, dtype: float64

In [8]:
cat_cols.remove('FLAG_LIMA_PROVINCIA')

In [9]:
for col in cat_cols:
    df[col] = df[col].astype('category').cat.codes

df.drop('CODMES', inplace=True, axis=1)

In [31]:
from sklearn.decomposition import PCA

decomp = PCA(n_components=10)
df_ = decomp.fit_transform(df.values)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Modelado clientes

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier



In [12]:
x_train_ = df.iloc[0:70000]
x_test_ = df.iloc[70000:]

xtrain, xtest, ytrain, ytest = train_test_split(x_train_, y_train, test_size=.10, random_state=0)

### XGBoost

In [61]:
dtrain = xgb.DMatrix(xtrain, label=ytrain)
dtest = xgb.DMatrix(xtest, label=ytest)

params_xgb = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'eval_metric':'logloss'
}

xgb_model = xgb.train(params_xgb, 
                      dtrain,
                      num_boost_round=999, 
                      early_stopping_rounds=10,
                      evals=[(dtest, "Test")])

[0]	Test-logloss:0.540315
Will train until Test-logloss hasn't improved in 10 rounds.
[1]	Test-logloss:0.456852
[2]	Test-logloss:0.407596
[3]	Test-logloss:0.374685
[4]	Test-logloss:0.354341
[5]	Test-logloss:0.340005
[6]	Test-logloss:0.329807
[7]	Test-logloss:0.324418
[8]	Test-logloss:0.320152
[9]	Test-logloss:0.316743
[10]	Test-logloss:0.313036
[11]	Test-logloss:0.311438
[12]	Test-logloss:0.310112
[13]	Test-logloss:0.30955
[14]	Test-logloss:0.308657
[15]	Test-logloss:0.307889
[16]	Test-logloss:0.307728
[17]	Test-logloss:0.306558
[18]	Test-logloss:0.306367
[19]	Test-logloss:0.306281
[20]	Test-logloss:0.305077
[21]	Test-logloss:0.30452
[22]	Test-logloss:0.30462
[23]	Test-logloss:0.304122
[24]	Test-logloss:0.303944
[25]	Test-logloss:0.303644
[26]	Test-logloss:0.303615
[27]	Test-logloss:0.303854
[28]	Test-logloss:0.304092
[29]	Test-logloss:0.30428
[30]	Test-logloss:0.30363
[31]	Test-logloss:0.303794
[32]	Test-logloss:0.303731
[33]	Test-logloss:0.303704
[34]	Test-logloss:0.303787
[35]	Test-

In [62]:
xgb.cv(params_xgb, 
       xgb.DMatrix(x_train_, label=y_train), 
       num_boost_round=999,
       early_stopping_rounds=10, 
       nfold=5, 
       seed=0,
       metrics='logloss')

Unnamed: 0,test-logloss-mean,test-logloss-std,train-logloss-mean,train-logloss-std
0,0.540111,0.001387,0.539419,0.000608
1,0.456349,0.002412,0.455057,0.000829
2,0.406292,0.003238,0.404449,0.001026
3,0.37384,0.003555,0.371301,0.001229
4,0.353053,0.004043,0.349708,0.001401
5,0.339353,0.004531,0.335195,0.001321
6,0.330301,0.004938,0.32505,0.001465
7,0.324064,0.004859,0.31789,0.001765
8,0.319504,0.00544,0.312271,0.001434
9,0.316669,0.005591,0.308396,0.001271


In [63]:
gridsearch_params = [(max_depth, min_child_weight)
                    for max_depth in range(9,12)
                    for min_child_weight in range(5,8)]

min_logloss = float("Inf")
best_params = None

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params_xgb['max_depth'] = max_depth
    params_xgb['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params_xgb,
        xgb.DMatrix(x_train_, label=y_train),
        num_boost_round=999,
        seed=0,
        nfold=5,
        metrics='logloss',
        early_stopping_rounds=10
    )

    # Update best logloss
    mean_logloss = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (max_depth, min_child_weight)

print("Best params: {}, {}, Logloss: {}".format(best_params[0], best_params[1], min_logloss))

CV with max_depth=9, min_child_weight=5
	Logloss 0.3025916 for 24 rounds
CV with max_depth=9, min_child_weight=6
	Logloss 0.3024176 for 17 rounds
CV with max_depth=9, min_child_weight=7
	Logloss 0.3015254 for 17 rounds
CV with max_depth=10, min_child_weight=5
	Logloss 0.3026122 for 15 rounds
CV with max_depth=10, min_child_weight=6
	Logloss 0.3022632 for 15 rounds
CV with max_depth=10, min_child_weight=7
	Logloss 0.3016802 for 18 rounds
CV with max_depth=11, min_child_weight=5
	Logloss 0.3025726 for 15 rounds
CV with max_depth=11, min_child_weight=6
	Logloss 0.3018722 for 18 rounds
CV with max_depth=11, min_child_weight=7
	Logloss 0.3019368 for 14 rounds
Best params: 9, 7, Logloss: 0.3015254


In [64]:
params_xgb['max_depth'] = 9
params_xgb['min_child_weight'] = 7

In [65]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_logloss = float("Inf")
best_params = None

for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample_bytree={}".format(
                             subsample,
                             colsample))

    # We update our parameters
    params_xgb['subsample'] = subsample
    params_xgb['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params_xgb,
        xgb.DMatrix(x_train_, label=y_train),
        num_boost_round=999,
        seed=0,
        nfold=5,
        metrics='logloss',
        early_stopping_rounds=10
    )

    # Update best logloss
    mean_logloss = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (subsample, colsample)

print("Best params: {}, {}, Logloss: {}".format(best_params[0], best_params[1], min_logloss))

CV with subsample=1.0, colsample_bytree=1.0
	Logloss 0.3015254 for 17 rounds
CV with subsample=1.0, colsample_bytree=0.9
	Logloss 0.3023684 for 22 rounds
CV with subsample=1.0, colsample_bytree=0.8
	Logloss 0.3014086 for 18 rounds
CV with subsample=1.0, colsample_bytree=0.7
	Logloss 0.3013444 for 19 rounds
CV with subsample=0.9, colsample_bytree=1.0
	Logloss 0.302638 for 17 rounds
CV with subsample=0.9, colsample_bytree=0.9
	Logloss 0.3020914 for 22 rounds
CV with subsample=0.9, colsample_bytree=0.8
	Logloss 0.3029778 for 17 rounds
CV with subsample=0.9, colsample_bytree=0.7
	Logloss 0.3015262 for 22 rounds
CV with subsample=0.8, colsample_bytree=1.0
	Logloss 0.302576 for 24 rounds
CV with subsample=0.8, colsample_bytree=0.9
	Logloss 0.3024208 for 17 rounds
CV with subsample=0.8, colsample_bytree=0.8
	Logloss 0.3036434 for 18 rounds
CV with subsample=0.8, colsample_bytree=0.7
	Logloss 0.3020036 for 19 rounds
CV with subsample=0.7, colsample_bytree=1.0
	Logloss 0.3032336 for 15 rounds
C

In [66]:
params_xgb['subsample'] = 1.0
params_xgb['colsample_bytree'] = 0.7

In [67]:
min_logloss = float("Inf")
best_params = None

for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params_xgb['eta'] = eta
    
    cv_results = xgb.cv(
        params_xgb,
        xgb.DMatrix(x_train_, label=y_train),
        num_boost_round=999,
        seed=0,
        nfold=5,
        metrics='logloss',
        early_stopping_rounds=10
    )

    # Update best score
    mean_logloss = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = eta

print("Best params: {}, Logloss: {}".format(best_params, min_logloss))

CV with eta=0.3
	Logloss 0.3013444 for 19 rounds
CV with eta=0.2
	Logloss 0.3001992 for 35 rounds
CV with eta=0.1
	Logloss 0.2986254 for 65 rounds
CV with eta=0.05
	Logloss 0.2983642 for 146 rounds
CV with eta=0.01
	Logloss 0.2979122 for 737 rounds
CV with eta=0.005
	Logloss 0.299106 for 998 rounds
Best params: 0.01, Logloss: 0.2979122


In [68]:
params_xgb['eta'] = 0.01

In [69]:
params_xgb

{'colsample_bytree': 0.7,
 'eta': 0.01,
 'eval_metric': 'logloss',
 'max_depth': 9,
 'min_child_weight': 7,
 'objective': 'binary:logistic',
 'subsample': 1.0}

In [70]:
best_xgb_model = xgb.train(
    params_xgb,
    xgb.DMatrix(x_train_, label=y_train),
    num_boost_round=737
)

In [71]:
y_pred = best_xgb_model.predict(xgb.DMatrix(x_test_))
submission = pd.DataFrame()
submission['ID_CORRELATIVO'] = x_test_.index
submission['ATTRITION'] = y_pred

submission.to_csv('./data/submission15_XGB.csv', index=False)

### LightGBM

In [13]:
# Lightgbm

dtrain =lgb.Dataset(xtrain,label=ytrain)
dtest = lgb.Dataset(xtest,label=ytest)

params_lgb = {'objective': 'binary',
          'max_depth': 6,
          'learning_rate':.3,
          'max_bin': 200,
          'metric': 'binary_logloss'}


lgb_model = lgb.train(params_lgb,
                      dtrain,
                      num_boost_round=999,
                      early_stopping_rounds=10,
                      valid_sets=[dtest, dtrain],
                      valid_names=["Test", "Train"]
                      )

[1]	Train's binary_logloss: 0.539359	Test's binary_logloss: 0.540194
Training until validation scores don't improve for 10 rounds.
[2]	Train's binary_logloss: 0.455093	Test's binary_logloss: 0.456731
[3]	Train's binary_logloss: 0.404951	Test's binary_logloss: 0.407433
[4]	Train's binary_logloss: 0.3718	Test's binary_logloss: 0.374736
[5]	Train's binary_logloss: 0.351384	Test's binary_logloss: 0.354544
[6]	Train's binary_logloss: 0.337081	Test's binary_logloss: 0.340343
[7]	Train's binary_logloss: 0.326852	Test's binary_logloss: 0.330458
[8]	Train's binary_logloss: 0.320217	Test's binary_logloss: 0.324401
[9]	Train's binary_logloss: 0.315515	Test's binary_logloss: 0.320492
[10]	Train's binary_logloss: 0.311794	Test's binary_logloss: 0.317005
[11]	Train's binary_logloss: 0.308954	Test's binary_logloss: 0.314752
[12]	Train's binary_logloss: 0.306905	Test's binary_logloss: 0.313345
[13]	Train's binary_logloss: 0.304834	Test's binary_logloss: 0.312147
[14]	Train's binary_logloss: 0.302411	T

In [73]:
gridsearch_params = [(max_depth, min_child_weight)
                    for max_depth in range(9,12)
                    for min_child_weight in range(5,8)]

min_logloss = float("Inf")
best_params = None

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params_lgb['max_depth'] = max_depth
    params_lgb['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = lgb.cv(
        params_lgb,
        lgb.Dataset(x_train_, label=y_train),
        num_boost_round=999,
        seed=0,
        nfold=5,
        metrics=['binary_logloss'],
        early_stopping_rounds=10
    )

    # Update best logloss
    mean_logloss = np.min(cv_results['binary_logloss-mean'])
    boost_rounds = len(cv_results['binary_logloss-mean'])
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (max_depth, min_child_weight)

print("Best params: {}, {}, Logloss: {}".format(best_params[0], best_params[1], min_logloss))

CV with max_depth=9, min_child_weight=5
	Logloss 0.302213708203 for 34 rounds
CV with max_depth=9, min_child_weight=6
	Logloss 0.30242043626 for 41 rounds
CV with max_depth=9, min_child_weight=7
	Logloss 0.302346182934 for 37 rounds
CV with max_depth=10, min_child_weight=5
	Logloss 0.302547089373 for 31 rounds
CV with max_depth=10, min_child_weight=6
	Logloss 0.302485790015 for 27 rounds
CV with max_depth=10, min_child_weight=7
	Logloss 0.302089679894 for 37 rounds
CV with max_depth=11, min_child_weight=5
	Logloss 0.302185575685 for 37 rounds
CV with max_depth=11, min_child_weight=6
	Logloss 0.302545624019 for 28 rounds
CV with max_depth=11, min_child_weight=7
	Logloss 0.301524958434 for 35 rounds
Best params: 11, 7, Logloss: 0.301524958434


In [14]:
params_lgb['max_depth'] = 11
params_lgb['min_child_weight'] = 7

In [75]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_logloss = float("Inf")
best_params = None

for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample_bytree={}".format(
                             subsample,
                             colsample))

    # We update our parameters
    params_lgb['subsample'] = subsample
    params_lgb['colsample_bytree'] = colsample

    # Run CV
    cv_results = lgb.cv(
        params_lgb,
        lgb.Dataset(x_train_, label=y_train),
        num_boost_round=999,
        seed=0,
        nfold=5,
        metrics=['binary_logloss'],
        early_stopping_rounds=10
    )

    # Update best logloss
    mean_logloss = np.min(cv_results['binary_logloss-mean'])
    boost_rounds = len(cv_results['binary_logloss-mean'])
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (subsample, colsample)

print("Best params: {}, {}, Logloss: {}".format(best_params[0], best_params[1], min_logloss))

CV with subsample=1.0, colsample_bytree=1.0
	Logloss 0.301524958434 for 35 rounds
CV with subsample=1.0, colsample_bytree=0.9
	Logloss 0.30151304919 for 36 rounds
CV with subsample=1.0, colsample_bytree=0.8
	Logloss 0.301618596407 for 30 rounds
CV with subsample=1.0, colsample_bytree=0.7
	Logloss 0.301977330712 for 33 rounds
CV with subsample=0.9, colsample_bytree=1.0
	Logloss 0.301524958434 for 35 rounds
CV with subsample=0.9, colsample_bytree=0.9
	Logloss 0.30151304919 for 36 rounds
CV with subsample=0.9, colsample_bytree=0.8
	Logloss 0.301618596407 for 30 rounds
CV with subsample=0.9, colsample_bytree=0.7
	Logloss 0.301977330712 for 33 rounds
CV with subsample=0.8, colsample_bytree=1.0
	Logloss 0.301524958434 for 35 rounds
CV with subsample=0.8, colsample_bytree=0.9
	Logloss 0.30151304919 for 36 rounds
CV with subsample=0.8, colsample_bytree=0.8
	Logloss 0.301618596407 for 30 rounds
CV with subsample=0.8, colsample_bytree=0.7
	Logloss 0.301977330712 for 33 rounds
CV with subsample=0

In [15]:
params_lgb['subsample'] = 1.0
params_lgb['colsample_bytree'] = 0.9

In [16]:
min_logloss = float("Inf")
best_params = None

for eta in [.3, .2, .1, .05, .01]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params_lgb['learning_rate'] = eta
    
    # Run CV
    cv_results = lgb.cv(
        params_lgb,
        lgb.Dataset(x_train_, label=y_train),
        num_boost_round=5000,
        seed=0,
        nfold=5,
        metrics=['binary_logloss'],
        early_stopping_rounds=10
    )

    # Update best logloss
    mean_logloss = np.min(cv_results['binary_logloss-mean'])
    boost_rounds = len(cv_results['binary_logloss-mean'])
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = eta

print("Best params: {}, Logloss: {}".format(best_params, min_logloss))

CV with eta=0.3
	Logloss 0.30151304919 for 36 rounds
CV with eta=0.2
	Logloss 0.300409367962 for 61 rounds
CV with eta=0.1
	Logloss 0.299742926475 for 166 rounds
CV with eta=0.05
	Logloss 0.299374569008 for 242 rounds
CV with eta=0.01
	Logloss 0.298904046246 for 1488 rounds
Best params: 0.01, Logloss: 0.298904046246


In [17]:
params_lgb['learning_rate'] = 0.01

In [18]:
best_lgb_model = lgb.train(params_lgb,
                      lgb.Dataset(x_train_, label=y_train),
                      num_boost_round=1488
                      )

In [19]:
y_pred = best_lgb_model.predict(x_test_)

submission = pd.DataFrame()
submission['ID_CORRELATIVO'] = x_test_.index
submission['ATTRITION'] = y_pred

submission.to_csv('./data/submission15_LGB.csv', index=False)

## Catboost

In [20]:
xtrain_ = xtrain.fillna(-999)
xtest_ = xtest.fillna(-999)

cat = CatBoostClassifier(iterations=120, 
                         learning_rate=0.3, 
                         depth=6, 
                         eval_metric='Logloss',
                         l2_leaf_reg=3,
                         border_count=32,
                         od_type='Iter')

cat.fit(xtrain_, ytrain, eval_set=(xtest_, ytest))

0:	learn: 0.4777066	test: 0.4758057	best: 0.4758057 (0)	total: 358ms	remaining: 42.6s
1:	learn: 0.4079779	test: 0.4079152	best: 0.4079152 (1)	total: 540ms	remaining: 31.8s
2:	learn: 0.3671615	test: 0.3666022	best: 0.3666022 (2)	total: 740ms	remaining: 28.9s
3:	learn: 0.3451698	test: 0.3438190	best: 0.3438190 (3)	total: 921ms	remaining: 26.7s
4:	learn: 0.3355541	test: 0.3346152	best: 0.3346152 (4)	total: 1.11s	remaining: 25.7s
5:	learn: 0.3288819	test: 0.3278035	best: 0.3278035 (5)	total: 1.3s	remaining: 24.7s
6:	learn: 0.3241006	test: 0.3238692	best: 0.3238692 (6)	total: 1.51s	remaining: 24.3s
7:	learn: 0.3229458	test: 0.3227808	best: 0.3227808 (7)	total: 1.66s	remaining: 23.2s
8:	learn: 0.3196518	test: 0.3193553	best: 0.3193553 (8)	total: 1.85s	remaining: 22.9s
9:	learn: 0.3177976	test: 0.3175023	best: 0.3175023 (9)	total: 2.03s	remaining: 22.3s
10:	learn: 0.3165599	test: 0.3163700	best: 0.3163700 (10)	total: 2.23s	remaining: 22.1s
11:	learn: 0.3150738	test: 0.3149589	best: 0.3149589 

94:	learn: 0.2825531	test: 0.3020598	best: 0.3020598 (94)	total: 18.5s	remaining: 4.88s
95:	learn: 0.2823513	test: 0.3019790	best: 0.3019790 (95)	total: 18.7s	remaining: 4.69s
96:	learn: 0.2821651	test: 0.3021053	best: 0.3019790 (95)	total: 18.9s	remaining: 4.49s
97:	learn: 0.2819552	test: 0.3019347	best: 0.3019347 (97)	total: 19.1s	remaining: 4.29s
98:	learn: 0.2817580	test: 0.3018878	best: 0.3018878 (98)	total: 19.3s	remaining: 4.1s
99:	learn: 0.2816279	test: 0.3019574	best: 0.3018878 (98)	total: 19.5s	remaining: 3.9s
100:	learn: 0.2814723	test: 0.3018693	best: 0.3018693 (100)	total: 19.7s	remaining: 3.71s
101:	learn: 0.2813288	test: 0.3020008	best: 0.3018693 (100)	total: 19.9s	remaining: 3.52s
102:	learn: 0.2811826	test: 0.3020371	best: 0.3018693 (100)	total: 20.1s	remaining: 3.32s
103:	learn: 0.2810546	test: 0.3020054	best: 0.3018693 (100)	total: 20.3s	remaining: 3.12s
104:	learn: 0.2806258	test: 0.3017774	best: 0.3017774 (104)	total: 20.5s	remaining: 2.93s
105:	learn: 0.2803392	te

<catboost.core._CatBoostBase at 0xfda2e48>

In [21]:
from catboost import cv, Pool

x_train__ = x_train_.fillna(-999)

params_cat = {
    'depth': 6,
    'learning_rate': .3,
    'iterations': 1000,
    'loss_function': 'Logloss',
    'l2_leaf_reg': 3,
    'border_count': 32,
    'od_type': 'Iter'
}

#cv(params, Pool(x_train__, y_train),  partition_random_seed=0, fold_count=5)

In [22]:
gridsearch_params = [depth for depth in range(4,12)]

min_logloss = float("Inf")
best_params = None

for depth in gridsearch_params:
    print("CV with depth={}".format(depth))

    # Update our parameters
    params_cat['depth'] = depth

    # Run CV
    cv_results = cv(
        params_cat,
        Pool(x_train__, y_train),  
        partition_random_seed=0, 
        fold_count=5
    )

    # Update best logloss
    mean_logloss = np.min(cv_results['Logloss_test_avg'])
    boost_rounds = len(cv_results['Logloss_test_avg'])
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = depth
        best_rounds = boost_rounds

print("Best params: {}, Logloss: {}, Rounds: {}".format(best_params, min_logloss, best_rounds))

CV with depth=4
	Logloss 0.303483883959 for 210 rounds
CV with depth=5


KeyboardInterrupt: 

In [23]:
params_cat['depth'] = 7

In [24]:
params_cat

{'border_count': 32,
 'depth': 7,
 'iterations': 1000,
 'l2_leaf_reg': 3,
 'learning_rate': 0.3,
 'loss_function': 'Logloss',
 'od_type': 'Iter'}

In [123]:
gridsearch_params = [10,100,200,500]

min_logloss = float("Inf")
best_params = None

for leaf_reg in gridsearch_params:
    print("CV with leaf_reg={}".format(leaf_reg))

    # Update our parameters
    params_cat['l2_leaf_reg'] = leaf_reg

    # Run CV
    cv_results = cv(
        params_cat,
        Pool(x_train__, y_train),  
        partition_random_seed=0, 
        fold_count=5
    )

    # Update best logloss
    mean_logloss = np.min(cv_results['Logloss_test_avg'])
    boost_rounds = len(cv_results['Logloss_test_avg'])
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = leaf_reg
        best_rounds = boost_rounds

print("Best params: {}, Logloss: {}, Rounds: {}".format(best_params, min_logloss, best_rounds))

CV with leaf_reg=10
	Logloss 0.297993372921 for 405 rounds
CV with leaf_reg=100
	Logloss 0.296618841003 for 533 rounds
CV with leaf_reg=200
	Logloss 0.296863228118 for 533 rounds
CV with leaf_reg=500


KeyboardInterrupt: 

In [25]:
params_cat['l2_leaf_reg'] = 100

In [90]:
min_logloss = float("Inf")
best_params = None

params_cat['iterations'] = 5000

for eta in [0.05]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params_cat['learning_rate'] = eta

    # Run CV
    cv_results = cv(
        params_cat,
        Pool(x_train__, y_train),  
        partition_random_seed=0, 
        fold_count=5
    )

    # Update best logloss
    mean_logloss = np.min(cv_results['Logloss_test_avg'])
    boost_rounds = len(cv_results['Logloss_test_avg'])
    print("\tLogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = eta
        best_rounds = boost_rounds

print("Best params: {}, Logloss: {}, Rounds: {}".format(best_params, min_logloss, best_rounds))

CV with eta=0.05
	Logloss 0.296545588394 for 1020 rounds
Best params: 0.05, Logloss: 0.296545588394, Rounds: 1020


In [26]:
params_cat['iterations'] = 533
params_cat['learning_rate'] = 0.1

In [27]:
cat = CatBoostClassifier(**params_cat)

cat.fit(x_train__, y_train)

0:	learn: 0.6111281	total: 224ms	remaining: 1m 59s
1:	learn: 0.5465069	total: 456ms	remaining: 2m
2:	learn: 0.5090004	total: 577ms	remaining: 1m 41s
3:	learn: 0.4662903	total: 820ms	remaining: 1m 48s
4:	learn: 0.4340754	total: 1.04s	remaining: 1m 50s
5:	learn: 0.4128742	total: 1.28s	remaining: 1m 52s
6:	learn: 0.3941823	total: 1.52s	remaining: 1m 53s
7:	learn: 0.3831947	total: 1.76s	remaining: 1m 55s
8:	learn: 0.3723913	total: 2.01s	remaining: 1m 57s
9:	learn: 0.3628061	total: 2.27s	remaining: 1m 58s
10:	learn: 0.3553447	total: 2.52s	remaining: 1m 59s
11:	learn: 0.3507091	total: 2.78s	remaining: 2m
12:	learn: 0.3453069	total: 3.03s	remaining: 2m 1s
13:	learn: 0.3413203	total: 3.29s	remaining: 2m 1s
14:	learn: 0.3376049	total: 3.54s	remaining: 2m 2s
15:	learn: 0.3357743	total: 3.78s	remaining: 2m 2s
16:	learn: 0.3333883	total: 4.04s	remaining: 2m 2s
17:	learn: 0.3304876	total: 4.29s	remaining: 2m 2s
18:	learn: 0.3289273	total: 4.54s	remaining: 2m 2s
19:	learn: 0.3270711	total: 4.82s	rem

159:	learn: 0.2920381	total: 40.3s	remaining: 1m 33s
160:	learn: 0.2919825	total: 40.5s	remaining: 1m 33s
161:	learn: 0.2918649	total: 40.8s	remaining: 1m 33s
162:	learn: 0.2918418	total: 41s	remaining: 1m 33s
163:	learn: 0.2917334	total: 41.3s	remaining: 1m 32s
164:	learn: 0.2916582	total: 41.5s	remaining: 1m 32s
165:	learn: 0.2915894	total: 41.8s	remaining: 1m 32s
166:	learn: 0.2915015	total: 42s	remaining: 1m 32s
167:	learn: 0.2914062	total: 42.3s	remaining: 1m 31s
168:	learn: 0.2913199	total: 42.5s	remaining: 1m 31s
169:	learn: 0.2912536	total: 42.8s	remaining: 1m 31s
170:	learn: 0.2911828	total: 43.1s	remaining: 1m 31s
171:	learn: 0.2909550	total: 43.4s	remaining: 1m 31s
172:	learn: 0.2909049	total: 43.6s	remaining: 1m 30s
173:	learn: 0.2907941	total: 43.9s	remaining: 1m 30s
174:	learn: 0.2906935	total: 44.2s	remaining: 1m 30s
175:	learn: 0.2906359	total: 44.4s	remaining: 1m 30s
176:	learn: 0.2905220	total: 44.6s	remaining: 1m 29s
177:	learn: 0.2904600	total: 44.9s	remaining: 1m 2

315:	learn: 0.2813414	total: 1m 22s	remaining: 56.9s
316:	learn: 0.2812428	total: 1m 23s	remaining: 56.7s
317:	learn: 0.2812041	total: 1m 23s	remaining: 56.4s
318:	learn: 0.2811285	total: 1m 23s	remaining: 56.2s
319:	learn: 0.2810411	total: 1m 24s	remaining: 55.9s
320:	learn: 0.2809743	total: 1m 24s	remaining: 55.7s
321:	learn: 0.2808922	total: 1m 24s	remaining: 55.4s
322:	learn: 0.2807952	total: 1m 24s	remaining: 55.2s
323:	learn: 0.2807237	total: 1m 25s	remaining: 54.9s
324:	learn: 0.2806704	total: 1m 25s	remaining: 54.7s
325:	learn: 0.2806008	total: 1m 25s	remaining: 54.4s
326:	learn: 0.2805077	total: 1m 25s	remaining: 54.1s
327:	learn: 0.2804466	total: 1m 26s	remaining: 53.9s
328:	learn: 0.2803986	total: 1m 26s	remaining: 53.6s
329:	learn: 0.2803382	total: 1m 26s	remaining: 53.4s
330:	learn: 0.2802612	total: 1m 27s	remaining: 53.2s
331:	learn: 0.2801919	total: 1m 27s	remaining: 52.9s
332:	learn: 0.2801523	total: 1m 27s	remaining: 52.7s
333:	learn: 0.2800554	total: 1m 27s	remaining:

471:	learn: 0.2727012	total: 2m 5s	remaining: 16.2s
472:	learn: 0.2726438	total: 2m 5s	remaining: 15.9s
473:	learn: 0.2725981	total: 2m 5s	remaining: 15.7s
474:	learn: 0.2725434	total: 2m 6s	remaining: 15.4s
475:	learn: 0.2725035	total: 2m 6s	remaining: 15.1s
476:	learn: 0.2724273	total: 2m 6s	remaining: 14.9s
477:	learn: 0.2723953	total: 2m 7s	remaining: 14.6s
478:	learn: 0.2723467	total: 2m 7s	remaining: 14.4s
479:	learn: 0.2723286	total: 2m 7s	remaining: 14.1s
480:	learn: 0.2722736	total: 2m 7s	remaining: 13.8s
481:	learn: 0.2722390	total: 2m 8s	remaining: 13.6s
482:	learn: 0.2721467	total: 2m 8s	remaining: 13.3s
483:	learn: 0.2720939	total: 2m 8s	remaining: 13s
484:	learn: 0.2720831	total: 2m 8s	remaining: 12.8s
485:	learn: 0.2720432	total: 2m 9s	remaining: 12.5s
486:	learn: 0.2720041	total: 2m 9s	remaining: 12.2s
487:	learn: 0.2719589	total: 2m 9s	remaining: 12s
488:	learn: 0.2719442	total: 2m 9s	remaining: 11.7s
489:	learn: 0.2719059	total: 2m 10s	remaining: 11.4s
490:	learn: 0.2

<catboost.core._CatBoostBase at 0xfdc7240>

In [28]:
x_test__ = x_test_.fillna(-999)

submission = pd.DataFrame()
submission['ID_CORRELATIVO'] = x_test__.index
submission['ATTRITION'] = cat.predict_proba(x_test__)[:,1]
submission.to_csv('./data/submission15_CAT.csv', index=False)

## Modelado requerimientos

In [217]:
x_req = df_train_req.join(y_train).drop('ATTRITION', axis=1)
y_req = df_train_req.join(y_train)['ATTRITION']
x_req.fillna(-999, inplace=True)

In [218]:
x_train_req, x_test_req, y_train_req, y_test_req = train_test_split(x_req, y_req, test_size=.10, random_state=1234)

In [219]:
cat_feat_index = np.where(x_req.dtypes != np.int64)[0]

cat = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6)
cat.fit(x_train_req, y_train_req, cat_features=cat_feat_index, eval_set=(x_test_req, y_test_req))

0:	learn: 0.6044600	test: 0.6031466	best: 0.6031466 (0)	total: 113ms	remaining: 11.2s
1:	learn: 0.5360142	test: 0.5335257	best: 0.5335257 (1)	total: 154ms	remaining: 7.53s
2:	learn: 0.4833144	test: 0.4797636	best: 0.4797636 (2)	total: 211ms	remaining: 6.82s
3:	learn: 0.4425946	test: 0.4381694	best: 0.4381694 (3)	total: 287ms	remaining: 6.89s
4:	learn: 0.4113655	test: 0.4060800	best: 0.4060800 (4)	total: 328ms	remaining: 6.22s
5:	learn: 0.3870200	test: 0.3810510	best: 0.3810510 (5)	total: 455ms	remaining: 7.12s
6:	learn: 0.3683349	test: 0.3616980	best: 0.3616980 (6)	total: 545ms	remaining: 7.24s
7:	learn: 0.3538443	test: 0.3465183	best: 0.3465183 (7)	total: 621ms	remaining: 7.14s
8:	learn: 0.3425319	test: 0.3346711	best: 0.3346711 (8)	total: 746ms	remaining: 7.54s
9:	learn: 0.3337013	test: 0.3253680	best: 0.3253680 (9)	total: 811ms	remaining: 7.3s
10:	learn: 0.3265408	test: 0.3178328	best: 0.3178328 (10)	total: 898ms	remaining: 7.26s
11:	learn: 0.3211377	test: 0.3120240	best: 0.3120240 

95:	learn: 0.2972434	test: 0.2859774	best: 0.2859765 (94)	total: 8.42s	remaining: 351ms
96:	learn: 0.2971881	test: 0.2859623	best: 0.2859623 (96)	total: 8.54s	remaining: 264ms
97:	learn: 0.2971674	test: 0.2859644	best: 0.2859623 (96)	total: 8.66s	remaining: 177ms
98:	learn: 0.2971250	test: 0.2859578	best: 0.2859578 (98)	total: 8.8s	remaining: 88.8ms
99:	learn: 0.2970371	test: 0.2859411	best: 0.2859411 (99)	total: 8.99s	remaining: 0us

bestTest = 0.2859410566
bestIteration = 99



<catboost.core._CatBoostBase at 0xb492b70>

In [57]:
from catboost import cv, Pool

params ={
    'depth': 6,
    'learning_rate': .1,
    'iterations': 100,
    'loss_function': 'Logloss'
}

cv(params, Pool(x_train_req, y_train_req, cat_features=cat_feat_index),  partition_random_seed=1234, fold_count=5)

defaultdict(list,
            {'Logloss_test_avg': [0.6044590655482499,
              0.5358769341370746,
              0.4831631361206874,
              0.44266836665059667,
              0.41130337126945904,
              0.387199392487251,
              0.36854173334588264,
              0.35395704295215674,
              0.34267024847766736,
              0.33383455044077237,
              0.3268189790347938,
              0.32134512608824817,
              0.3170148265141174,
              0.31358654291561605,
              0.3108858404954843,
              0.3087493540895614,
              0.30703190890868537,
              0.3056071108638281,
              0.3044784752974292,
              0.30361880765614996,
              0.30287648987621496,
              0.30227719167405254,
              0.30180587338820675,
              0.30143048209180734,
              0.3011080926921161,
              0.30076702870558575,
              0.30057768507970434,
              0.3003539898528

In [305]:
df_test_req['ATTRITION_REQ'] = cat.predict_proba(df_test_req)[:, 1]

df_test_req_ = df_test_req.groupby(by=df_test_req.index)['ATTRITION_REQ'].mean()

new_submission = pd.DataFrame(index=x_test_.index)
new_submission['ATT'] = y_pred
new_submission.head()

Unnamed: 0_level_0,ATT
ID_CORRELATIVO,Unnamed: 1_level_1
47411,0.291465
39861,0.269891
38898,0.028191
50927,0.019253
32969,0.470753


In [306]:
df_test_req_ = new_submission.join(df_test_req_)
df_test_req_['ATTRITION'] = df_test_req_.apply(lambda x: np.mean(x), axis=1)

In [307]:
df_test_req_.reset_index(inplace=True)
df_test_req_[['ID_CORRELATIVO', 'ATTRITION']].to_csv('./data/submission11.csv', index=False)