In [None]:
# https://github.com/catboost/tutorials
# https://github.com/hyperopt/hyperopt/wiki/FMin
# https://www.kaggle.com/felipeleiteantunes/xgboost-hyperopt-cv-via-python-api

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.append('scripts')
from common import *
from global_common import GaussRankScaler
import xgboost as xgb
# from load_and_proccess_data import *

In [3]:
from common import *

def load_data():
    # Читаем данные
    path_to_data = Path('../data')
    train_df = pd.read_csv(path_to_data/'train_music.csv')
    train_y = train_df['target']
    del train_df['target']
    test_df  = pd.read_csv(path_to_data/'test_music.csv')

    # Объединяем выборки для обработки переменных, запоминаем индексы чтобы потом разделить назад
    train_idx = train_df.index[-1]
    merged = pd.concat([train_df, test_df], axis=0)
    
    # Отдельно выделяем бинарные в тип bool
    bool_columns = ['tp_flag', 'block_flag', 'is_obl_center', 'is_my_vf']
    print(f'Из целочисленныъ - {len(bool_columns)} бинарных.')

    # Целочисленные переменные
    int_columns = [o for o in merged.columns for crit in ['flag', 'is', 'count'] if crit in o]
    int_columns += ['sim_count','device_type','manufacturer_category','os_category','tp_flag','days_exp', 'paym_last_days']
    _ = [int_columns.remove(o) for o in bool_columns]
    print(f'Целочисленных переменных  : {len(int_columns)}')

    # Переменные с плавающей точкой
    criterion_for_float_columns = ['data_type', 'rr', 'vol', 'cost', 'dur', 'sum', 'part', 'clc', 'lt', 'brnd']
    float_columns = [o for o in merged.columns for crit in criterion_for_float_columns if crit in o]
    print(f'Переменных с плавающей точкой : {len(float_columns)}')

    # Если в значениях переменной есть 0, тогда пропуски заполняет -1. Если нет 0, тогда заполняем нулем.
    merged[int_columns] = merged[int_columns].apply(lambda x: x.fillna(-1) if 0 in x.values else x.fillna(0))
    merged[int_columns] = merged[int_columns].apply(lambda x: x.astype('int'))
    merged[bool_columns] = merged[bool_columns].apply(lambda x: x.astype('bool'))
    merged[float_columns] = merged[float_columns].apply(lambda x: x.fillna(x.median()) if 0 in x.values else x.fillna(0))
    merged[float_columns] = merged[float_columns].apply(lambda x: x.astype('float'))
    print(f'Переменным присвоен соответствующий тип.')
    
    cat_features = ['os_category', 'device_type', 'service_7_flag_m1', 
                       'service_7_flag_m2', 'service_7_flag_m3', 'manufacturer_category']
    
    merged = pd.concat([
        merged, 
        pd.get_dummies(merged['os_category'], prefix='os_category'), 
        pd.get_dummies(merged['device_type'], prefix='device_type'), 
        pd.get_dummies(merged['service_7_flag_m1'], prefix='service_7_flag_m1'), 
        pd.get_dummies(merged['service_7_flag_m2'], prefix='service_7_flag_m2'), 
        pd.get_dummies(merged['service_7_flag_m3'], prefix='service_7_flag_m3'),
        pd.get_dummies(merged['manufacturer_category'], prefix='manufacturer_category'),        
    ], axis=1)
    
    merged.drop(cat_features, axis=1, inplace=True)

    # Разделяем обработанные обучающую и тестовую выборки
    train = merged.iloc[:train_idx+1, :]
    train['target'] = train_y
    test = merged.iloc[train_idx+1:, :]
    del merged
    del train_y
    
    return train, test, {
        'int_cols'  : int_columns,
        'bool_cols' : bool_columns,
        'float_cols': float_columns,
}

train, test, columns_type_dict = load_data()

Из целочисленныъ - 4 бинарных.
Целочисленных переменных  : 232
Переменных с плавающей точкой : 225
Переменным присвоен соответствующий тип.


In [29]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def score(params):
    print("Training with params:")
    print(params)
    params['max_depth'] = int(params['max_depth'])
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(params=params, 
                      dtrain=dtrain, 
                      num_boost_round=5000,
                      evals=watchlist,
                      verbose_eval=False,
                      early_stopping_rounds=50,
                      )
    nb_trees = model.best_ntree_limit    
    y_pred = model.predict(X_test, ntree_limit=nb_trees)
    test_loss = sklearn.metrics.roc_auc_score(y_test, y_pred)
    return{'loss':test_loss, 'status': STATUS_OK }


def optimize(trials):
    space = {
             'num_boost_round': 400,
             'learning_rate': hp.quniform('eta', 0.005, 0.05, 0.005),
             'max_depth': hp.quniform('max_depth', 3, 20, 1),
             'min_child_weight': hp.quniform('min_child_weight', 1, 12, 1),
             'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma': hp.quniform('gamma', 0.3, 1, 0.01),
             'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 1, 0.05),
             'colsample_bylevel': hp.uniform('colsample_bylevel', 0.70, 1.0),
             'num_class' : 1,
             'eval_metric': 'auc',
             'objective': 'binary:logistic',
             'nthread' : 10,
             'silent' : 1,
             'scale_pos_weight' : 1,
             }
    
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=500)
    return best

In [30]:
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(['target', 'id'], axis=1), train['target'].values, test_size=0.15, random_state=SEED)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.15, random_state=SEED)

In [4]:
# Минимум больше 84
trials = Trials()
best_params = optimize(trials)
best_params

# Считаем модели

In [5]:
# DEFAULT DATA! Withoot OHE
# Training with params:
# {'colsample_bylevel': 0.8292364340638145, 'colsample_bytree': 0.45, 'eval_metric': 'auc', 
# 'gamma': 0.73, 'learning_rate': 0.03, 'max_depth': 11.0, 'min_child_weight': 8.0, 'nthread': 10, 
# 'num_boost_round': 5000, 'num_class': 1, 'objective': 'binary:logistic', 'scale_pos_weight': 1, 'silent': 1, 
# 'subsample': 0.9500000000000001}
# 	Score 0.8404299910529781

# BEST XGBoost_3
# Training with params:
# {'colsample_bytree': 0.45, 'eval_metric': 'auc', 'gamma': 0.93, 'learning_rate': 0.025, 
# 'max_depth': 13.0, 'min_child_weight': 7.0, 'nthread': 10, 'num_class': 1, 'num_round': 400, 
# 'objective': 'binary:logistic', 'silent': 1, 'subsample': 0.9}
# 	Score 0.8383544505292855

# TOP 0.84616, WITH OHE for category
# cat_features = ['os_category', 'device_type', 'service_7_flag_m1', 
#                        'service_7_flag_m2', 'service_7_flag_m3', 'manufacturer_category']

best_params = {}
# best_params['colsample_bylevel'] = 0.8292364340638145
best_params['colsample_bytree'] = 0.45
best_params['eta'] = 0.025
best_params['gamma'] = 0.93
best_params['max_depth'] = 13
best_params['min_child_weight'] = 7
best_params['subsample'] = 0.9
best_params['num_class'] = 1
best_params['eval_metric'] = 'auc'
# best_params['scale_pos_weight'] = 1
best_params['objective'] = 'binary:logistic'
best_params['nthread'] = 12
best_params['silent'] = 1

In [6]:
dtrain = xgb.DMatrix(train.drop(['target', 'id'], axis=1), train['target'].values)

In [7]:
%%time
xgbCvResult = xgb.cv(best_params, 
                     dtrain,  
                     num_boost_round=1500,  
                     nfold=7, 
                     early_stopping_rounds=150, 
                     seed=100,
                    )

CPU times: user 2h 26min 56s, sys: 33.5 s, total: 2h 27min 29s
Wall time: 12min 33s


In [8]:
# 326 	0.996390 	0.000147 	0.833989 	0.004800
xgbCvResult

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.808984,0.002826,0.756903,0.009408
1,0.83698,0.001207,0.769166,0.006315
2,0.848824,0.003935,0.776512,0.003926
3,0.859417,0.004975,0.78418,0.005238
4,0.866956,0.003073,0.790133,0.005028
5,0.871797,0.003885,0.792043,0.004343
6,0.875307,0.003737,0.792691,0.004477
7,0.877652,0.003804,0.794831,0.005728
8,0.881605,0.003307,0.797952,0.00543
9,0.883858,0.003784,0.800211,0.005444


In [38]:
best_num_round = np.argmax(xgbCvResult['test-auc-mean'])
best_num_round

326

In [39]:
bestXgb = xgb.train(best_params, dtrain, num_boost_round=best_num_round)

In [46]:
# bestXgb.save_model('best_xgb')

In [40]:
dtest = xgb.DMatrix(test.drop(['id'], axis=1))

In [41]:
xgboost_predict_proba = bestXgb.predict(dtest)

In [42]:
xgboost_predict_proba

array([0.005281, 0.013559, 0.0135  , 0.006801, ..., 0.07813 , 0.037337, 0.031527, 0.024254], dtype=float32)

In [43]:
# Prepare submission
subm = pd.DataFrame()
subm['id'] = test.id.values
subm['prediction'] = xgboost_predict_proba
subm.to_csv('submissions/XGBoost_5.csv', index=False)

In [None]:


models = []

np.random.seed(42)
for i in tqdm.tqdm_notebook(range(100)):
    X_train = X.copy()
    for col in columns_with_nulls:
        _idx = np.random.choice(X_train.index, size=X_train.shape[0]//10, replace=False)
        X_train.loc[_idx, col] = -999
    
    model = CatBoostClassifier(
        iterations=1000,
        depth=6,
        thread_count=12,
        border_count=128,
        learning_rate=0.015,
        random_seed=np.random.randint(10**10),
        logging_level='Silent'
    )
    
    model.fit(X_train, y)
    models.append(model.copy())



In [None]:


predictions = []

for i in range(10):
    clf = CatBoostClassifier(learning_rate=0.1, iterations=1000, random_seed=i, logging_level='Silent')
    clf.fit(train_df, labels, cat_features=cat_features_ids)
    predictions.append(clf.predict_proba(test_df)[:,1])

