А теперь про короля - LightGBM. Загрузил данные, запустил в ГБМ, покрутил немного параметры.
Теперь нужно провести его тонкую настройку.

In [13]:
import sys; sys.path.append('scripts')
from common import *
from global_common import *

import gc
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

from tqdm import tqdm
import lightgbm as lgb

In [14]:
# Читаем данные
path_to_data = Path('../data')
train_df = pd.read_csv(path_to_data/'train_music.csv')
train_y = train_df['target']
del train_df['target']

test_df  = pd.read_csv(path_to_data/'test_music.csv')

sample_submission_df = pd.read_csv(path_to_data/'sample_submission_music.csv')
print(f'Shape of train {train_df.shape}\nShape of test {test_df.shape}\n')

Shape of train (70134, 460)
Shape of test (30001, 460)



In [16]:
# Объединяем выборки для обработки переменных, запоминаем индексы чтобы потом разделить назад
train_idx = train_df.index[-1]
merged = pd.concat([train_df, test_df], axis=0)

# Бинарный признка целочисленное значение, пропуски = -1, не эмбединги
binary_columns = [o for o in merged.columns if 'flag' in o or 'is' in o]
binary_columns.remove('tp_flag')
print(f'Бинарных признаков - {len(binary_columns)}')

# Для категориальных признаков используем -1. Кодируем в эмбединги. Целочисленные значения.
cat_names = ['sim_count', 'device_type', 'manufacturer_category', 'os_category', 'tp_flag']
print(f'Категориальных признаков - {len(cat_names)}')

# Флоат должны использовать медиану для замещения пропусков
float_columns = list(set(merged.columns).difference(set(binary_columns+cat_names)))
float_columns.remove('id')
print(f'Числовых признаков - {len(float_columns)}')

# Замещаем пропущенные значения в соответствии с типом данных
merged[binary_columns] = merged[binary_columns].apply(lambda x: x.fillna(-1))
merged[cat_names] = merged[cat_names].apply(lambda x: x.fillna(-1))
merged[float_columns] = merged[float_columns].apply(lambda x: x.fillna(x.median()))
assert merged.isna().sum().sum() == 0, 'Buddy, slow down!'

# Присваивает тип для каждой из переменных
merged[binary_columns] = merged[binary_columns].apply(lambda x: x.astype('int'))
merged[cat_names] = merged[cat_names].apply(lambda x: x.astype('int'))
merged[float_columns] = merged[float_columns].apply(lambda x: x.astype('float'))

Бинарных признаков - 15
Категориальных признаков - 5
Числовых признаков - 439


In [18]:
# Разделяем обработанные обучающую и тестовую выборки
train = merged.iloc[:train_idx+1, :]
train['target'] = train_y
test = merged.iloc[train_idx+1:, :]
del merged
del train_y

# Указываем для модели категориальные (для кодировки в эмб) и числовые
dep_var = 'target'
cont_names = float_columns
cat_names = cat_names

In [34]:
# Prepare dataset for training
cols_to_drop = [
    'id',
    'target',
]

categorical = cat_names

X = train.drop(cols_to_drop, axis=1, errors='ignore')
y = train.target.values

id_test = test.id.values
X_test = test.drop(cols_to_drop[0], axis=1, errors='ignore')


print('train.shape = {}, test.shape = {}'.format(train.shape, test.shape))

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 5,
    'learning_rate': 0.01, 
    'verbose': -1,
    'num_threads': 2,
#     'scale_pos_weight' : int(train.target.mean()*10),
    'is_unbalance' : 'true',
}

# Build the model
cnt = 0
p_buf = []
n_splits = 5
n_repeats = 1
kf = StratifiedKFold(
    n_splits=n_splits, 
    random_state=0)
err_buf = []   

n_features = X.shape[1]

for train_index, valid_index in kf.split(X, y):
    print('Fold {}/{}*{}'.format(cnt + 1, n_splits, n_repeats))
    params = lgb_params.copy() 
    
    lgb_train = lgb.Dataset(
        X.iloc[train_index], 
        y[train_index], 
        categorical_feature=categorical,
    
        )
    lgb_train.raw_data = None

    lgb_valid = lgb.Dataset(
        X.iloc[valid_index], 
        y[valid_index],
        categorical_feature=categorical,
        )
    lgb_valid.raw_data = None

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=100000,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=150, 
        verbose_eval=100,
    )

    if cnt == 0:
        importance = model.feature_importance()
        model_fnames = model.feature_name()
        tuples = sorted(zip(model_fnames, importance), key=lambda x: x[1])[::-1]
        tuples = [x for x in tuples if x[1] > 0]
        print('Important features:')
        for i in range(60):
            if i < len(tuples):
                print(tuples[i])
            else:
                break

        del importance, model_fnames, tuples

    p = model.predict(X.iloc[valid_index], num_iteration=model.best_iteration)
    err = roc_auc_score(y[valid_index], p)

    print('{} auc: {}'.format(cnt + 1, err))

    p = model.predict(X_test, num_iteration=model.best_iteration)
    if len(p_buf) == 0:
        p_buf = np.array(p, dtype=np.float16)
    else:
        p_buf += np.array(p, dtype=np.float16)
    err_buf.append(err)


    cnt += 1
    # if cnt > 0: # Comment this to run several folds
    #     break

    del model, lgb_train, lgb_valid, p
    gc.collect

err_mean = np.mean(err_buf)
err_std = np.std(err_buf)
print('auc = {:.6f} +/- {:.6f}'.format(err_mean, err_std))

preds = p_buf/cnt

train.shape = (70134, 461), test.shape = (30001, 460)
Fold 1/5*1
Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.833433	valid_1's auc: 0.81129
[200]	training's auc: 0.851859	valid_1's auc: 0.823689
[300]	training's auc: 0.866192	valid_1's auc: 0.832551
[400]	training's auc: 0.877289	valid_1's auc: 0.836772
[500]	training's auc: 0.887401	valid_1's auc: 0.839853
[600]	training's auc: 0.894883	valid_1's auc: 0.841458
[700]	training's auc: 0.901537	valid_1's auc: 0.842491
[800]	training's auc: 0.907523	valid_1's auc: 0.842797
[900]	training's auc: 0.91275	valid_1's auc: 0.842801
[1000]	training's auc: 0.917337	valid_1's auc: 0.842913
[1100]	training's auc: 0.9219	valid_1's auc: 0.84311
[1200]	training's auc: 0.925725	valid_1's auc: 0.843207
[1300]	training's auc: 0.929416	valid_1's auc: 0.843126
Early stopping, best iteration is:
[1243]	training's auc: 0.927455	valid_1's auc: 0.843341
Important features:
('manufacturer_category', 1861)
('data_type_3_m

In [None]:
# Stratified Split
# 5 auc: 0.8265728454896974
# auc = 0.837760 +/- 0.009300

In [None]:
# scale_pos_weight : 99 + stratify
# 5 auc: 0.8009548197291523
# auc = 0.810203 +/- 0.011326

In [None]:
# is_inbalanceb  + stratify
# 5 auc: 0.8211641682998747
# auc = 0.833885 +/- 0.010785  LB = 0.72

In [None]:
# is_inbalanceb  + stratify + categorical
# 5 auc: 0.8226285930575419
# auc = 0.834418 +/- 0.010447

In [35]:
# Prepare submission
subm = pd.DataFrame()
subm['id'] = id_test
subm['prediction'] = preds
subm.to_csv('submissions/lightgbm2.csv', index=False)

# Finetune LighGBM

In [None]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
def score(params):
    from sklearn.metrics import log_loss
    print("Training with params:")
    print(params)
    params['max_depth'] = int(params['max_depth'])
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    model = xgb.train(params, dtrain, params['num_round'])
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 7))
    score = log_loss(y_test, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}

In [None]:
def optimize(trials):
    space = {
             'num_round': 100,
             'learning_rate': hp.quniform('eta', 0.005, 0.05, 0.005),
             'max_depth': hp.quniform('max_depth', 3, 14, 1),
             'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
             'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma': hp.quniform('gamma', 0.5, 1, 0.01),
             'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 1, 0.05),
             'num_class' : 7,
             'eval_metric': 'merror',
             'objective': 'multi:softprob',
             'nthread' : 4,
             'silent' : 1
             }
    
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=10)
    return best

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df, y, test_size=0.3, random_state=17)

In [None]:
trials = Trials()
best_params = optimize(trials)
best_params