In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def data_inspection(data):
	df_shop=data.copy()
	nunique_list=[]
	miss_list=[]
	type_list=[]
	for col in df_shop.columns:
		nunique_list.append(df_shop[col].nunique())
		miss_list.append(df_shop[col].isnull().sum())
		type_list.append(df_shop[col].dtypes)
	pd.set_option('display.max_rows',300)
	feat_labels=df_shop.columns
	summary=np.array([nunique_list, miss_list, type_list])
	columns=['nunique', 'missing', 'type']
	results=pd.DataFrame(summary.T,index=feat_labels,columns=columns)
	print('Memory usage: ',round(data.memory_usage(index=True,deep=False)))
	print('The number of raws: ', data.shape[0])
	print('The number of columns: ', data.shape[1])
	return results

In [3]:
from boost_preprocessing import BoostARoota
br=BoostARoota(clf=None,
              cutoff=4,
              iters=10,
              max_rounds=100,
              delta=0.1,
              silent=False,
              metric='mae')

# Разбор класса BoostARoota

In [4]:
# импортируем необходимые библиотеки и классы
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
import operator
import warnings

In [5]:
# выполняем разбор класса BoostARoota

# для работы класса внутри конвейеров потребуется BaseEstimator
class BoostARoota(BaseEstimator, TransformerMixin):
    # все параметры для инициализации публичных атрибутов
    # задаем в методе __init__
    def __init__(self, metric=None, clf=None, cutoff=4, iters=10, 
                 max_rounds=100, delta=0.1, silent=False):
        # оптимизируемая метрика
        self.metric = metric
        # алгоритм на основе деревьев для отбора признаков
        # (по умолчанию XGBoost, clf=None)
        self.clf = clf
        # порог отсечения для удаления признаков, исходя из их важности. 
        # По умолчанию равен 4. Большие значения будут более консервативными - 
        # если установить значение параметра слишком высоким, в конечном итоге 
        # может быть удалено незначительное количество признаков. Маленькие
        # значения будут более агрессивными. Значение должно быть выше нуля 
        # (может быть значение с плавающей точкой).
        self.cutoff = cutoff
        # количество итераций (запусков алгоритма XGBoost) для усреднения важности
        # признаков. По умолчанию равен 10. Не рекомендуется устанавливать значение 
        # параметра равным 1 (чем меньше итераций, тем выше случайная изменчивость 
        # оценок важностей и быстрее скорость вычислений, так как алгоритм запустит 
        # XGBoost меньшее количество раз). Параметр масштабируется линейно: при iters=4 
        # требуется в 2 раза больше времени, чем при iters=2, и в 4 раза больше времени, 
        # чем при iters=1.
        self.iters = iters
        # количество раундов работы основного алгоритма BoostARoota. Каждый раунд
        # устраняет все больше и больше признаков. Значение по умолчанию 
        # установлено достаточно высоким, т.к. все равно при нормальных 
        # обстоятельствах такое количество рандов не потребуется. Если вам 
        # кажется, что переменные удаляются слишком агрессивно, вы можете 
        # установить более низкое значение.
        self.max_rounds = max_rounds
        # доля удаляемых признаков для перехода к следующему раунду
        self.delta = delta
        # вывод сообщений о ходе работы
        self.silent = silent
        # переменные, отобранные алгоритмом
        self.keep_vars_ = None
        
        # выдать ошибки, если для параметров заданы некорректные значения
        if (metric is None) and (clf is None):
            # сообщение о том, что нужно задать либо метрику, либо алгоритм отбора 
            raise ValueError("you must enter one of metric or clf as arguments")
        if cutoff <= 0:
            # сообщение о том, что cutoff должен быть больше 0
            raise ValueError("cutoff should be greater than 0. You entered" + str(cutoff))
        if iters <= 0:
            # сообщение о том, что iters должен быть больше 0
            raise ValueError("iters should be greater than 0. You entered" + str(iters))
        if (delta <= 0) | (delta > 1):
            # сообщение о том, что значение delta должно быть больше 0, но не больше 1 
            raise ValueError("delta should be between 0 and 1, was " + str(delta))

        # выдать предупреждения при измененных параметрах
        if (metric is not None) and (clf is not None):
            # предупреждение о том, что изменены метрика и алгоритм отбора
            warnings.warn("You entered values for metric and clf, defaulting to clf and ignoring metric")
        if delta < 0.02:
            # предупреждение о том, что при delta < 0.02 алгоритм может не сойтись
            warnings.warn("WARNING: Setting a delta below 0.02 may not converge on a solution.")
        if max_rounds < 1:
            # предупреждение о том, что если значение max_rounds установлено ниже 1, 
            # это значение будет автоматически задано равным 1
            warnings.warn("WARNING: Setting max_rounds below 1 will automatically be set to 1.")
            
    # метод .fit() выполняет обучение - отбор признаков
    def fit(self, x, y):
        # вызов основной функции, запускающей алгоритм BoostARoota,
        # возвращаем имена переменных, которые нужно сохранить
        self.keep_vars_ = _BoostARoota(x, y,
                                       metric=self.metric,
                                       clf=self.clf,
                                       cutoff=self.cutoff,
                                       iters=self.iters,
                                       max_rounds=self.max_rounds,
                                       delta=self.delta,
                                       silent=self.silent)
        return self

    # метод .transform() формирует новый массив -
    # массив отобранных признаков
    def transform(self, x):
        # если переменная keep_vars_ еще не определена и был вызван метод .transform()
        if self.keep_vars_ is None:
            # выводится сообщение о том, что сначала нужно применить метод .fit()
            raise ValueError("You need to fit the model first")
        # возвращает массив данных с теми признаками, которые
        # были отобраны алгоритмом BoostARoota в методе .fit()
        return x[self.keep_vars_]

# функция добавления "теневых" признаков. Она удваивает ширину набора данных, создав
# копии всех признаков исходного набора. Случайным образом перемешивает значения новых 
# признаков. Эти дублированные и перемешанные признаки называются «теневыми».    
def _create_shadow(x_train):
    """
    :параметр x_train: датафрейм данных для создания 
        на его основе "теневых" признаков
    :возвращает: датафрейм данных удвоенной ширины и имена 
        "теневых" признаков для последующего удаления
    """
    # создаем копию обучающего массива признаков
    x_shadow = x_train.copy()
    # в цикле проходим по всем "теневым" признакам
    # и перемешиваем их значения
    for c in x_shadow.columns:
        np.random.shuffle(x_shadow[c].values)
        
    # переименовываем "теневые" признаки
    shadow_names = ["ShadowVar" + str(i + 1) for i in range(x_train.shape[1])]
    x_shadow.columns = shadow_names
    # объединяем набор из исходных и набор из "теневых" признаков 
    # в один новый датафрейм удвоенной ширины
    new_x = pd.concat([x_train, x_shadow], axis=1)
    # возвращаем датафрейм удвоенной ширины из исходных и "теневых" признаков
    # и список имен "теневых" признаков для последующего удаления
    return new_x, shadow_names

# функция вычисления важностей для отбора признаков 
# на основе алгоритма XGBoost
def _reduce_vars_xgb(x, y, metric, this_round, cutoff, n_iterations, delta, silent):
    """
    :параметр x: входной массив признаков - X
    :параметр y: зависимая переменная
    :параметр metric: оптимизируемая метрика в XGBoost
    :параметр this_round: номер текущего раунда, чтобы его можно было вывести на экран
    :возвращает: кортеж - критерий остановки и имена переменных, которые нужно сохранить
    """
    # если метрика для оптимизации - mlogloss, то задаем соответствующую
    # функцию потерь, оптимизируемую метрику, количество классов, "тихий"
    # режим для обучения модели XGBoost 
    if metric == 'mlogloss':
        param = {'objective': 'multi:softmax',
                 'eval_metric': 'mlogloss',
                 'num_class': len(np.unique(y)),
                 'silent': 1}
    else:
        # в противном случае оптимизируемой метрикой 
        # будет заданная метрика для оптимизации
        param = {'eval_metric': metric,
                 'silent': 1}
        
    # выполнение в цикле итераций обучения алгоритма XGBoost 
    # для усреднения важности признаков
    for i in range(1, n_iterations + 1):
        # создаем "теневые" признаки:
        # new_x - содержит датафрейм удвоенной ширины 
        # с исходными и "теневыми" предикторами
        # shadow_names - список имен "теневых" признаков 
        # для последующего удаления
        new_x, shadow_names = _create_shadow(x)
        # преобразовываем массив признаков и массив меток в объект DMatrix 
        dtrain = xgb.DMatrix(new_x, label=y)
        # обучаем модель XGBoost
        bst = xgb.train(param, dtrain, verbose_eval=False)
        # если это первая итерация
        if i == 1:
            # создаем датафрейм df со столбцом-списком признаков
            df = pd.DataFrame({'feature': new_x.columns})
            pass

        # получаем значение важности для каждого признака, по умолчанию
        # используется weight - простой показатель важности, который 
        # суммирует, сколько раз конкретный признак использовался 
        # в качестве предиктора разбиения в алгоритме XGBoost      
        importance = bst.get_fscore()
        # сортируем по значению важности
        importance = sorted(importance.items(), key=operator.itemgetter(1))
        # создаем датафрейм, содержащий названия предикторов и их важности
        df2 = pd.DataFrame(importance, columns=['feature', 'fscore'+str(i)])
        # нормируем значения важности
        df2['fscore'+str(i)] = df2['fscore'+str(i)] / df2['fscore'+str(i)].sum()
        # объединяем датафреймы df и df2, т.е. к df добавляется столбец со 
        # значениями важности, найденными в текущей i-ой итерации
        df = pd.merge(df, df2, on='feature', how='outer')
        # если не задан "тихий" режим, печатаем информацию
        # о текущем раунде и итерации
        if not silent:
            print("Round: ", this_round, " iteration: ", i)

    # в df добавляем усредненное значение важности по всем пройденным итерациям 
    df['Mean'] = df.mean(axis=1)
    # выполняем обратное разделение признаков на исходные и "теневые"
    real_vars = df[~df['feature'].isin(shadow_names)]
    shadow_vars = df[df['feature'].isin(shadow_names)]

    # вычисляем «порог отсечения»: среднее значение важности 
    # для всех «теневых» признаков, поделенное на значение 
    # cutoff (по умолчанию равно 4)
    mean_shadow = shadow_vars['Mean'].mean() / cutoff
    
    # удаляем признаки, средняя важность которых по результатам 
    # всех итераций меньше «порога отсечения»
    real_vars = real_vars[(real_vars.Mean > mean_shadow)]
    
    # проверяем критерий остановки
    # в основном мы хотим убедиться, что удаляем не менее 10% переменных, 
    # иначе следует остановиться
    if (len(real_vars['feature']) / len(x.columns)) > (1 - delta):
        criteria = True
    else:
        criteria = False
        
    # возвращаем критерий остановки и список оставшихся признаков
    return criteria, real_vars['feature']

# функция вычисления важностей для отбора признаков на основе алгоритма из 
# библиотеки sklearn, в котором поддерживается атрибут feature_importances_
def _reduce_vars_sklearn(x, y, clf, this_round, cutoff, n_iterations, delta, silent):  
    """
    :параметр x: входной массив признаков - X
    :параметр y: зависимая переменная
    :параметр clf: алгоритм из библиотеки sklearn на основе 
        деревьев решений, переданный пользователем
    :параметр this_round: номер текущего раунда, чтобы его можно было вывести на экран
    :возвращает: кортеж - критерий остановки и имена переменных, которые нужно сохранить    
    """
    # выполнение в цикле итераций обучения указанного алгоритма
    # для усреднения важности признаков
    for i in range(1, n_iterations+1):
        # создаем "теневые" признаки:
        # new_x - содержит датафрейм удвоенной ширины 
        # с исходными и "теневыми" предикторами
        # shadow_names - список имен "теневых" признаков 
        # для последующего удаления
        new_x, shadow_names = _create_shadow(x)
        # задали обучение модели sklearn
        clf = clf.fit(new_x, np.ravel(y))
        # если это первая итерация
        if i == 1:
            # создаем датафрейм df со столбцом-списком признаков
            df = pd.DataFrame({'feature': new_x.columns})
            # копируем его в датафрейм df2
            df2 = df.copy()
            pass

        try:
            # получаем значение важности для каждого признака
            # с помощью атрибута feature_importances_
            importance = clf.feature_importances_
            # добавляем в df2 значения важности, найденные в текущей итерации
            df2['fscore' + str(i)] = importance
        except ValueError:
            # выдается ошибка, если задан алгоритм, в котором 
            # нет атрибута feature_importances_
            print("this clf doesn't have the feature_importances_ method.  Only Sklearn tree based methods allowed")
        
        # нормируем значения важности
        df2['fscore'+str(i)] = df2['fscore'+str(i)] / df2['fscore'+str(i)].sum()
        # объединяем датафреймы df и df2, т.е. к df добавляется столбец со 
        # значениями важности, найденными в текущей i-ой итерации
        df = pd.merge(df, df2, on='feature', how='outer')
        # если не задан "тихий" режим, печатаем информацию
        # о текущем раунде и итерации
        if not silent:
            print("Round: ", this_round, " iteration: ", i)

    # в df добавляем усредненное значение важности по всем пройденным итерациям         
    df['Mean'] = df.mean(axis=1)
    # выполняем обратное разделение признаков на исходные и "теневые"
    real_vars = df[~df['feature'].isin(shadow_names)]
    shadow_vars = df[df['feature'].isin(shadow_names)]

    # вычисляем «порог отсечения»: среднее значение важности 
    # для всех «теневых» признаков, поделенное на значение 
    # cutoff (по умолчанию равно 4)
    mean_shadow = shadow_vars['Mean'].mean() / cutoff
    
    # удаляем признаки, средняя важность которых по результатам 
    # всех итераций меньше «порога отсечения»
    real_vars = real_vars[(real_vars.Mean > mean_shadow)]

    # проверяем критерий остановки
    # в основном мы хотим убедиться, что удаляем не менее 10% переменных, 
    # иначе следует остановиться
    if (len(real_vars['feature']) / len(x.columns)) > (1 - delta):
        criteria = True
    else:
        criteria = False

    # возвращаем критерий остановки и список оставшихся признаков
    return criteria, real_vars['feature']

# основная функция, запускающая алгоритм BoostARoota
def _BoostARoota(x, y, metric, clf, cutoff, iters, max_rounds, delta, silent):
    """
    Функция проходит цикл, ожидая изменения критерия остановки
    :параметр x: массив признаков X (если есть категориальные переменные,
        нужно выполнить дамми-кодирование)
    :параметр y: массив меток зависимой переменной
    :параметр metric: оптимизируемая метрика
    :возвращает: имена переменных, которые нужно сохранить
    """
    # создаем копию обучающего массива признаков
    new_x = x.copy()
    
    # выполняем цикл до тех пор, пока переменная crit не изменится
    # выставляем в ноль счетчик раундов
    i = 0
    while True: # внутри этого цикла мы уменьшаем набор данных на каждом раунде
        # увеличиваем счетчик раундов
        i += 1
        
        # если параметр clf задан по умолчанию, то для отбора
        # признаков используется алгоритм XGBoost
        if clf is None:
            # вызывается функция _reduce_vars_xgb, которая возвращает критерий 
            # остановки и уменьшенный на данном раунде список предикторов
            crit, keep_vars = _reduce_vars_xgb(new_x,
                                               y,
                                               metric=metric,
                                               this_round=i,
                                               cutoff=cutoff,
                                               n_iterations=iters,
                                               delta=delta,
                                               silent=silent)
            
        # в противном случае используется алгоритм из библиотеки 
        # sklearn, в котором есть атрибут feature_importances_
        else:
            # вызывается функция _reduce_vars_sklearn (алгоритм передается
            # через параметр clf), которая возвращает критерий остановки
            # и уменьшенный на данном раунде список предикторов
            crit, keep_vars = _reduce_vars_sklearn(new_x,
                                                   y,
                                                   clf=clf,
                                                   this_round=i,
                                                   cutoff=cutoff,
                                                   n_iterations=iters,
                                                   delta=delta,
                                                   silent=silent)
            
        # если критерий остановки принял значение True 
        # или достигнуто максимальное количество раундов
        if crit | (i >= max_rounds):
            break
            # то выйти из цикла и использовать keep_vars в качестве
            # итогового списка отобранных переменных
        # в противном случае
        else:
            # создаем копию массива из списка признаков keep_vars
            new_x = new_x[keep_vars].copy()
    # если режим не является "тихим"
    if not silent:
        # напечатать сообщение об успешном завершении работы алгоритма
        print("BoostARoota ran successfully! Algorithm went through ", i, " rounds.")
    return keep_vars

# Пример использования

In [7]:
# загружаем и смотрим данные
data = pd.read_csv('Data/bankloan.csv', sep=';', decimal=',')
data.head()

Unnamed: 0,age,job,debtinc,creddebt,othdebt,default
0,28,working - other,17.7,2.990592,4.797408,0
1,64,working - production,14.7,5.047392,12.004608,0
2,40,working - IT,4.8,1.042368,1.885632,0
3,30,working - IT,34.5,1.75122,7.56378,0
4,25,working - IT,22.4,0.75936,5.96064,1


## Добавляю случайный признак

In [8]:
arr=np.random.randint(0,3,data.shape[0])
strr='abc'

In [9]:
res=[strr[i] for i in arr]

In [10]:
data['new']=res


In [11]:
# формируем массив меток и массив признаков
y = data.pop('default')
# выполняем дамми-кодирование
X = pd.get_dummies(data)
# смотрим количество признаков
print(len(X.columns))
# выводим итоговый массив признаков
X.head()

12


Unnamed: 0,age,debtinc,creddebt,othdebt,job_civil service,job_own business,job_working - IT,job_working - other,job_working - production,new_a,new_b,new_c
0,28,17.7,2.990592,4.797408,0,0,0,1,0,0,1,0
1,64,14.7,5.047392,12.004608,0,0,0,0,1,0,0,1
2,40,4.8,1.042368,1.885632,0,0,1,0,0,0,1,0
3,30,34.5,1.75122,7.56378,0,0,1,0,0,0,0,1
4,25,22.4,0.75936,5.96064,0,0,1,0,0,0,0,1


In [12]:
# создаем обьект класса BoostARoota
br = BoostARoota(metric='logloss')

In [13]:
# обучаем
br.fit(X, y)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Round:  1  iteration:  1
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Round:  1  iteration:  2
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Round:  1  iteration:  3
Parameters: { silent } might not be used.

  This may not be accurate due to so

Round:  3  iteration:  6
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Round:  3  iteration:  7
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Round:  3  iteration:  8
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Round:  3  iteration:  9
Parameters: { silent } might not be used.

  This may 

BoostARoota(metric='logloss')

In [14]:
# смотрим отобранные признаки
br.keep_vars_

0         age
1     debtinc
2    creddebt
3     othdebt
Name: feature, dtype: object

# Пошаговая реализация алгоритма

In [15]:
# выполняем отбор признаков
new_X = br.transform(X)
# смотрим новый набор
new_X.head()

Unnamed: 0,age,debtinc,creddebt,othdebt
0,28,17.7,2.990592,4.797408
1,64,14.7,5.047392,12.004608
2,40,4.8,1.042368,1.885632
3,30,34.5,1.75122,7.56378
4,25,22.4,0.75936,5.96064


In [16]:
# задаем оптимизируемую метрику
param = {'eval_metric': 'logloss',
         'silent': 1}

In [17]:
# создаем копию обучающего массива признаков
x_shadow = X.copy()
x_shadow.head()

Unnamed: 0,age,debtinc,creddebt,othdebt,job_civil service,job_own business,job_working - IT,job_working - other,job_working - production,new_a,new_b,new_c
0,28,17.7,2.990592,4.797408,0,0,0,1,0,0,1,0
1,64,14.7,5.047392,12.004608,0,0,0,0,1,0,0,1
2,40,4.8,1.042368,1.885632,0,0,1,0,0,0,1,0
3,30,34.5,1.75122,7.56378,0,0,1,0,0,0,0,1
4,25,22.4,0.75936,5.96064,0,0,1,0,0,0,0,1


In [18]:
# в цикле проходим по всем признакам
# и перемешиваем их значения    
for c in x_shadow.columns:
    np.random.shuffle(x_shadow[c].values)

# смотрим перемешанные значения
x_shadow.head()

Unnamed: 0,age,debtinc,creddebt,othdebt,job_civil service,job_own business,job_working - IT,job_working - other,job_working - production,new_a,new_b,new_c
0,58,12.1,0.7488,0.408828,0,0,0,0,0,0,0,0
1,26,9.1,1.450242,1.586112,0,0,0,0,0,0,0,1
2,19,9.8,2.388498,2.308326,0,0,0,0,1,0,0,0
3,40,16.5,2.186184,3.14685,0,0,0,1,0,0,1,0
4,35,16.1,2.33772,6.18618,0,0,0,1,0,0,1,1


In [19]:
# переименовываем "теневые" признаки
shadow_names = ["ShadowVar" + str(i + 1) for i in range(X.shape[1])]
x_shadow.columns = shadow_names
x_shadow.head()

Unnamed: 0,ShadowVar1,ShadowVar2,ShadowVar3,ShadowVar4,ShadowVar5,ShadowVar6,ShadowVar7,ShadowVar8,ShadowVar9,ShadowVar10,ShadowVar11,ShadowVar12
0,58,12.1,0.7488,0.408828,0,0,0,0,0,0,0,0
1,26,9.1,1.450242,1.586112,0,0,0,0,0,0,0,1
2,19,9.8,2.388498,2.308326,0,0,0,0,1,0,0,0
3,40,16.5,2.186184,3.14685,0,0,0,1,0,0,1,0
4,35,16.1,2.33772,6.18618,0,0,0,1,0,0,1,1


In [20]:
# объединяем набор из исходных и набор из "теневых" признаков 
# в один новый датафрейм удвоенной ширины
new_x = pd.concat([X, x_shadow], axis=1)
new_x.head()

Unnamed: 0,age,debtinc,creddebt,othdebt,job_civil service,job_own business,job_working - IT,job_working - other,job_working - production,new_a,...,ShadowVar3,ShadowVar4,ShadowVar5,ShadowVar6,ShadowVar7,ShadowVar8,ShadowVar9,ShadowVar10,ShadowVar11,ShadowVar12
0,28,17.7,2.990592,4.797408,0,0,0,1,0,0,...,0.7488,0.408828,0,0,0,0,0,0,0,0
1,64,14.7,5.047392,12.004608,0,0,0,0,1,0,...,1.450242,1.586112,0,0,0,0,0,0,0,1
2,40,4.8,1.042368,1.885632,0,0,1,0,0,0,...,2.388498,2.308326,0,0,0,0,1,0,0,0
3,30,34.5,1.75122,7.56378,0,0,1,0,0,0,...,2.186184,3.14685,0,0,0,1,0,0,1,0
4,25,22.4,0.75936,5.96064,0,0,1,0,0,0,...,2.33772,6.18618,0,0,0,1,0,0,1,1


In [21]:
# преобразовываем массив признаков и массив меток в объект DMatrix 
dtrain = xgb.DMatrix(new_x, label=y)

In [22]:
# обучаем модель XGBoost
bst = xgb.train(param, dtrain, verbose_eval=False)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [23]:
# поскольку у нас - первая итерация, создаем датафрейм df 
# со столбцом-списком признаков
df = pd.DataFrame({'feature': new_x.columns})
df

Unnamed: 0,feature
0,age
1,debtinc
2,creddebt
3,othdebt
4,job_civil service
5,job_own business
6,job_working - IT
7,job_working - other
8,job_working - production
9,new_a


In [24]:
# получаем значение важности для каждого признака
importance = bst.get_fscore()
importance

{'debtinc': 73,
 'age': 72,
 'ShadowVar7': 8,
 'creddebt': 62,
 'othdebt': 38,
 'ShadowVar1': 32,
 'ShadowVar4': 45,
 'job_civil service': 7,
 'ShadowVar3': 36,
 'ShadowVar2': 41,
 'job_own business': 4,
 'job_working - production': 1,
 'job_working - IT': 4,
 'ShadowVar5': 2,
 'ShadowVar10': 6,
 'ShadowVar12': 1,
 'ShadowVar6': 1,
 'ShadowVar8': 8,
 'job_working - other': 3,
 'ShadowVar11': 4,
 'new_c': 1,
 'ShadowVar9': 2,
 'new_a': 2}

In [25]:
# сортируем по значению важности
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('job_working - production', 1),
 ('ShadowVar12', 1),
 ('ShadowVar6', 1),
 ('new_c', 1),
 ('ShadowVar5', 2),
 ('ShadowVar9', 2),
 ('new_a', 2),
 ('job_working - other', 3),
 ('job_own business', 4),
 ('job_working - IT', 4),
 ('ShadowVar11', 4),
 ('ShadowVar10', 6),
 ('job_civil service', 7),
 ('ShadowVar7', 8),
 ('ShadowVar8', 8),
 ('ShadowVar1', 32),
 ('ShadowVar3', 36),
 ('othdebt', 38),
 ('ShadowVar2', 41),
 ('ShadowVar4', 45),
 ('creddebt', 62),
 ('age', 72),
 ('debtinc', 73)]

In [26]:
# создаем датафрейм, содержащий названия предикторов и их важности
df2 = pd.DataFrame(importance, columns=['feature', 'fscore'+str(1)])
df2

Unnamed: 0,feature,fscore1
0,job_working - production,1
1,ShadowVar12,1
2,ShadowVar6,1
3,new_c,1
4,ShadowVar5,2
5,ShadowVar9,2
6,new_a,2
7,job_working - other,3
8,job_own business,4
9,job_working - IT,4


In [27]:
# нормируем значения важности
df2['fscore'+str(1)] = df2['fscore'+str(1)] / df2['fscore'+str(1)].sum()
df2

Unnamed: 0,feature,fscore1
0,job_working - production,0.002208
1,ShadowVar12,0.002208
2,ShadowVar6,0.002208
3,new_c,0.002208
4,ShadowVar5,0.004415
5,ShadowVar9,0.004415
6,new_a,0.004415
7,job_working - other,0.006623
8,job_own business,0.00883
9,job_working - IT,0.00883


In [28]:
# объединяем датафреймы df и df2, т.е. к df добавляется столбец со 
# значениями важности, найденными в текущей итерации 0
df = pd.merge(df, df2, on='feature', how='outer')
df

Unnamed: 0,feature,fscore1
0,age,0.15894
1,debtinc,0.161148
2,creddebt,0.136865
3,othdebt,0.083885
4,job_civil service,0.015453
5,job_own business,0.00883
6,job_working - IT,0.00883
7,job_working - other,0.006623
8,job_working - production,0.002208
9,new_a,0.004415


In [29]:
# теперь 10 раз запустим XGBoost и вычислим важности

# задаем оптимизируемую метрику
param = {'eval_metric': 'logloss',
         'silent': 1}

n_iterations=10

for i in range(1, n_iterations+1):
    # создаем "теневые" признаки:
    # new_x - содержит датафрейм удвоенной ширины 
    # с исходными и "теневыми" предикторами
    # shadow_names - список имен "теневых" признаков 
    # для последующего удаления
    new_x, shadow_names = _create_shadow(X)
    # преобразовываем массив признаков и массив меток в объект DMatrix 
    dtrain = xgb.DMatrix(new_x, label=y)
    # обучаем модель XGBoost
    bst = xgb.train(param, dtrain, verbose_eval=False)
    # если это первая итерация
    if i == 1:
        # создаем датафрейм df со столбцом-списком признаков
        df = pd.DataFrame({'feature': new_x.columns})
        pass

    # получаем значение важности для каждого признака, по умолчанию
    # используется weight - простой показатель важности, который 
    # суммирует, сколько раз конкретный признак использовался 
    # в качестве предиктора разбиения в алгоритме XGBoost
    importance = bst.get_fscore()
    # сортируем по значению важности
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    # создаем датафрейм, содержащий названия предикторов и их важности
    df2 = pd.DataFrame(importance, columns=['feature', 'fscore'+str(i)])
    # нормируем значения важности
    df2['fscore'+str(i)] = df2['fscore'+str(i)] / df2['fscore'+str(i)].sum()
    # объединяем датафреймы df и df2, т.е. к df добавляется столбец со 
    # значениями важности, найденными в текущей i-ой итерации
    df = pd.merge(df, df2, on='feature', how='outer')
    
df

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Unnamed: 0,feature,fscore1,fscore2,fscore3,fscore4,fscore5,fscore6,fscore7,fscore8,fscore9,fscore10
0,age,0.191781,0.151261,0.170732,0.185185,0.181416,0.158537,0.152034,0.155419,0.146998,0.174528
1,debtinc,0.159817,0.151261,0.135255,0.145969,0.139381,0.138211,0.162741,0.159509,0.15942,0.146226
2,creddebt,0.152968,0.144958,0.155211,0.11329,0.146018,0.152439,0.141328,0.147239,0.138716,0.141509
3,othdebt,0.068493,0.071429,0.097561,0.08061,0.053097,0.101626,0.085653,0.087935,0.086957,0.07783
4,job_civil service,0.011416,0.018908,0.013304,0.017429,0.015487,0.01626,0.019272,0.014315,0.014493,0.016509
5,job_own business,0.006849,0.012605,0.008869,0.010893,0.006637,0.020325,0.006424,0.010225,0.008282,0.009434
6,job_working - IT,0.004566,0.012605,0.004435,0.006536,0.00885,0.004065,0.008565,0.006135,0.006211,0.011792
7,job_working - other,0.004566,,,0.010893,,0.004065,,0.002045,0.006211,0.011792
8,job_working - production,,,0.004435,,0.004425,0.002033,0.002141,0.00409,0.004141,0.007075
9,new_a,0.006849,0.004202,0.002217,0.008715,0.004425,0.00813,0.002141,0.006135,0.010352,0.011792


In [30]:
# в df добавляем усредненное значение важности 
# по всем пройденным итерациям 
df['Mean'] = df.mean(axis=1)
df

Unnamed: 0,feature,fscore1,fscore2,fscore3,fscore4,fscore5,fscore6,fscore7,fscore8,fscore9,fscore10,Mean
0,age,0.191781,0.151261,0.170732,0.185185,0.181416,0.158537,0.152034,0.155419,0.146998,0.174528,0.166789
1,debtinc,0.159817,0.151261,0.135255,0.145969,0.139381,0.138211,0.162741,0.159509,0.15942,0.146226,0.149779
2,creddebt,0.152968,0.144958,0.155211,0.11329,0.146018,0.152439,0.141328,0.147239,0.138716,0.141509,0.143368
3,othdebt,0.068493,0.071429,0.097561,0.08061,0.053097,0.101626,0.085653,0.087935,0.086957,0.07783,0.081119
4,job_civil service,0.011416,0.018908,0.013304,0.017429,0.015487,0.01626,0.019272,0.014315,0.014493,0.016509,0.015739
5,job_own business,0.006849,0.012605,0.008869,0.010893,0.006637,0.020325,0.006424,0.010225,0.008282,0.009434,0.010054
6,job_working - IT,0.004566,0.012605,0.004435,0.006536,0.00885,0.004065,0.008565,0.006135,0.006211,0.011792,0.007376
7,job_working - other,0.004566,,,0.010893,,0.004065,,0.002045,0.006211,0.011792,0.006596
8,job_working - production,,,0.004435,,0.004425,0.002033,0.002141,0.00409,0.004141,0.007075,0.004048
9,new_a,0.006849,0.004202,0.002217,0.008715,0.004425,0.00813,0.002141,0.006135,0.010352,0.011792,0.006496


In [31]:
# выполняем обратное разделение признаков на исходные и "теневые"
real_vars = df[~df['feature'].isin(shadow_names)]
shadow_vars = df[df['feature'].isin(shadow_names)]

In [32]:
# смотрим исходные признаки
real_vars

Unnamed: 0,feature,fscore1,fscore2,fscore3,fscore4,fscore5,fscore6,fscore7,fscore8,fscore9,fscore10,Mean
0,age,0.191781,0.151261,0.170732,0.185185,0.181416,0.158537,0.152034,0.155419,0.146998,0.174528,0.166789
1,debtinc,0.159817,0.151261,0.135255,0.145969,0.139381,0.138211,0.162741,0.159509,0.15942,0.146226,0.149779
2,creddebt,0.152968,0.144958,0.155211,0.11329,0.146018,0.152439,0.141328,0.147239,0.138716,0.141509,0.143368
3,othdebt,0.068493,0.071429,0.097561,0.08061,0.053097,0.101626,0.085653,0.087935,0.086957,0.07783,0.081119
4,job_civil service,0.011416,0.018908,0.013304,0.017429,0.015487,0.01626,0.019272,0.014315,0.014493,0.016509,0.015739
5,job_own business,0.006849,0.012605,0.008869,0.010893,0.006637,0.020325,0.006424,0.010225,0.008282,0.009434,0.010054
6,job_working - IT,0.004566,0.012605,0.004435,0.006536,0.00885,0.004065,0.008565,0.006135,0.006211,0.011792,0.007376
7,job_working - other,0.004566,,,0.010893,,0.004065,,0.002045,0.006211,0.011792,0.006596
8,job_working - production,,,0.004435,,0.004425,0.002033,0.002141,0.00409,0.004141,0.007075,0.004048
9,new_a,0.006849,0.004202,0.002217,0.008715,0.004425,0.00813,0.002141,0.006135,0.010352,0.011792,0.006496


In [33]:
# смотрим "теневые" признаки
shadow_vars

Unnamed: 0,feature,fscore1,fscore2,fscore3,fscore4,fscore5,fscore6,fscore7,fscore8,fscore9,fscore10,Mean
12,ShadowVar1,0.054795,0.071429,0.066519,0.067538,0.075221,0.071138,0.06424,0.08998,0.062112,0.063679,0.068665
13,ShadowVar2,0.075342,0.102941,0.113082,0.067538,0.088496,0.063008,0.104925,0.07771,0.097308,0.073113,0.086346
14,ShadowVar3,0.114155,0.094538,0.093126,0.11329,0.103982,0.107724,0.089936,0.07362,0.086957,0.096698,0.097403
15,ShadowVar4,0.091324,0.073529,0.079823,0.089325,0.117257,0.079268,0.077088,0.083845,0.078675,0.113208,0.088334
16,ShadowVar5,0.006849,0.006303,0.002217,0.017429,0.006637,0.00813,0.004283,0.00818,0.010352,0.007075,0.007746
17,ShadowVar6,0.004566,0.023109,0.008869,0.002179,0.00885,0.00813,0.004283,0.01227,0.008282,0.004717,0.008525
18,ShadowVar7,0.004566,0.004202,0.002217,0.015251,,0.00813,0.008565,0.006135,0.016563,,0.008204
19,ShadowVar8,0.006849,0.006303,0.006652,0.002179,0.004425,0.012195,0.010707,0.01227,0.008282,0.009434,0.007929
20,ShadowVar9,0.004566,0.002101,0.002217,0.004357,0.004425,0.004065,0.014989,0.00409,0.008282,0.004717,0.005381
21,ShadowVar10,0.006849,0.008403,0.006652,0.015251,0.002212,0.014228,0.010707,0.006135,0.010352,0.002358,0.008315


In [34]:
# вычисляем «порог отсечения»: среднее значение важности 
# для всех «теневых» признаков, поделенное на значение 
# cutoff (по умолчанию равно 4)
cutoff = 4
mean_shadow = shadow_vars['Mean'].mean() / cutoff
mean_shadow

0.00838879390258888

In [35]:
# удаляем исходные признаки, средняя важность которых по результатам 
# всех итераций меньше «порога отсечения»
real_vars = real_vars[(real_vars.Mean > mean_shadow)]
real_vars

Unnamed: 0,feature,fscore1,fscore2,fscore3,fscore4,fscore5,fscore6,fscore7,fscore8,fscore9,fscore10,Mean
0,age,0.191781,0.151261,0.170732,0.185185,0.181416,0.158537,0.152034,0.155419,0.146998,0.174528,0.166789
1,debtinc,0.159817,0.151261,0.135255,0.145969,0.139381,0.138211,0.162741,0.159509,0.15942,0.146226,0.149779
2,creddebt,0.152968,0.144958,0.155211,0.11329,0.146018,0.152439,0.141328,0.147239,0.138716,0.141509,0.143368
3,othdebt,0.068493,0.071429,0.097561,0.08061,0.053097,0.101626,0.085653,0.087935,0.086957,0.07783,0.081119
4,job_civil service,0.011416,0.018908,0.013304,0.017429,0.015487,0.01626,0.019272,0.014315,0.014493,0.016509,0.015739
5,job_own business,0.006849,0.012605,0.008869,0.010893,0.006637,0.020325,0.006424,0.010225,0.008282,0.009434,0.010054


In [36]:
# возвращаем значение критерия остановки
delta = 0.1
criteria = (len(real_vars['feature']) / len(X.columns)) > (1 - delta)
criteria

False

In [37]:
# возвращаем оставшиеся признаки
keep_vars = real_vars['feature']
keep_vars

0                  age
1              debtinc
2             creddebt
3              othdebt
4    job_civil service
5     job_own business
Name: feature, dtype: object

In [38]:
# формируем новый массив из признаков keep_vars
new_x = new_x[keep_vars].copy()
new_x.head()

Unnamed: 0,age,debtinc,creddebt,othdebt,job_civil service,job_own business
0,28,17.7,2.990592,4.797408,0,0
1,64,14.7,5.047392,12.004608,0,0
2,40,4.8,1.042368,1.885632,0,0
3,30,34.5,1.75122,7.56378,0,0
4,25,22.4,0.75936,5.96064,0,0


# Кейс применения класса BoostARoota

In [39]:
# импортируем необходимые библиотеки
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from boost_preprocessing import BoostARoota
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [40]:
# загружаем набор
data = pd.read_csv('Data/santander_train.csv')
data

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,151829,2,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60926.490000,0
76016,151830,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118634.520000,0
76017,151835,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74028.150000,0
76018,151836,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84278.160000,0


In [41]:
data_inspection(data)

Memory usage:  Index                         128
ID                         608160
var3                       608160
var15                      608160
imp_ent_var16_ult1         608160
                            ...  
saldo_medio_var44_hace3    608160
saldo_medio_var44_ult1     608160
saldo_medio_var44_ult3     608160
var38                      608160
TARGET                     608160
Length: 372, dtype: int64
The number of raws:  76020
The number of columns:  371


Unnamed: 0,nunique,missing,type
ID,76020,0,int64
var3,208,0,int64
var15,100,0,int64
imp_ent_var16_ult1,596,0,float64
imp_op_var39_comer_ult1,7551,0,float64
...,...,...,...
saldo_medio_var44_hace3,33,0,float64
saldo_medio_var44_ult1,141,0,float64
saldo_medio_var44_ult3,141,0,float64
var38,57736,0,float64


In [42]:
data.var38.to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   var38   76020 non-null  float64
dtypes: float64(1)
memory usage: 594.0 KB


In [43]:
data.var38.to_frame().describe()

Unnamed: 0,var38
count,76020.0
mean,117235.8
std,182664.6
min,5163.75
25%,67870.61
50%,106409.2
75%,118756.3
max,22034740.0


In [44]:
# в квазиконстантных признаках бывает эффективным значению с
# наибольшей частотой присвоить большое отрицательное значение
data.loc[(data['var38'] > 117310.979) & (data['var38'] < 117310.98), 'var38'] = -999.0
    
# удаляем ID
data.drop('ID', axis=1, inplace=True)
    
# удаляем константные признаки
constant_features = [feat for feat in data.columns if data[feat].nunique() == 1]
data.drop(constant_features, axis=1, inplace=True)
    
# удаляем дублирующиеся признаки
duplicated_features = ['ind_var6_0', 'ind_var6', 'num_var6_0', 'num_var6', 'saldo_var6',
                       'delta_imp_reemb_var13_1y3', 'delta_imp_reemb_var17_1y3',
                       'delta_imp_reemb_var33_1y3', 'delta_imp_trasp_var17_in_1y3',
                       'delta_imp_trasp_var17_out_1y3', 'delta_imp_trasp_var33_in_1y3',
                       'delta_imp_trasp_var33_out_1y3', 'saldo_medio_var13_medio_ult1']
    
data.drop(duplicated_features, axis=1, inplace=True)

In [45]:
# разбиваем данные на обучающие и тестовые: получаем обучающий
# массив признаков, тестовый массив признаков, обучающий массив
# меток, тестовый массив меток
X_train, X_test, y_train, y_test = train_test_split(data.drop('TARGET', axis=1), 
                                                    data['TARGET'], 
                                                    test_size=0.3,
                                                    stratify=data['TARGET'],
                                                    random_state=42)

# создаем экземпляр класса BoostARoota
br = BoostARoota(metric='logloss', silent=True)

# создаем экземпляр класса XGBClassifier
xgb_model = xgb.XGBClassifier(eta=0.04,
                              n_estimators=150,
                              max_depth=4,
                              subsample=0.9,
                              colsample_bytree=0.6,
                              objective='binary:logistic',
                              random_state=42)

# создаем конвейер
pipe = Pipeline([('selector', br),
                 ('xgbst', xgb_model)])

# задаем стратегию проверки
strat = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# задаем сетку гиперпараметров
param_grid = {'selector__cutoff': [4, 10, 20]}

# задаем настройки поиска по сетке
gs = GridSearchCV(pipe, 
                  param_grid, 
                  scoring='roc_auc',
                  cv=strat, 
                  return_train_score=False)

In [46]:
%%time

# выполняем поиск по сетке
gs.fit(X_train, y_train)
# смотрим наилучшие значения гиперпараметров
print('Наилучшие значения гиперпараметров: {}'.format(gs.best_params_))
# смотрим наилучшее значение AUC
print('Наилучшее значение AUC: {:.3f}'.format(gs.best_score_))
# смотрим значение AUC на тестовой выборке
print('AUC на тестовом наборе: {:.3f}'.format(
    roc_auc_score(y_test, gs.predict_proba(X_test)[:, 1])))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo



Наилучшие значения гиперпараметров: {'selector__cutoff': 4}
Наилучшее значение AUC: 0.835
AUC на тестовом наборе: 0.845
Wall time: 9min 17s


In [47]:
# выводим результаты поиска
cv_results = pd.DataFrame(gs.cv_results_)[['mean_test_score',
                                           'param_selector__cutoff']]
cv_results = cv_results.sort_values(by='mean_test_score', ascending=False).reset_index(drop=True)
cv_results

Unnamed: 0,mean_test_score,param_selector__cutoff
0,0.834759,4
1,0.834651,10
2,0.834456,20


In [48]:
# формируем список предикторов, 
# который дал наилучший результат
fs = gs.best_estimator_.named_steps['selector']
br_selected_features = fs.keep_vars_.tolist()
br_selected_features

['var3',
 'var15',
 'imp_ent_var16_ult1',
 'imp_op_var39_comer_ult1',
 'imp_op_var39_comer_ult3',
 'imp_op_var40_comer_ult1',
 'imp_op_var40_comer_ult3',
 'imp_op_var40_efect_ult1',
 'imp_op_var40_efect_ult3',
 'imp_op_var41_comer_ult1',
 'imp_op_var41_comer_ult3',
 'imp_op_var41_efect_ult1',
 'imp_op_var41_efect_ult3',
 'imp_op_var41_ult1',
 'imp_op_var39_efect_ult1',
 'imp_op_var39_ult1',
 'imp_sal_var16_ult1',
 'ind_var1_0',
 'ind_var5_0',
 'ind_var5',
 'ind_var8_0',
 'ind_var12_0',
 'ind_var13_0',
 'ind_var26_cte',
 'ind_var30_0',
 'ind_var30',
 'ind_var32_cte',
 'ind_var37_cte',
 'ind_var39_0',
 'ind_var41_0',
 'num_var1_0',
 'num_var4',
 'num_var5',
 'num_var12_0',
 'num_var26_0',
 'num_op_var40_ult1',
 'num_op_var41_hace2',
 'num_op_var41_ult1',
 'num_op_var41_ult3',
 'num_op_var39_ult1',
 'num_var35',
 'num_var37_med_ult2',
 'num_var37_0',
 'num_var39_0',
 'num_var42_0',
 'saldo_var1',
 'saldo_var5',
 'saldo_var8',
 'saldo_var12',
 'saldo_var25',
 'saldo_var30',
 'saldo_var37',

In [49]:
# смотрим количество отобранных признаков
len(br_selected_features)

93

In [50]:
# загружаем наборы
train = pd.read_csv(path+'santander_train.csv')
test = pd.read_csv(path+'santander_test.csv')

# сохраняем ID тестового набора
test_id = test['ID']

# формируем массив меток и массив признаков
labels = train.pop('TARGET')

# наиболее часто встречающемуся значению присваиваем -999.0 
train.loc[(train['var38'] > 117310.979) & (train['var38'] < 117310.98), 'var38'] = -999.0
test.loc[(test['var38'] > 117310.979) & (test['var38'] < 117310.98), 'var38'] = -999.0

# формируем новые наборы на основе отобранных признаков
train = train[br_selected_features]
test = test[br_selected_features]

# строим модель на всей обучающей выборке
xgb_model.fit(train, labels)

# вычисляем вероятности для тестовой выборки
preds_prob = xgb_model.predict_proba(test)[:, 1]
# формируем посылку
pd.DataFrame({'ID': test_id, 'TARGET': preds_prob}).to_csv('subm_boostaroota.csv', index=False)



