# Elo Merchant Categories
### Competition on Kaggle
## Outlier Detection: Bag of Catboost Classifiers
### 2019-01-04

In [1]:
## Competition-wide constants: key column names and file locations

# Name of the card identifier column and of the target column
ID, TARGET = 'card_id', 'target'

# Folder with the raw competition files and folder with derived data
RAW_DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/rawdata/'
DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/data/'

# Raw input files, all located in RAW_DIRECTORY
HIST_TRANS_FILE = '{}historical_transactions.csv'.format(RAW_DIRECTORY)
MERCHANTS_FILE = '{}merchants.csv'.format(RAW_DIRECTORY)
NEW_MERCH_TRANS_FILE = '{}new_merchant_transactions.csv'.format(RAW_DIRECTORY)
TRAIN_FILE = '{}train.csv'.format(RAW_DIRECTORY)
TEST_FILE = '{}test.csv'.format(RAW_DIRECTORY)
SAMPLE_SUBMISSION_FILE = '{}sample_submission.csv'.format(RAW_DIRECTORY)

# Folder that receives the submission files
SUBMISSION_DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/submissions/'

In [2]:
## Constants used only by this notebook
NUM = 0
# Common prefix of the submission files written by this notebook
SUBMIT_FILENAME = '{}submit_190104_'.format(SUBMISSION_DIRECTORY)

# Pickled aggregate tables, all located in DIRECTORY
HIST_AGG_FILE_1 = '{}hist_agg_20.pkl'.format(DIRECTORY)        # only authorized, domestic, last 3 months
HIST_AGG_FILE_2 = '{}hist_agg_28.pkl'.format(DIRECTORY)        # only authorized, non-domestic, last 3 months
NEW_AGG_FILE_1 = '{}new_agg_3.pkl'.format(DIRECTORY)           # only authorized, domestic, whole period
NEW_AGG_FILE_2 = '{}new_agg_5.pkl'.format(DIRECTORY)           # only authorized, non-domestic, whole period

In [3]:
from catboost import CatBoostRegressor, CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle
import random
random.seed(1)

In [4]:
## Useful functions
def reduce_mem_usage(df, verbose = True):
    '''
    Downcast the numeric columns of df to the smallest dtype that can hold their value range.

    df : DataFrame; its columns are replaced in place and the same object is returned.
    verbose : when True, print the memory usage before and after the conversion.

    Integer columns are tried against int8, int16, int32 and int64; float columns against
    float16 and float32, falling back to float64. The range checks are inclusive, so a column
    whose extreme values sit exactly on a dtype limit (e.g. -128 or 127) still gets that dtype.

    NOTE: float16 keeps only about three significant decimal digits, so float values are
    rounded when a column is stored as float16.
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = [np.int8, np.int16, np.int32, np.int64]
    float_candidates = [np.float16, np.float32]

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Starting memory usage: {:5.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Take the first (smallest) integer type whose limits enclose the column's range.
            # If none matches (e.g. min/max are NaN for an empty column) the column is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback; it is also used when min/max are NaN or infinite.
            new_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    new_type = candidate
                    break
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Reduced memory usage: {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def mode(series):
    '''Return the first value reported by series.mode(), or np.nan when that result is empty.'''
    modes = series.mode()
    return np.nan if len(modes) == 0 else modes.iloc[0]

def label_encoder(train_df, test_df = None, valid_df = None, localtest_df = None,
                  prefix = '_labelenc_', suffix = '',
                  target = 'is_outlier',
                  cols_and_encodings = {'feature_1' : ['mean', 'median'],
                                         'feature_2' : [mode]}):
    '''
    Calculates label encodings based on train_df.
    Can be used both for classification and regression problems.

    For every encoded feature the target values of train_df are aggregated per feature value,
    and the resulting statistics are left-joined onto train_df and onto every other frame
    that was passed in. Feature values that do not occur in train_df get NaN.

    train_df : frame that contains the features and the target column.
    test_df, valid_df, localtest_df : optional frames that receive the same encodings;
        a frame passed as None is returned as None.
    prefix, suffix : parts of the new column names, which are built as
        <feature> + prefix + <target>_<aggregation> + suffix.
    target : name of the column whose values are aggregated.
    cols_and_encodings is a dictionary.
    Keys: features that we want to encode (usually categorical features with only a few possible values).
    Values : list of aggregation functions which will be applied on target values.

    Returns the tuple (train_df, test_df, valid_df, localtest_df) of merged frames.
    '''
    for col, enc_list in cols_and_encodings.items():
        agg_df = train_df.groupby(col).agg({target : enc_list})
        # Flatten the (target, aggregation) column pairs into plain column names.
        agg_df.columns = [col + prefix + '_'.join(colname).strip() + suffix for colname in agg_df.columns.values]
        agg_df.reset_index(inplace = True)
        train_df = train_df.merge(agg_df, how = 'left', on = col)
        # Compare with None explicitly: the truth value of a DataFrame is ambiguous,
        # so `if test_df:` raises ValueError as soon as a frame is passed.
        if test_df is not None:
            test_df = test_df.merge(agg_df, how = 'left', on = col)
        if valid_df is not None:
            valid_df = valid_df.merge(agg_df, how = 'left', on = col)
        if localtest_df is not None:
            localtest_df = localtest_df.merge(agg_df, how = 'left', on = col)

    return train_df, test_df, valid_df, localtest_df

In [5]:
# Pickled aggregate tables (they are joined to the training table on card_id further down)
hist_agg_1 = pd.read_pickle(HIST_AGG_FILE_1)
hist_agg_2 = pd.read_pickle(HIST_AGG_FILE_2)
new_agg_1 = pd.read_pickle(NEW_AGG_FILE_1)
new_agg_2 = pd.read_pickle(NEW_AGG_FILE_2)
# Raw competition tables; merchants.csv is read only once (a second, identical read was redundant)
merch = pd.read_csv(MERCHANTS_FILE)
train = pd.read_csv(TRAIN_FILE, parse_dates = ["first_active_month"])

In [6]:
# card_id: skip the first five characters and read the rest as a hexadecimal number.
train['card_id'] = train['card_id'].apply(lambda raw_id : int(raw_id[5:], 16))

# Outlier flag: 1 where the target is below -30, otherwise 0.
train['is_outlier'] = (train['target'] < -30).astype('int64')

train = reduce_mem_usage(df = train, verbose = True)

Starting memory usage: 10.78 MB
Reduced memory usage:  4.24 MB (60.7% reduction)


In [7]:
# Preview the training table after the id conversion, the outlier flag and the downcasting.
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,is_outlier
0,2017-06-01,629783156055,5,2,1,-0.820312,0
1,2017-01-01,261997498959,4,1,0,0.392822,0
2,2016-08-01,920094897869,2,2,0,0.687988,0
3,2017-09-01,104914905345,4,3,0,0.142456,0
4,2017-11-01,883642076594,1,3,0,-0.15979,0


In [8]:
# merchant_id: skip the first five characters and read the rest as a hexadecimal number.
merch['merchant_id'] = merch['merchant_id'].apply(lambda raw_id : int(raw_id[5:], 16))

# Turn the text-coded columns into numbers; any value without a code (including missing) becomes NaN.
# Y/N flags -> 1/0
merch['category_1'] = merch['category_1'].map({'Y' : 1, 'N' : 0})
merch['category_4'] = merch['category_4'].map({'Y' : 1, 'N' : 0})
# Range letters A..E -> 1..5
merch['most_recent_sales_range'] = merch['most_recent_sales_range'].map(
    {'A' : 1, 'B' : 2, 'C' : 3, 'D' : 4, 'E' : 5})
merch['most_recent_purchases_range'] = merch['most_recent_purchases_range'].map(
    {'A' : 1, 'B' : 2, 'C' : 3, 'D' : 4, 'E' : 5})

merch = reduce_mem_usage(df = merch, verbose = True)

Starting memory usage: 56.18 MB
Reduced memory usage: 21.39 MB (61.9% reduction)


In [9]:
# Preview the merchant table after the numeric encoding and the downcasting.
merch.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,564794614924,8353,792,9,-0.057465,-0.057465,0,5,5,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,0,242,9,1.0
1,632330682541,3184,840,20,-0.057465,-0.057465,0,5,5,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,0,22,16,1.0
2,992787279390,447,690,1,-0.057465,-0.057465,0,5,5,-82.129997,...,-82.129997,260.0,2,-82.129997,260.0,2,0,-1,5,5.0
3,717504667521,5026,792,9,-0.057465,-0.057465,1,5,5,,...,,4.666667,6,,3.833333,12,1,-1,-1,
4,430661449678,2228,222,21,-0.057465,-0.057465,1,5,5,,...,,0.361111,6,,0.347222,12,1,-1,-1,


In [10]:
# Left-join the four aggregate tables to the training table, one after the other, on card_id.
df = (train
      .merge(hist_agg_1, how = 'left', on = 'card_id')
      .merge(hist_agg_2, how = 'left', on = 'card_id')
      .merge(new_agg_1, how = 'left', on = 'card_id')
      .merge(new_agg_2, how = 'left', on = 'card_id'))

In [11]:
# Preview the training table with the four aggregate tables joined in.
df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,is_outlier,hist_authorized_flag_sum_auth_dom_lag2m,hist_authorized_flag_mean_auth_dom_lag2m,hist_city_id_nunique_auth_dom_lag2m,...,new_purchase_quarter_max_nondom_all,new_purchase_quarter_min_nondom_all,new_purchase_quarter_std_nondom_all,new_purchase_quarter_mode_nondom_all,new_week_lag_median_nondom_all,new_week_lag_mean_nondom_all,new_week_lag_max_nondom_all,new_week_lag_min_nondom_all,new_week_lag_std_nondom_all,new_week_lag_mode_nondom_all
0,2017-06-01,629783156055,5,2,1,-0.820312,0,99.0,1.0,4.0,...,,,,,,,,,,
1,2017-01-01,261997498959,4,1,0,0.392822,0,107.0,1.0,4.0,...,,,,,,,,,,
2,2016-08-01,920094897869,2,2,0,0.687988,0,4.0,1.0,1.0,...,,,,,,,,,,
3,2017-09-01,104914905345,4,3,0,0.142456,0,21.0,1.0,2.0,...,2.0,2.0,,2.0,11.0,11.0,11.0,11.0,,11.0
4,2017-11-01,883642076594,1,3,0,-0.15979,0,97.0,1.0,4.0,...,2.0,1.0,0.707107,1.0,7.5,7.5,9.0,6.0,2.12132,6.0


In [12]:
# Left-join the merchant table on the column 'hist_merchant_id_mode_auth_dom_lag2m'.
# NOTE(review): pandas applies `suffixes` only to column names that exist in both frames, so merchant
# columns that do not clash with an existing column presumably keep their plain names after this join.
# The later rename cell, which appends '_hist_merchant_id_mode_auth_dom_lag2m', appears to rely on that;
# confirm before reordering the merchant joins.
df = df.merge(merch, how = 'left', left_on = 'hist_merchant_id_mode_auth_dom_lag2m', right_on = 'merchant_id',
              suffixes = ('', '_hist_merchant_id_mode_auth_dom_lag2m'))
list(df.columns)

['first_active_month',
 'card_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'target',
 'is_outlier',
 'hist_authorized_flag_sum_auth_dom_lag2m',
 'hist_authorized_flag_mean_auth_dom_lag2m',
 'hist_city_id_nunique_auth_dom_lag2m',
 'hist_city_id_mode_auth_dom_lag2m',
 'hist_city_id_nancnt_auth_dom_lag2m',
 'hist_city_id_nanperc_auth_dom_lag2m',
 'hist_category_1_sum_auth_dom_lag2m',
 'hist_category_1_mean_auth_dom_lag2m',
 'hist_installments_sum_auth_dom_lag2m',
 'hist_installments_median_auth_dom_lag2m',
 'hist_installments_mean_auth_dom_lag2m',
 'hist_installments_max_auth_dom_lag2m',
 'hist_installments_min_auth_dom_lag2m',
 'hist_installments_std_auth_dom_lag2m',
 'hist_installments_mode_auth_dom_lag2m',
 'hist_installments_nancnt_auth_dom_lag2m',
 'hist_installments_nanperc_auth_dom_lag2m',
 'hist_category_3_nunique_auth_dom_lag2m',
 'hist_category_3_mode_auth_dom_lag2m',
 'hist_category_3_nancnt_auth_dom_lag2m',
 'hist_category_3_nanperc_auth_dom_lag2m',
 'hist_merchant_category

In [13]:
# Left-join the merchant table again, this time on 'hist_merchant_id_mode_auth_nondom_lag2m'.
# Merchant columns whose names already exist in df receive the suffix given below.
df = df.merge(merch, how = 'left', left_on = 'hist_merchant_id_mode_auth_nondom_lag2m', right_on = 'merchant_id',
              suffixes = ('', '_hist_merchant_id_mode_auth_nondom_lag2m'))
list(df.columns)

['first_active_month',
 'card_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'target',
 'is_outlier',
 'hist_authorized_flag_sum_auth_dom_lag2m',
 'hist_authorized_flag_mean_auth_dom_lag2m',
 'hist_city_id_nunique_auth_dom_lag2m',
 'hist_city_id_mode_auth_dom_lag2m',
 'hist_city_id_nancnt_auth_dom_lag2m',
 'hist_city_id_nanperc_auth_dom_lag2m',
 'hist_category_1_sum_auth_dom_lag2m',
 'hist_category_1_mean_auth_dom_lag2m',
 'hist_installments_sum_auth_dom_lag2m',
 'hist_installments_median_auth_dom_lag2m',
 'hist_installments_mean_auth_dom_lag2m',
 'hist_installments_max_auth_dom_lag2m',
 'hist_installments_min_auth_dom_lag2m',
 'hist_installments_std_auth_dom_lag2m',
 'hist_installments_mode_auth_dom_lag2m',
 'hist_installments_nancnt_auth_dom_lag2m',
 'hist_installments_nanperc_auth_dom_lag2m',
 'hist_category_3_nunique_auth_dom_lag2m',
 'hist_category_3_mode_auth_dom_lag2m',
 'hist_category_3_nancnt_auth_dom_lag2m',
 'hist_category_3_nanperc_auth_dom_lag2m',
 'hist_merchant_category

In [14]:
# Left-join the merchant table on 'new_merchant_id_mode_dom_all'.
# Merchant columns whose names already exist in df receive the suffix given below.
df = df.merge(merch, how = 'left', left_on = 'new_merchant_id_mode_dom_all', right_on = 'merchant_id',
              suffixes = ('', '_new_merchant_id_mode_dom_all'))
list(df.columns)

['first_active_month',
 'card_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'target',
 'is_outlier',
 'hist_authorized_flag_sum_auth_dom_lag2m',
 'hist_authorized_flag_mean_auth_dom_lag2m',
 'hist_city_id_nunique_auth_dom_lag2m',
 'hist_city_id_mode_auth_dom_lag2m',
 'hist_city_id_nancnt_auth_dom_lag2m',
 'hist_city_id_nanperc_auth_dom_lag2m',
 'hist_category_1_sum_auth_dom_lag2m',
 'hist_category_1_mean_auth_dom_lag2m',
 'hist_installments_sum_auth_dom_lag2m',
 'hist_installments_median_auth_dom_lag2m',
 'hist_installments_mean_auth_dom_lag2m',
 'hist_installments_max_auth_dom_lag2m',
 'hist_installments_min_auth_dom_lag2m',
 'hist_installments_std_auth_dom_lag2m',
 'hist_installments_mode_auth_dom_lag2m',
 'hist_installments_nancnt_auth_dom_lag2m',
 'hist_installments_nanperc_auth_dom_lag2m',
 'hist_category_3_nunique_auth_dom_lag2m',
 'hist_category_3_mode_auth_dom_lag2m',
 'hist_category_3_nancnt_auth_dom_lag2m',
 'hist_category_3_nanperc_auth_dom_lag2m',
 'hist_merchant_category

In [15]:
# Left-join the merchant table on 'new_merchant_id_mode_nondom_all'.
# Merchant columns whose names already exist in df receive the suffix given below.
df = df.merge(merch, how = 'left', left_on = 'new_merchant_id_mode_nondom_all', right_on = 'merchant_id',
              suffixes = ('', '_new_merchant_id_mode_nondom_all'))
list(df.columns)

['first_active_month',
 'card_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'target',
 'is_outlier',
 'hist_authorized_flag_sum_auth_dom_lag2m',
 'hist_authorized_flag_mean_auth_dom_lag2m',
 'hist_city_id_nunique_auth_dom_lag2m',
 'hist_city_id_mode_auth_dom_lag2m',
 'hist_city_id_nancnt_auth_dom_lag2m',
 'hist_city_id_nanperc_auth_dom_lag2m',
 'hist_category_1_sum_auth_dom_lag2m',
 'hist_category_1_mean_auth_dom_lag2m',
 'hist_installments_sum_auth_dom_lag2m',
 'hist_installments_median_auth_dom_lag2m',
 'hist_installments_mean_auth_dom_lag2m',
 'hist_installments_max_auth_dom_lag2m',
 'hist_installments_min_auth_dom_lag2m',
 'hist_installments_std_auth_dom_lag2m',
 'hist_installments_mode_auth_dom_lag2m',
 'hist_installments_nancnt_auth_dom_lag2m',
 'hist_installments_nanperc_auth_dom_lag2m',
 'hist_category_3_nunique_auth_dom_lag2m',
 'hist_category_3_mode_auth_dom_lag2m',
 'hist_category_3_nancnt_auth_dom_lag2m',
 'hist_category_3_nanperc_auth_dom_lag2m',
 'hist_merchant_category

In [16]:
# List the merchant column names; the next cell uses them to rename columns of df.
merch.columns

Index(['merchant_id', 'merchant_group_id', 'merchant_category_id',
       'subsector_id', 'numerical_1', 'numerical_2', 'category_1',
       'most_recent_sales_range', 'most_recent_purchases_range',
       'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6',
       'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12',
       'category_4', 'city_id', 'state_id', 'category_2'],
      dtype='object')

In [17]:
# Every column of df that still carries a plain merchant column name gets the suffix of the first
# merchant join; merchant names that are not present in df are ignored by rename.
df.rename(columns = {name : name + '_hist_merchant_id_mode_auth_dom_lag2m' for name in merch.columns},
          inplace = True)
df.columns

Index(['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3',
       'target', 'is_outlier', 'hist_authorized_flag_sum_auth_dom_lag2m',
       'hist_authorized_flag_mean_auth_dom_lag2m',
       'hist_city_id_nunique_auth_dom_lag2m',
       ...
       'avg_sales_lag6_new_merchant_id_mode_nondom_all',
       'avg_purchases_lag6_new_merchant_id_mode_nondom_all',
       'active_months_lag6_new_merchant_id_mode_nondom_all',
       'avg_sales_lag12_new_merchant_id_mode_nondom_all',
       'avg_purchases_lag12_new_merchant_id_mode_nondom_all',
       'active_months_lag12_new_merchant_id_mode_nondom_all',
       'category_4_new_merchant_id_mode_nondom_all',
       'city_id_new_merchant_id_mode_nondom_all',
       'state_id_new_merchant_id_mode_nondom_all',
       'category_2_new_merchant_id_mode_nondom_all'],
      dtype='object', length=493)

In [18]:
# Show the full list of column names after the rename.
list(df.columns)

['first_active_month',
 'card_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'target',
 'is_outlier',
 'hist_authorized_flag_sum_auth_dom_lag2m',
 'hist_authorized_flag_mean_auth_dom_lag2m',
 'hist_city_id_nunique_auth_dom_lag2m',
 'hist_city_id_mode_auth_dom_lag2m',
 'hist_city_id_nancnt_auth_dom_lag2m',
 'hist_city_id_nanperc_auth_dom_lag2m',
 'hist_category_1_sum_auth_dom_lag2m',
 'hist_category_1_mean_auth_dom_lag2m',
 'hist_installments_sum_auth_dom_lag2m',
 'hist_installments_median_auth_dom_lag2m',
 'hist_installments_mean_auth_dom_lag2m',
 'hist_installments_max_auth_dom_lag2m',
 'hist_installments_min_auth_dom_lag2m',
 'hist_installments_std_auth_dom_lag2m',
 'hist_installments_mode_auth_dom_lag2m',
 'hist_installments_nancnt_auth_dom_lag2m',
 'hist_installments_nanperc_auth_dom_lag2m',
 'hist_category_3_nunique_auth_dom_lag2m',
 'hist_category_3_mode_auth_dom_lag2m',
 'hist_category_3_nancnt_auth_dom_lag2m',
 'hist_category_3_nanperc_auth_dom_lag2m',
 'hist_merchant_category

In [19]:
# Remove the merchant_id columns added by the four merchant joins (they repeat the join keys).
df = df.drop(columns = ['merchant_id_hist_merchant_id_mode_auth_dom_lag2m',
                        'merchant_id_hist_merchant_id_mode_auth_nondom_lag2m',
                        'merchant_id_new_merchant_id_mode_dom_all',
                        'merchant_id_new_merchant_id_mode_nondom_all'])
list(df.columns)

['first_active_month',
 'card_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'target',
 'is_outlier',
 'hist_authorized_flag_sum_auth_dom_lag2m',
 'hist_authorized_flag_mean_auth_dom_lag2m',
 'hist_city_id_nunique_auth_dom_lag2m',
 'hist_city_id_mode_auth_dom_lag2m',
 'hist_city_id_nancnt_auth_dom_lag2m',
 'hist_city_id_nanperc_auth_dom_lag2m',
 'hist_category_1_sum_auth_dom_lag2m',
 'hist_category_1_mean_auth_dom_lag2m',
 'hist_installments_sum_auth_dom_lag2m',
 'hist_installments_median_auth_dom_lag2m',
 'hist_installments_mean_auth_dom_lag2m',
 'hist_installments_max_auth_dom_lag2m',
 'hist_installments_min_auth_dom_lag2m',
 'hist_installments_std_auth_dom_lag2m',
 'hist_installments_mode_auth_dom_lag2m',
 'hist_installments_nancnt_auth_dom_lag2m',
 'hist_installments_nanperc_auth_dom_lag2m',
 'hist_category_3_nunique_auth_dom_lag2m',
 'hist_category_3_mode_auth_dom_lag2m',
 'hist_category_3_nancnt_auth_dom_lag2m',
 'hist_category_3_nanperc_auth_dom_lag2m',
 'hist_merchant_category

In [20]:
# Number of rows and columns of the joined table.
df.shape

(201917, 489)

In [21]:
# Shape of the rows flagged as outliers.
df.loc[df['is_outlier'] == 1].shape

(2207, 489)

### Prepare data for CatBoost  
CatBoost can handle categorical features, but missing values (NaN) in categorical columns must first be replaced, for example by a string placeholder.

In [22]:
# Columns treated as categorical. The list holds, in this order:
#   - five columns of the training table,
#   - the seven '*_mode_*' columns of each of the four aggregate tables,
#   - six merchant attributes for each of the four merchant joins.
# The order matters wherever column positions are derived from this list.
categorical_features = ['first_active_month',
                        'card_id',
                        'feature_1',
                        'feature_2',
                        'feature_3',
                        'hist_city_id_mode_auth_dom_lag2m',
                        'hist_category_3_mode_auth_dom_lag2m',
                        'hist_merchant_category_id_mode_auth_dom_lag2m',
                        'hist_merchant_id_mode_auth_dom_lag2m',
                        'hist_category_2_mode_auth_dom_lag2m',
                        'hist_state_id_mode_auth_dom_lag2m',
                        'hist_subsector_id_mode_auth_dom_lag2m',
                        'hist_city_id_mode_auth_nondom_lag2m',
                        'hist_category_3_mode_auth_nondom_lag2m',
                        'hist_merchant_category_id_mode_auth_nondom_lag2m',
                        'hist_merchant_id_mode_auth_nondom_lag2m',
                        'hist_category_2_mode_auth_nondom_lag2m',
                        'hist_state_id_mode_auth_nondom_lag2m',
                        'hist_subsector_id_mode_auth_nondom_lag2m',
                        'new_city_id_mode_dom_all',
                        'new_category_3_mode_dom_all',
                        'new_merchant_category_id_mode_dom_all',
                        'new_merchant_id_mode_dom_all',
                        'new_category_2_mode_dom_all',
                        'new_state_id_mode_dom_all',
                        'new_subsector_id_mode_dom_all',
                        'new_city_id_mode_nondom_all',
                        'new_category_3_mode_nondom_all',
                        'new_merchant_category_id_mode_nondom_all',
                        'new_merchant_id_mode_nondom_all',
                        'new_category_2_mode_nondom_all',
                        'new_state_id_mode_nondom_all',
                        'new_subsector_id_mode_nondom_all',
                        'merchant_group_id_hist_merchant_id_mode_auth_dom_lag2m',
                        'merchant_category_id_hist_merchant_id_mode_auth_dom_lag2m',
                        'subsector_id_hist_merchant_id_mode_auth_dom_lag2m',
                        'city_id_hist_merchant_id_mode_auth_dom_lag2m',
                        'state_id_hist_merchant_id_mode_auth_dom_lag2m',
                        'category_2_hist_merchant_id_mode_auth_dom_lag2m',
                        'merchant_group_id_hist_merchant_id_mode_auth_nondom_lag2m',
                        'merchant_category_id_hist_merchant_id_mode_auth_nondom_lag2m',
                        'subsector_id_hist_merchant_id_mode_auth_nondom_lag2m',
                        'city_id_hist_merchant_id_mode_auth_nondom_lag2m',
                        'state_id_hist_merchant_id_mode_auth_nondom_lag2m',
                        'category_2_hist_merchant_id_mode_auth_nondom_lag2m',
                        'merchant_group_id_new_merchant_id_mode_dom_all',
                        'merchant_category_id_new_merchant_id_mode_dom_all',
                        'subsector_id_new_merchant_id_mode_dom_all',
                        'city_id_new_merchant_id_mode_dom_all',
                        'state_id_new_merchant_id_mode_dom_all',
                        'category_2_new_merchant_id_mode_dom_all',
                        'merchant_group_id_new_merchant_id_mode_nondom_all',
                        'merchant_category_id_new_merchant_id_mode_nondom_all',
                        'subsector_id_new_merchant_id_mode_nondom_all',
                        'city_id_new_merchant_id_mode_nondom_all',
                        'state_id_new_merchant_id_mode_nondom_all',
                        'category_2_new_merchant_id_mode_nondom_all']

In [23]:
# impute string for missing categorical values (catboost can't handle np.nan in categorical features)
df[categorical_features] = df[categorical_features].fillna('nan')

# Replace every categorical column by its integer category codes, computed on the full frame.
# The 'nan' placeholder is a category of its own and therefore gets a regular code as well.
# NOTE(review): after this step the columns are plain integers; they only act as categorical
# features if they are declared as such to the model — confirm where that happens.
for col in categorical_features:
    df[col] = df[col].astype('category').cat.codes

### Split into train and localtest

In [24]:
# Features and regression target. 'is_outlier' stays in X for now: it is used for the
# stratification below and is removed from the feature frames in later cells.
X = df.drop(columns = ['target'])
y = df['target'].copy()

# 80% training set / 20% local test set, stratified jointly on the three card features and the outlier flag
X_train, X_localtest, y_train, y_localtest = train_test_split(
    X, y,
    train_size = 0.8,
    random_state = 1,
    stratify = X[['feature_1', 'feature_2', 'feature_3', 'is_outlier']])



In [25]:
# Shape of the training set and of its outlier rows.
X_train.shape, X_train.loc[X_train['is_outlier'] == 1].shape

((161533, 488), (1765, 488))

In [26]:
# Separate the outlier flag of the local test set from its features.
y_localtest_label = X_localtest['is_outlier'].copy()
X_localtest = X_localtest.drop(columns = 'is_outlier')

In [27]:
def create_resampled_training_set(df = X_train, col = 'is_outlier', seed = 1, majority_label = 0, minority_label = 1,
                                  majority_cnt = 5000, minority_cnt = 5000):
    '''
    Build a resampled training sample by downsampling the majority class and upsampling the minority class.

    df : frame with the features and the label column `col`
         (the default, X_train, is bound when this function is defined).
    col : name of the label column.
    seed : random_state of both resampling steps and of the final shuffle.
    majority_label, minority_label : values of `col` that identify the two classes.
    majority_cnt : number of majority rows, drawn without replacement.
    minority_cnt : number of minority rows, drawn with replacement.

    Returns (features, labels): the shuffled sample without the label column, and the label column.
    '''
    # Separate majority and minority classes
    df_majority = df[df[col] == majority_label]
    df_minority = df[df[col] == minority_label]

    # Downsample majority class
    df_majority_downsampled = resample(df_majority, replace = False, n_samples = majority_cnt, random_state = seed)
    # Upsample minority class
    df_minority_upsampled = resample(df_minority, replace = True, n_samples = minority_cnt, random_state = seed)

    # Combine downsampled majority class with upsampled minority class
    df_resampled = pd.concat([df_majority_downsampled, df_minority_upsampled])

    # Shuffle
    df_resampled = df_resampled.sample(frac = 1, random_state = seed).reset_index(drop = True)

    # Split off the label using `col`, so that a label column other than 'is_outlier' works too.
    df_resampled_label = df_resampled[col].copy()
    df_resampled = df_resampled.drop(col, axis = 1)

    return df_resampled, df_resampled_label

In [28]:
# Build the resampled training set with the default arguments (X_train, 5000 rows per class, seed 1).
X_train_resampled, y_train_resampled_label = create_resampled_training_set()

# Display new class counts
y_train_resampled_label.value_counts()

1    5000
0    5000
Name: is_outlier, dtype: int64

In [29]:
# Preview the resampled feature frame (the label column has been removed).
X_train_resampled.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,hist_authorized_flag_sum_auth_dom_lag2m,hist_authorized_flag_mean_auth_dom_lag2m,hist_city_id_nunique_auth_dom_lag2m,hist_city_id_mode_auth_dom_lag2m,hist_city_id_nancnt_auth_dom_lag2m,...,avg_sales_lag6_new_merchant_id_mode_nondom_all,avg_purchases_lag6_new_merchant_id_mode_nondom_all,active_months_lag6_new_merchant_id_mode_nondom_all,avg_sales_lag12_new_merchant_id_mode_nondom_all,avg_purchases_lag12_new_merchant_id_mode_nondom_all,active_months_lag12_new_merchant_id_mode_nondom_all,category_4_new_merchant_id_mode_nondom_all,city_id_new_merchant_id_mode_nondom_all,state_id_new_merchant_id_mode_nondom_all,category_2_new_merchant_id_mode_nondom_all
0,65,94219,1,0,0,6.0,1.0,3.0,77,0.0,...,,,,,,,,1,1,1
1,69,90372,2,2,1,16.0,1.0,2.0,283,0.0,...,,,,,,,,1,1,1
2,68,50764,1,0,0,,,,306,,...,,,,,,,,1,1,1
3,60,166060,2,0,1,14.0,1.0,3.0,296,0.0,...,,,,,,,,1,1,1
4,70,6395,4,1,1,34.0,1.0,2.0,16,0.0,...,,,,,,,,1,1,1


In [30]:
# First ten labels of the shuffled, resampled training set.
y_train_resampled_label[:10]

0    1
1    0
2    0
3    0
4    1
5    1
6    0
7    0
8    0
9    1
Name: is_outlier, dtype: int8

### Step 1: Classification to find outliers

#### Series of grid searches to tune hyperparameters

In [31]:
# Columns that are handed to CatBoost as ignored features.
ignored_features = ['first_active_month', 'card_id']
# The categorical columns already hold integer codes that were assigned on the full frame `df`.
# They are deliberately not re-encoded here: applying .astype('category').cat.codes to the resampled
# subset would renumber the codes, so that they no longer match the codes in X_localtest.
categorical_features_indices = [X_train_resampled.columns.get_loc(i) for i in categorical_features]
ignored_features_indices = [X_train_resampled.columns.get_loc(i) for i in ignored_features]

def grid_search(params =  {'learning_rate': .99, 'iterations' : 100},
                train_set = X_train_resampled,
                train_label = y_train_resampled_label,
                param_grid = {'border_count' : [16, 32]},
                cv_folds = 5,
                scoring = 'accuracy',
                n_jobs = -1,
                verbose = 10,
                seed = 1):
    '''
    Run one GridSearchCV over param_grid with a CatBoostClassifier.

    params : fixed keyword arguments of the CatBoostClassifier.
    train_set, train_label : features and labels used for the search
        (the defaults are bound when this function is defined).
    param_grid : hyperparameter values to search over.
    cv_folds, scoring, n_jobs, verbose : passed on to GridSearchCV.
    seed : random_seed of the CatBoostClassifier.

    Returns (gs, best_parameters): the fitted GridSearchCV object and its best_params_.

    NOTE(review): no categorical feature indices are passed to the classifier or to fit,
    so the integer-coded categorical columns are presumably treated as numeric — confirm intent.
    '''
    if hasattr(train_set, 'columns'):
        # Locate the ignored columns in the frame that is actually used for training, instead of
        # relying on positions computed for another frame; names that are absent are skipped.
        ignored_idx = [train_set.columns.get_loc(name) for name in ignored_features
                       if name in train_set.columns]
    else:
        # Input without column names: fall back to the positions computed at module level.
        ignored_idx = ignored_features_indices
    clf = CatBoostClassifier(**params, ignored_features = ignored_idx, random_seed = seed)
    gs = GridSearchCV(estimator = clf, param_grid = param_grid, scoring = scoring, cv = cv_folds, n_jobs = n_jobs,
                      verbose = verbose)
    gs.fit(train_set, train_label)
    best_parameters = gs.best_params_
    return gs, best_parameters

In [32]:
# Fixed classifier settings used as the starting point of the search
params = dict(learning_rate = .99, iterations = 1000)
# Candidate values per tunable hyperparameter
params_grid = dict(depth = [10, 8, 6, 4],
                   l2_leaf_reg = [0, 0.01, 0.1, 1, 10, 100])
# I also tried to train with depth=13 and depth=16, but they were too long to train (22 hours and 6.5 days, respectively)

In [33]:
def run_iterative_grid_search(order_of_tuning = [['depth'], ['l2_leaf_reg']],
                              train_set = X_train_resampled,
                              train_label = y_train_resampled_label,
                              categorical_features = categorical_features,
                              ignored_features = ignored_features,
                              params = params,
                              params_grid = params_grid,
                              cv_folds = 5,
                              scoring = 'accuracy',
                              n_jobs = -1,
                              verbose = 10,
                              seed = 1):
    """Tune CatBoost hyperparameters stage by stage with successive grid searches.

    Each entry of `order_of_tuning` is a list of hyperparameter names that are
    searched jointly. The best values of a stage are written into `params`
    before the next stage starts.

    Parameters
    ----------
    order_of_tuning : list of list of str
        Stages to run, in order; every name must be a key of `params_grid`.
    train_set, train_label
        Training features and labels.
    categorical_features : list of str
        Columns of `train_set` that are replaced by integer category codes.
    ignored_features : list of str
        Columns of `train_set` that CatBoost should ignore.
    params : dict
        Fixed CatBoostClassifier keyword arguments.
    params_grid : dict
        Hyperparameter name -> list of candidate values.
    cv_folds, scoring, n_jobs, verbose
        Forwarded to GridSearchCV as cv, scoring, n_jobs and verbose.
    seed : int
        Passed to CatBoostClassifier as random_seed.

    Returns
    -------
    (list of GridSearchCV, dict)
        One fitted search per stage, and `params` with the tuned values merged in.

    Notes
    -----
    * `train_set` and `params` are modified IN PLACE; the returned dict is the
      very object that was passed in.
    * Defaults are bound when this cell is executed, so the default `params`
      and `params_grid` are the notebook-level dict objects themselves:
      mutating those dicts afterwards affects later calls, rebinding the names
      does not.
    * Previously the indices derived from `ignored_features` were computed but
      never handed to the estimator (the notebook-level indices were used
      instead), so the argument had no effect. The estimator is now built here
      with the locally computed indices. With the default arguments the result
      is unchanged.
    """
    # In-place integer encoding of the categorical columns.
    for col in categorical_features:
        train_set[col] = train_set[col].astype('category').cat.codes
    ignored_features_indices = [train_set.columns.get_loc(i) for i in ignored_features]

    grid_searches = []

    for ls in order_of_tuning:
        print('Tuning hyperparameters', ls)
        param_grid = {param: params_grid[param] for param in ls}
        clf = CatBoostClassifier(**params, ignored_features = ignored_features_indices, random_seed = seed)
        gs = GridSearchCV(estimator = clf, param_grid = param_grid, scoring = scoring, cv = cv_folds,
                          n_jobs = n_jobs, verbose = verbose)
        gs.fit(train_set, train_label)
        # Carry the winners of this stage into the following stages.
        params.update(gs.best_params_)
        grid_searches.append(gs)
    return grid_searches, params

In [34]:
# Stage 1: tune tree depth only, then persist the search object and the resulting parameters.
grid_search_depth, param_depth = run_iterative_grid_search(order_of_tuning = [['depth']])
# Context managers make sure the pickle files are flushed and closed.
with open(DIRECTORY + 'grid_search_depth_11_notebook.p', 'wb') as fout:
    pickle.dump(grid_search_depth, fout)
with open(DIRECTORY + 'param_depth_11_notebook.p', 'wb') as fout:
    pickle.dump(param_depth, fout)

Tuning hyperparameters ['depth']
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 78.1min
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed: 544.6min remaining: 816.9min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed: 571.4min remaining: 467.5min
[Parallel(n_jobs=-1)]: Done  14 out of  20 | elapsed: 581.8min remaining: 249.3min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed: 585.5min remaining: 103.3min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 586.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 586.2min finished


0:	learn: 0.4668242	total: 1.66s	remaining: 27m 38s
1:	learn: 0.4223042	total: 3.17s	remaining: 26m 19s
2:	learn: 0.4033777	total: 4.67s	remaining: 25m 50s
3:	learn: 0.3666635	total: 6.2s	remaining: 25m 44s
4:	learn: 0.3664791	total: 6.23s	remaining: 20m 39s
5:	learn: 0.3582000	total: 7.72s	remaining: 21m 19s
6:	learn: 0.3445114	total: 9.25s	remaining: 21m 51s
7:	learn: 0.3286049	total: 10.8s	remaining: 22m 13s
8:	learn: 0.3094895	total: 12.2s	remaining: 22m 28s
9:	learn: 0.2880985	total: 13.8s	remaining: 22m 43s
10:	learn: 0.2826095	total: 15.3s	remaining: 22m 56s
11:	learn: 0.2595693	total: 16.8s	remaining: 23m 5s
12:	learn: 0.2379646	total: 18.4s	remaining: 23m 14s
13:	learn: 0.2331621	total: 19.9s	remaining: 23m 18s
14:	learn: 0.2294934	total: 21.4s	remaining: 23m 22s
15:	learn: 0.2266360	total: 22.9s	remaining: 23m 25s
16:	learn: 0.2210052	total: 24.4s	remaining: 23m 29s
17:	learn: 0.2129244	total: 25.9s	remaining: 23m 32s
18:	learn: 0.2123260	total: 27.4s	remaining: 23m 35s
19:	l

153:	learn: 0.0563096	total: 3m 49s	remaining: 21m 2s
154:	learn: 0.0563093	total: 3m 51s	remaining: 21m
155:	learn: 0.0562490	total: 3m 52s	remaining: 20m 59s
156:	learn: 0.0558459	total: 3m 54s	remaining: 20m 58s
157:	learn: 0.0558316	total: 3m 55s	remaining: 20m 56s
158:	learn: 0.0557856	total: 3m 57s	remaining: 20m 55s
159:	learn: 0.0557575	total: 3m 58s	remaining: 20m 53s
160:	learn: 0.0557432	total: 4m	remaining: 20m 52s
161:	learn: 0.0557289	total: 4m 1s	remaining: 20m 51s
162:	learn: 0.0556933	total: 4m 3s	remaining: 20m 49s
163:	learn: 0.0556733	total: 4m 4s	remaining: 20m 48s
164:	learn: 0.0555914	total: 4m 6s	remaining: 20m 46s
165:	learn: 0.0555897	total: 4m 7s	remaining: 20m 45s
166:	learn: 0.0553974	total: 4m 9s	remaining: 20m 44s
167:	learn: 0.0552028	total: 4m 10s	remaining: 20m 42s
168:	learn: 0.0550462	total: 4m 12s	remaining: 20m 41s
169:	learn: 0.0550317	total: 4m 13s	remaining: 20m 39s
170:	learn: 0.0549415	total: 4m 15s	remaining: 20m 38s
171:	learn: 0.0549375	tot

304:	learn: 0.0346006	total: 7m 37s	remaining: 17m 22s
305:	learn: 0.0345760	total: 7m 39s	remaining: 17m 21s
306:	learn: 0.0345372	total: 7m 40s	remaining: 17m 19s
307:	learn: 0.0345358	total: 7m 42s	remaining: 17m 18s
308:	learn: 0.0344360	total: 7m 43s	remaining: 17m 16s
309:	learn: 0.0343160	total: 7m 45s	remaining: 17m 15s
310:	learn: 0.0340565	total: 7m 46s	remaining: 17m 13s
311:	learn: 0.0339368	total: 7m 48s	remaining: 17m 12s
312:	learn: 0.0339240	total: 7m 49s	remaining: 17m 10s
313:	learn: 0.0339165	total: 7m 51s	remaining: 17m 9s
314:	learn: 0.0336665	total: 7m 52s	remaining: 17m 7s
315:	learn: 0.0334642	total: 7m 54s	remaining: 17m 6s
316:	learn: 0.0334538	total: 7m 55s	remaining: 17m 4s
317:	learn: 0.0333209	total: 7m 57s	remaining: 17m 3s
318:	learn: 0.0332519	total: 7m 58s	remaining: 17m 1s
319:	learn: 0.0331550	total: 8m	remaining: 17m
320:	learn: 0.0331534	total: 8m 1s	remaining: 16m 59s
321:	learn: 0.0331530	total: 8m 3s	remaining: 16m 57s
322:	learn: 0.0329347	tota

454:	learn: 0.0249620	total: 11m 22s	remaining: 13m 37s
455:	learn: 0.0249542	total: 11m 23s	remaining: 13m 35s
456:	learn: 0.0249535	total: 11m 25s	remaining: 13m 34s
457:	learn: 0.0248826	total: 11m 26s	remaining: 13m 32s
458:	learn: 0.0248671	total: 11m 28s	remaining: 13m 31s
459:	learn: 0.0248630	total: 11m 30s	remaining: 13m 30s
460:	learn: 0.0248579	total: 11m 31s	remaining: 13m 28s
461:	learn: 0.0248434	total: 11m 33s	remaining: 13m 27s
462:	learn: 0.0248390	total: 11m 34s	remaining: 13m 25s
463:	learn: 0.0248177	total: 11m 36s	remaining: 13m 24s
464:	learn: 0.0247547	total: 11m 37s	remaining: 13m 22s
465:	learn: 0.0242892	total: 11m 39s	remaining: 13m 21s
466:	learn: 0.0241781	total: 11m 40s	remaining: 13m 19s
467:	learn: 0.0241780	total: 11m 42s	remaining: 13m 18s
468:	learn: 0.0241775	total: 11m 43s	remaining: 13m 16s
469:	learn: 0.0241324	total: 11m 45s	remaining: 13m 15s
470:	learn: 0.0241322	total: 11m 46s	remaining: 13m 13s
471:	learn: 0.0241303	total: 11m 48s	remaining: 

602:	learn: 0.0203571	total: 15m 4s	remaining: 9m 55s
603:	learn: 0.0203571	total: 15m 5s	remaining: 9m 53s
604:	learn: 0.0203571	total: 15m 7s	remaining: 9m 52s
605:	learn: 0.0203559	total: 15m 8s	remaining: 9m 50s
606:	learn: 0.0203524	total: 15m 10s	remaining: 9m 49s
607:	learn: 0.0203293	total: 15m 11s	remaining: 9m 47s
608:	learn: 0.0203288	total: 15m 13s	remaining: 9m 46s
609:	learn: 0.0203287	total: 15m 14s	remaining: 9m 44s
610:	learn: 0.0203224	total: 15m 16s	remaining: 9m 43s
611:	learn: 0.0198145	total: 15m 17s	remaining: 9m 41s
612:	learn: 0.0198055	total: 15m 19s	remaining: 9m 40s
613:	learn: 0.0198024	total: 15m 20s	remaining: 9m 38s
614:	learn: 0.0198022	total: 15m 22s	remaining: 9m 37s
615:	learn: 0.0198020	total: 15m 23s	remaining: 9m 35s
616:	learn: 0.0197861	total: 15m 25s	remaining: 9m 34s
617:	learn: 0.0197810	total: 15m 26s	remaining: 9m 32s
618:	learn: 0.0197694	total: 15m 28s	remaining: 9m 31s
619:	learn: 0.0197668	total: 15m 29s	remaining: 9m 29s
620:	learn: 0.

752:	learn: 0.0157537	total: 18m 50s	remaining: 6m 10s
753:	learn: 0.0157530	total: 18m 51s	remaining: 6m 9s
754:	learn: 0.0157528	total: 18m 53s	remaining: 6m 7s
755:	learn: 0.0157487	total: 18m 54s	remaining: 6m 6s
756:	learn: 0.0157223	total: 18m 56s	remaining: 6m 4s
757:	learn: 0.0156988	total: 18m 57s	remaining: 6m 3s
758:	learn: 0.0156690	total: 18m 59s	remaining: 6m 1s
759:	learn: 0.0156682	total: 19m	remaining: 6m
760:	learn: 0.0156664	total: 19m 2s	remaining: 5m 58s
761:	learn: 0.0155533	total: 19m 4s	remaining: 5m 57s
762:	learn: 0.0155428	total: 19m 5s	remaining: 5m 55s
763:	learn: 0.0155427	total: 19m 7s	remaining: 5m 54s
764:	learn: 0.0155139	total: 19m 8s	remaining: 5m 52s
765:	learn: 0.0155091	total: 19m 10s	remaining: 5m 51s
766:	learn: 0.0155090	total: 19m 11s	remaining: 5m 49s
767:	learn: 0.0155079	total: 19m 13s	remaining: 5m 48s
768:	learn: 0.0155036	total: 19m 14s	remaining: 5m 46s
769:	learn: 0.0155036	total: 19m 16s	remaining: 5m 45s
770:	learn: 0.0155017	total: 

903:	learn: 0.0138936	total: 22m 38s	remaining: 2m 24s
904:	learn: 0.0138936	total: 22m 39s	remaining: 2m 22s
905:	learn: 0.0138920	total: 22m 41s	remaining: 2m 21s
906:	learn: 0.0138912	total: 22m 42s	remaining: 2m 19s
907:	learn: 0.0138911	total: 22m 44s	remaining: 2m 18s
908:	learn: 0.0138908	total: 22m 45s	remaining: 2m 16s
909:	learn: 0.0138866	total: 22m 47s	remaining: 2m 15s
910:	learn: 0.0138782	total: 22m 48s	remaining: 2m 13s
911:	learn: 0.0138324	total: 22m 50s	remaining: 2m 12s
912:	learn: 0.0138323	total: 22m 51s	remaining: 2m 10s
913:	learn: 0.0138323	total: 22m 53s	remaining: 2m 9s
914:	learn: 0.0138323	total: 22m 54s	remaining: 2m 7s
915:	learn: 0.0138322	total: 22m 56s	remaining: 2m 6s
916:	learn: 0.0138322	total: 22m 57s	remaining: 2m 4s
917:	learn: 0.0138322	total: 22m 59s	remaining: 2m 3s
918:	learn: 0.0138322	total: 23m	remaining: 2m 1s
919:	learn: 0.0138318	total: 23m 2s	remaining: 2m
920:	learn: 0.0138318	total: 23m 3s	remaining: 1m 58s
921:	learn: 0.0138315	tota

In [36]:
# Cross-validation results of the depth search (the returned list holds one GridSearchCV per tuning stage).
grid_search_depth[0].cv_results_



{'mean_fit_time': array([35125.58761344, 13963.97145429,  6222.39387894,   684.65190063]),
 'std_fit_time': array([   24.23979584, 11443.64122879,  9612.49847428,   138.23273707]),
 'mean_score_time': array([0.47124877, 3.47578225, 2.80326657, 1.39029899]),
 'std_score_time': array([0.23544497, 1.38793642, 1.76570763, 0.45425595]),
 'param_depth': masked_array(data=[10, 8, 6, 4],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'depth': 10}, {'depth': 8}, {'depth': 6}, {'depth': 4}],
 'split0_test_score': array([0.903, 0.899, 0.9  , 0.889]),
 'split1_test_score': array([0.9105, 0.905 , 0.902 , 0.898 ]),
 'split2_test_score': array([0.9085, 0.9005, 0.902 , 0.8955]),
 'split3_test_score': array([0.9145, 0.902 , 0.905 , 0.8945]),
 'split4_test_score': array([0.911 , 0.9105, 0.8995, 0.899 ]),
 'mean_test_score': array([0.9095, 0.9034, 0.9017, 0.8952]),
 'std_test_score': array([0.00378153, 0.00406694, 0.00193907, 0.00350143])

In [38]:
# Fix depth at 10, the candidate with the highest mean CV accuracy in the depth search.
params['depth'] = 10
# Stage 2: tune the L2 leaf regularization, then persist the results.
grid_search_l2, param_l2 = run_iterative_grid_search(order_of_tuning = [['l2_leaf_reg']])
# Context managers make sure the pickle files are flushed and closed.
with open(DIRECTORY + 'grid_search_l2_11_notebook.p', 'wb') as fout:
    pickle.dump(grid_search_l2, fout)
with open(DIRECTORY + 'param_l2_11_notebook.p', 'wb') as fout:
    pickle.dump(param_l2, fout)

Tuning hyperparameters ['l2_leaf_reg']
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed: 319.2min remaining: 184.8min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 331.7min remaining: 100.9min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed: 409.0min remaining: 45.4min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 413.3min finished


0:	learn: 0.4922453	total: 1.34s	remaining: 22m 17s
1:	learn: 0.4715111	total: 2.06s	remaining: 17m 9s
2:	learn: 0.4585728	total: 3.55s	remaining: 19m 39s
3:	learn: 0.4543453	total: 3.94s	remaining: 16m 22s
4:	learn: 0.4389661	total: 5.45s	remaining: 18m 4s
5:	learn: 0.4221483	total: 6.98s	remaining: 19m 16s
6:	learn: 0.4173186	total: 7.39s	remaining: 17m 27s
7:	learn: 0.3969933	total: 8.92s	remaining: 18m 26s
8:	learn: 0.3864164	total: 10.4s	remaining: 19m 10s
9:	learn: 0.3793591	total: 10.9s	remaining: 17m 54s
10:	learn: 0.3638443	total: 12.4s	remaining: 18m 34s
11:	learn: 0.3573821	total: 13.9s	remaining: 19m 6s
12:	learn: 0.3477239	total: 15.4s	remaining: 19m 31s
13:	learn: 0.3459232	total: 16.9s	remaining: 19m 53s
14:	learn: 0.3370604	total: 18.5s	remaining: 20m 13s
15:	learn: 0.3312436	total: 20s	remaining: 20m 28s
16:	learn: 0.3296781	total: 21.5s	remaining: 20m 41s
17:	learn: 0.3186097	total: 23s	remaining: 20m 55s
18:	learn: 0.3143620	total: 24.5s	remaining: 21m 6s
19:	learn: 

153:	learn: 0.0656073	total: 3m 48s	remaining: 20m 56s
154:	learn: 0.0650178	total: 3m 50s	remaining: 20m 55s
155:	learn: 0.0647162	total: 3m 51s	remaining: 20m 54s
156:	learn: 0.0639973	total: 3m 53s	remaining: 20m 52s
157:	learn: 0.0639686	total: 3m 54s	remaining: 20m 51s
158:	learn: 0.0636551	total: 3m 56s	remaining: 20m 50s
159:	learn: 0.0629760	total: 3m 57s	remaining: 20m 48s
160:	learn: 0.0621349	total: 3m 59s	remaining: 20m 47s
161:	learn: 0.0615872	total: 4m	remaining: 20m 46s
162:	learn: 0.0613787	total: 4m 2s	remaining: 20m 44s
163:	learn: 0.0607508	total: 4m 3s	remaining: 20m 43s
164:	learn: 0.0600640	total: 4m 5s	remaining: 20m 42s
165:	learn: 0.0598305	total: 4m 6s	remaining: 20m 40s
166:	learn: 0.0594497	total: 4m 8s	remaining: 20m 39s
167:	learn: 0.0591740	total: 4m 9s	remaining: 20m 38s
168:	learn: 0.0589661	total: 4m 11s	remaining: 20m 36s
169:	learn: 0.0581392	total: 4m 12s	remaining: 20m 35s
170:	learn: 0.0575712	total: 4m 14s	remaining: 20m 33s
171:	learn: 0.057064

304:	learn: 0.0281684	total: 7m 37s	remaining: 17m 22s
305:	learn: 0.0281485	total: 7m 38s	remaining: 17m 20s
306:	learn: 0.0280839	total: 7m 40s	remaining: 17m 19s
307:	learn: 0.0279376	total: 7m 41s	remaining: 17m 17s
308:	learn: 0.0278666	total: 7m 43s	remaining: 17m 16s
309:	learn: 0.0278496	total: 7m 45s	remaining: 17m 15s
310:	learn: 0.0278377	total: 7m 46s	remaining: 17m 13s
311:	learn: 0.0277837	total: 7m 48s	remaining: 17m 12s
312:	learn: 0.0277663	total: 7m 49s	remaining: 17m 10s
313:	learn: 0.0274917	total: 7m 51s	remaining: 17m 9s
314:	learn: 0.0274601	total: 7m 52s	remaining: 17m 7s
315:	learn: 0.0273970	total: 7m 54s	remaining: 17m 6s
316:	learn: 0.0273036	total: 7m 55s	remaining: 17m 4s
317:	learn: 0.0271673	total: 7m 57s	remaining: 17m 3s
318:	learn: 0.0268413	total: 7m 58s	remaining: 17m 1s
319:	learn: 0.0266577	total: 8m	remaining: 17m
320:	learn: 0.0264436	total: 8m 1s	remaining: 16m 58s
321:	learn: 0.0262938	total: 8m 3s	remaining: 16m 57s
322:	learn: 0.0261241	tota

454:	learn: 0.0171901	total: 11m 24s	remaining: 13m 39s
455:	learn: 0.0171301	total: 11m 25s	remaining: 13m 37s
456:	learn: 0.0170403	total: 11m 27s	remaining: 13m 36s
457:	learn: 0.0169781	total: 11m 28s	remaining: 13m 34s
458:	learn: 0.0169725	total: 11m 30s	remaining: 13m 33s
459:	learn: 0.0169432	total: 11m 31s	remaining: 13m 31s
460:	learn: 0.0167963	total: 11m 33s	remaining: 13m 30s
461:	learn: 0.0167717	total: 11m 34s	remaining: 13m 28s
462:	learn: 0.0167275	total: 11m 36s	remaining: 13m 27s
463:	learn: 0.0166976	total: 11m 37s	remaining: 13m 25s
464:	learn: 0.0166829	total: 11m 39s	remaining: 13m 24s
465:	learn: 0.0166634	total: 11m 40s	remaining: 13m 22s
466:	learn: 0.0166427	total: 11m 42s	remaining: 13m 21s
467:	learn: 0.0166304	total: 11m 43s	remaining: 13m 19s
468:	learn: 0.0165861	total: 11m 45s	remaining: 13m 18s
469:	learn: 0.0165211	total: 11m 46s	remaining: 13m 16s
470:	learn: 0.0164605	total: 11m 48s	remaining: 13m 15s
471:	learn: 0.0164498	total: 11m 49s	remaining: 

602:	learn: 0.0128150	total: 15m 7s	remaining: 9m 57s
603:	learn: 0.0128080	total: 15m 8s	remaining: 9m 55s
604:	learn: 0.0127950	total: 15m 10s	remaining: 9m 54s
605:	learn: 0.0127817	total: 15m 11s	remaining: 9m 52s
606:	learn: 0.0127583	total: 15m 13s	remaining: 9m 51s
607:	learn: 0.0126816	total: 15m 14s	remaining: 9m 49s
608:	learn: 0.0126544	total: 15m 16s	remaining: 9m 48s
609:	learn: 0.0126387	total: 15m 17s	remaining: 9m 46s
610:	learn: 0.0126091	total: 15m 19s	remaining: 9m 45s
611:	learn: 0.0125845	total: 15m 20s	remaining: 9m 43s
612:	learn: 0.0125493	total: 15m 22s	remaining: 9m 42s
613:	learn: 0.0125357	total: 15m 23s	remaining: 9m 40s
614:	learn: 0.0124981	total: 15m 25s	remaining: 9m 39s
615:	learn: 0.0124856	total: 15m 26s	remaining: 9m 37s
616:	learn: 0.0124560	total: 15m 28s	remaining: 9m 36s
617:	learn: 0.0124087	total: 15m 29s	remaining: 9m 34s
618:	learn: 0.0123750	total: 15m 31s	remaining: 9m 33s
619:	learn: 0.0123568	total: 15m 32s	remaining: 9m 31s
620:	learn: 

753:	learn: 0.0101135	total: 18m 55s	remaining: 6m 10s
754:	learn: 0.0100845	total: 18m 57s	remaining: 6m 9s
755:	learn: 0.0100668	total: 18m 58s	remaining: 6m 7s
756:	learn: 0.0100656	total: 19m	remaining: 6m 6s
757:	learn: 0.0100373	total: 19m 1s	remaining: 6m 4s
758:	learn: 0.0100339	total: 19m 3s	remaining: 6m 2s
759:	learn: 0.0100316	total: 19m 4s	remaining: 6m 1s
760:	learn: 0.0100293	total: 19m 6s	remaining: 5m 59s
761:	learn: 0.0100257	total: 19m 7s	remaining: 5m 58s
762:	learn: 0.0100242	total: 19m 9s	remaining: 5m 56s
763:	learn: 0.0100204	total: 19m 10s	remaining: 5m 55s
764:	learn: 0.0100130	total: 19m 12s	remaining: 5m 53s
765:	learn: 0.0099992	total: 19m 13s	remaining: 5m 52s
766:	learn: 0.0099905	total: 19m 15s	remaining: 5m 50s
767:	learn: 0.0099829	total: 19m 16s	remaining: 5m 49s
768:	learn: 0.0099781	total: 19m 18s	remaining: 5m 47s
769:	learn: 0.0099423	total: 19m 19s	remaining: 5m 46s
770:	learn: 0.0099241	total: 19m 21s	remaining: 5m 44s
771:	learn: 0.0099142	tota

904:	learn: 0.0084912	total: 22m 43s	remaining: 2m 23s
905:	learn: 0.0084756	total: 22m 45s	remaining: 2m 21s
906:	learn: 0.0084693	total: 22m 46s	remaining: 2m 20s
907:	learn: 0.0084683	total: 22m 48s	remaining: 2m 18s
908:	learn: 0.0084656	total: 22m 49s	remaining: 2m 17s
909:	learn: 0.0084619	total: 22m 51s	remaining: 2m 15s
910:	learn: 0.0084408	total: 22m 52s	remaining: 2m 14s
911:	learn: 0.0084332	total: 22m 54s	remaining: 2m 12s
912:	learn: 0.0084257	total: 22m 55s	remaining: 2m 11s
913:	learn: 0.0084250	total: 22m 57s	remaining: 2m 9s
914:	learn: 0.0084170	total: 22m 59s	remaining: 2m 8s
915:	learn: 0.0084116	total: 23m	remaining: 2m 6s
916:	learn: 0.0084102	total: 23m 2s	remaining: 2m 5s
917:	learn: 0.0083939	total: 23m 3s	remaining: 2m 3s
918:	learn: 0.0083879	total: 23m 5s	remaining: 2m 2s
919:	learn: 0.0083849	total: 23m 6s	remaining: 2m
920:	learn: 0.0083709	total: 23m 8s	remaining: 1m 59s
921:	learn: 0.0083664	total: 23m 9s	remaining: 1m 57s
922:	learn: 0.0083599	total: 2

In [39]:
# Cross-validation results of the first l2_leaf_reg search.
grid_search_l2[0].cv_results_



{'mean_fit_time': array([  11.21161628,  330.28494601, 9792.41811857, 9697.86227226,
        9838.81194921, 6018.86084633]),
 'std_fit_time': array([   3.50930959,  259.58807111,  110.70601047,  130.46693104,
          53.707061  , 1798.07170734]),
 'mean_score_time': array([0.70608606, 2.00496793, 1.54483547, 1.65584464, 1.43694491,
        0.60830202]),
 'std_score_time': array([0.41252142, 0.68560833, 0.25312812, 0.13455808, 0.18985422,
        0.39185224]),
 'param_l2_leaf_reg': masked_array(data=[0, 0.01, 0.1, 1, 10, 100],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'l2_leaf_reg': 0},
  {'l2_leaf_reg': 0.01},
  {'l2_leaf_reg': 0.1},
  {'l2_leaf_reg': 1},
  {'l2_leaf_reg': 10},
  {'l2_leaf_reg': 100}],
 'split0_test_score': array([0.777, 0.865, 0.909, 0.913, 0.906, 0.911]),
 'split1_test_score': array([0.7815, 0.8505, 0.9045, 0.9005, 0.907 , 0.9145]),
 'split2_test_score': array([0.7785, 0.85  , 0.9

In [41]:
# Second l2_leaf_reg search over larger values.
# The grid is changed by mutating `params_grid` in place: run_iterative_grid_search
# captured this dict object as its default, so rebinding the name would have no effect.
params_grid['l2_leaf_reg'] = [30, 300, 1000]
grid_search_l2_2, param_l2_2 = run_iterative_grid_search(order_of_tuning = [['l2_leaf_reg']])
# Context managers make sure the pickle files are flushed and closed.
with open(DIRECTORY + 'grid_search_l2_2_11_notebook.p', 'wb') as fout:
    pickle.dump(grid_search_l2_2, fout)
with open(DIRECTORY + 'param_l2_2_11_notebook.p', 'wb') as fout:
    pickle.dump(param_l2_2, fout)

Tuning hyperparameters ['l2_leaf_reg']
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed: 166.4min remaining: 1081.6min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed: 167.2min remaining: 459.8min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed: 167.9min remaining: 251.9min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 168.0min remaining: 147.0min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 314.0min remaining: 157.0min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 314.5min remaining: 78.6min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 315.0min finished


0:	learn: 0.5233496	total: 42.5ms	remaining: 42.5s
1:	learn: 0.4965872	total: 1.49s	remaining: 12m 24s
2:	learn: 0.4867220	total: 1.54s	remaining: 8m 30s
3:	learn: 0.4764542	total: 2.96s	remaining: 12m 18s
4:	learn: 0.4695253	total: 4.38s	remaining: 14m 31s
5:	learn: 0.4662057	total: 4.58s	remaining: 12m 39s
6:	learn: 0.4636369	total: 4.98s	remaining: 11m 46s
7:	learn: 0.4605840	total: 6.46s	remaining: 13m 20s
8:	learn: 0.4552649	total: 7.99s	remaining: 14m 40s
9:	learn: 0.4524157	total: 9.48s	remaining: 15m 39s
10:	learn: 0.4473505	total: 11s	remaining: 16m 28s
11:	learn: 0.4421651	total: 12.5s	remaining: 17m 11s
12:	learn: 0.4375953	total: 14s	remaining: 17m 45s
13:	learn: 0.4328943	total: 15.6s	remaining: 18m 17s
14:	learn: 0.4308579	total: 17.1s	remaining: 18m 40s
15:	learn: 0.4256293	total: 18.6s	remaining: 19m 4s
16:	learn: 0.4209707	total: 20.2s	remaining: 19m 26s
17:	learn: 0.4153446	total: 21.7s	remaining: 19m 44s
18:	learn: 0.4133372	total: 23.3s	remaining: 20m
19:	learn: 0.4

154:	learn: 0.1997243	total: 3m 49s	remaining: 20m 53s
155:	learn: 0.1984219	total: 3m 51s	remaining: 20m 51s
156:	learn: 0.1974683	total: 3m 52s	remaining: 20m 50s
157:	learn: 0.1963508	total: 3m 54s	remaining: 20m 49s
158:	learn: 0.1952339	total: 3m 56s	remaining: 20m 48s
159:	learn: 0.1948368	total: 3m 57s	remaining: 20m 47s
160:	learn: 0.1941859	total: 3m 59s	remaining: 20m 45s
161:	learn: 0.1927657	total: 4m	remaining: 20m 44s
162:	learn: 0.1920414	total: 4m 2s	remaining: 20m 43s
163:	learn: 0.1917504	total: 4m 3s	remaining: 20m 41s
164:	learn: 0.1915348	total: 4m 5s	remaining: 20m 40s
165:	learn: 0.1909799	total: 4m 6s	remaining: 20m 38s
166:	learn: 0.1900635	total: 4m 8s	remaining: 20m 37s
167:	learn: 0.1892103	total: 4m 9s	remaining: 20m 36s
168:	learn: 0.1883362	total: 4m 11s	remaining: 20m 35s
169:	learn: 0.1875722	total: 4m 12s	remaining: 20m 33s
170:	learn: 0.1873297	total: 4m 14s	remaining: 20m 32s
171:	learn: 0.1869635	total: 4m 15s	remaining: 20m 30s
172:	learn: 0.186576

305:	learn: 0.1271970	total: 8m 21s	remaining: 18m 58s
306:	learn: 0.1270266	total: 8m 23s	remaining: 18m 56s
307:	learn: 0.1270043	total: 8m 24s	remaining: 18m 54s
308:	learn: 0.1264515	total: 8m 26s	remaining: 18m 52s
309:	learn: 0.1258843	total: 8m 28s	remaining: 18m 50s
310:	learn: 0.1253839	total: 8m 29s	remaining: 18m 49s
311:	learn: 0.1250004	total: 8m 31s	remaining: 18m 47s
312:	learn: 0.1244524	total: 8m 32s	remaining: 18m 45s
313:	learn: 0.1243623	total: 8m 34s	remaining: 18m 43s
314:	learn: 0.1242816	total: 8m 35s	remaining: 18m 41s
315:	learn: 0.1242546	total: 8m 37s	remaining: 18m 39s
316:	learn: 0.1239086	total: 8m 38s	remaining: 18m 38s
317:	learn: 0.1235834	total: 8m 40s	remaining: 18m 36s
318:	learn: 0.1234718	total: 8m 41s	remaining: 18m 34s
319:	learn: 0.1234405	total: 8m 43s	remaining: 18m 32s
320:	learn: 0.1229789	total: 8m 45s	remaining: 18m 30s
321:	learn: 0.1226436	total: 8m 46s	remaining: 18m 28s
322:	learn: 0.1224090	total: 8m 48s	remaining: 18m 26s
323:	learn

454:	learn: 0.0910308	total: 12m 24s	remaining: 14m 51s
455:	learn: 0.0910117	total: 12m 26s	remaining: 14m 50s
456:	learn: 0.0909545	total: 12m 27s	remaining: 14m 48s
457:	learn: 0.0909437	total: 12m 29s	remaining: 14m 46s
458:	learn: 0.0907774	total: 12m 30s	remaining: 14m 45s
459:	learn: 0.0905565	total: 12m 32s	remaining: 14m 43s
460:	learn: 0.0904887	total: 12m 34s	remaining: 14m 41s
461:	learn: 0.0902096	total: 12m 35s	remaining: 14m 39s
462:	learn: 0.0899381	total: 12m 37s	remaining: 14m 38s
463:	learn: 0.0899009	total: 12m 38s	remaining: 14m 36s
464:	learn: 0.0898102	total: 12m 40s	remaining: 14m 34s
465:	learn: 0.0896751	total: 12m 41s	remaining: 14m 32s
466:	learn: 0.0894917	total: 12m 43s	remaining: 14m 31s
467:	learn: 0.0893239	total: 12m 44s	remaining: 14m 29s
468:	learn: 0.0890702	total: 12m 46s	remaining: 14m 27s
469:	learn: 0.0887850	total: 12m 47s	remaining: 14m 25s
470:	learn: 0.0885459	total: 12m 49s	remaining: 14m 24s
471:	learn: 0.0882483	total: 12m 51s	remaining: 

602:	learn: 0.0698408	total: 16m 21s	remaining: 10m 45s
603:	learn: 0.0696184	total: 16m 22s	remaining: 10m 44s
604:	learn: 0.0693935	total: 16m 24s	remaining: 10m 42s
605:	learn: 0.0691888	total: 16m 26s	remaining: 10m 41s
606:	learn: 0.0691231	total: 16m 28s	remaining: 10m 39s
607:	learn: 0.0690351	total: 16m 30s	remaining: 10m 38s
608:	learn: 0.0687980	total: 16m 31s	remaining: 10m 36s
609:	learn: 0.0687079	total: 16m 33s	remaining: 10m 35s
610:	learn: 0.0686845	total: 16m 35s	remaining: 10m 33s
611:	learn: 0.0686105	total: 16m 37s	remaining: 10m 32s
612:	learn: 0.0684566	total: 16m 38s	remaining: 10m 30s
613:	learn: 0.0683199	total: 16m 40s	remaining: 10m 29s
614:	learn: 0.0682099	total: 16m 42s	remaining: 10m 27s
615:	learn: 0.0680437	total: 16m 44s	remaining: 10m 26s
616:	learn: 0.0679289	total: 16m 46s	remaining: 10m 24s
617:	learn: 0.0677740	total: 16m 47s	remaining: 10m 23s
618:	learn: 0.0676979	total: 16m 49s	remaining: 10m 21s
619:	learn: 0.0675152	total: 16m 51s	remaining: 

752:	learn: 0.0530233	total: 20m 38s	remaining: 6m 46s
753:	learn: 0.0529113	total: 20m 40s	remaining: 6m 44s
754:	learn: 0.0527988	total: 20m 42s	remaining: 6m 43s
755:	learn: 0.0526663	total: 20m 43s	remaining: 6m 41s
756:	learn: 0.0525947	total: 20m 45s	remaining: 6m 39s
757:	learn: 0.0524262	total: 20m 47s	remaining: 6m 38s
758:	learn: 0.0523714	total: 20m 49s	remaining: 6m 36s
759:	learn: 0.0522951	total: 20m 51s	remaining: 6m 35s
760:	learn: 0.0522231	total: 20m 52s	remaining: 6m 33s
761:	learn: 0.0521679	total: 20m 54s	remaining: 6m 31s
762:	learn: 0.0520493	total: 20m 56s	remaining: 6m 30s
763:	learn: 0.0519579	total: 20m 58s	remaining: 6m 28s
764:	learn: 0.0519220	total: 20m 59s	remaining: 6m 26s
765:	learn: 0.0518804	total: 21m 1s	remaining: 6m 25s
766:	learn: 0.0517525	total: 21m 3s	remaining: 6m 23s
767:	learn: 0.0516528	total: 21m 4s	remaining: 6m 22s
768:	learn: 0.0516010	total: 21m 6s	remaining: 6m 20s
769:	learn: 0.0515750	total: 21m 8s	remaining: 6m 18s
770:	learn: 0.0

902:	learn: 0.0422286	total: 24m 53s	remaining: 2m 40s
903:	learn: 0.0421500	total: 24m 55s	remaining: 2m 38s
904:	learn: 0.0421057	total: 24m 57s	remaining: 2m 37s
905:	learn: 0.0420443	total: 24m 58s	remaining: 2m 35s
906:	learn: 0.0419899	total: 25m	remaining: 2m 33s
907:	learn: 0.0419060	total: 25m 2s	remaining: 2m 32s
908:	learn: 0.0418717	total: 25m 3s	remaining: 2m 30s
909:	learn: 0.0418157	total: 25m 5s	remaining: 2m 28s
910:	learn: 0.0417217	total: 25m 7s	remaining: 2m 27s
911:	learn: 0.0416274	total: 25m 8s	remaining: 2m 25s
912:	learn: 0.0415879	total: 25m 10s	remaining: 2m 23s
913:	learn: 0.0415269	total: 25m 12s	remaining: 2m 22s
914:	learn: 0.0415199	total: 25m 13s	remaining: 2m 20s
915:	learn: 0.0415109	total: 25m 15s	remaining: 2m 18s
916:	learn: 0.0415015	total: 25m 17s	remaining: 2m 17s
917:	learn: 0.0414955	total: 25m 18s	remaining: 2m 15s
918:	learn: 0.0414825	total: 25m 20s	remaining: 2m 14s
919:	learn: 0.0414536	total: 25m 22s	remaining: 2m 12s
920:	learn: 0.04139

In [42]:
# Cross-validation results of the second l2_leaf_reg search (candidates 30, 300, 1000).
grid_search_l2_2[0].cv_results_



{'mean_fit_time': array([9986.36949973, 9539.40908203, 8824.48804617]),
 'std_fit_time': array([ 54.25546104, 601.60430871,  14.76270822]),
 'mean_score_time': array([2.23697448, 1.16527019, 0.68199329]),
 'std_score_time': array([1.08098642, 0.2108308 , 0.41217335]),
 'param_l2_leaf_reg': masked_array(data=[30, 300, 1000],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'l2_leaf_reg': 30}, {'l2_leaf_reg': 300}, {'l2_leaf_reg': 1000}],
 'split0_test_score': array([0.9155, 0.909 , 0.9155]),
 'split1_test_score': array([0.906 , 0.9195, 0.9225]),
 'split2_test_score': array([0.907 , 0.9145, 0.917 ]),
 'split3_test_score': array([0.915 , 0.9215, 0.9165]),
 'split4_test_score': array([0.919, 0.933, 0.931]),
 'mean_test_score': array([0.9125, 0.9195, 0.9205]),
 'std_test_score': array([0.00509902, 0.00801873, 0.00578792]),
 'rank_test_score': array([3, 2, 1]),
 'split0_train_score': array([0.998375, 0.99825 , 0.9965  ]),
 'split1_tra

In [43]:
# Third l2_leaf_reg search: the largest candidate (1000) ranked first in the previous search,
# so the range is pushed further up.
# The grid is changed by mutating `params_grid` in place: run_iterative_grid_search
# captured this dict object as its default, so rebinding the name would have no effect.
params_grid['l2_leaf_reg'] = [3000, 10000, 30000]
grid_search_l2_3, param_l2_3 = run_iterative_grid_search(order_of_tuning = [['l2_leaf_reg']])
# Context managers make sure the pickle files are flushed and closed.
with open(DIRECTORY + 'grid_search_l2_3_11_notebook.p', 'wb') as fout:
    pickle.dump(grid_search_l2_3, fout)
with open(DIRECTORY + 'param_l2_3_11_notebook.p', 'wb') as fout:
    pickle.dump(param_l2_3, fout)

Tuning hyperparameters ['l2_leaf_reg']
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed: 166.6min remaining: 1082.6min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed: 167.4min remaining: 460.4min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed: 167.7min remaining: 251.6min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 168.7min remaining: 147.6min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 312.6min remaining: 156.3min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 312.8min remaining: 78.2min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 313.1min finished


0:	learn: 0.5304054	total: 57ms	remaining: 56.9s
1:	learn: 0.5065808	total: 503ms	remaining: 4m 11s
2:	learn: 0.4944137	total: 657ms	remaining: 3m 38s
3:	learn: 0.4868789	total: 2.26s	remaining: 9m 22s
4:	learn: 0.4815375	total: 3.1s	remaining: 10m 17s
5:	learn: 0.4768825	total: 3.98s	remaining: 10m 59s
6:	learn: 0.4748157	total: 4.25s	remaining: 10m 2s
7:	learn: 0.4728533	total: 5.81s	remaining: 11m 59s
8:	learn: 0.4698699	total: 7.51s	remaining: 13m 46s
9:	learn: 0.4669102	total: 9.16s	remaining: 15m 6s
10:	learn: 0.4633182	total: 10.8s	remaining: 16m 11s
11:	learn: 0.4616136	total: 11.1s	remaining: 15m 9s
12:	learn: 0.4596531	total: 12.7s	remaining: 16m 3s
13:	learn: 0.4569290	total: 14.3s	remaining: 16m 50s
14:	learn: 0.4544822	total: 16s	remaining: 17m 29s
15:	learn: 0.4524049	total: 17.7s	remaining: 18m 6s
16:	learn: 0.4508382	total: 19.3s	remaining: 18m 36s
17:	learn: 0.4493048	total: 20.9s	remaining: 19m 2s
18:	learn: 0.4476231	total: 22.6s	remaining: 19m 27s
19:	learn: 0.44586

154:	learn: 0.3086298	total: 4m 13s	remaining: 23m 3s
155:	learn: 0.3078359	total: 4m 15s	remaining: 23m 1s
156:	learn: 0.3073613	total: 4m 16s	remaining: 22m 59s
157:	learn: 0.3066408	total: 4m 18s	remaining: 22m 56s
158:	learn: 0.3063393	total: 4m 19s	remaining: 22m 54s
159:	learn: 0.3054939	total: 4m 21s	remaining: 22m 52s
160:	learn: 0.3051143	total: 4m 22s	remaining: 22m 50s
161:	learn: 0.3044558	total: 4m 24s	remaining: 22m 48s
162:	learn: 0.3036235	total: 4m 26s	remaining: 22m 45s
163:	learn: 0.3031342	total: 4m 27s	remaining: 22m 43s
164:	learn: 0.3029756	total: 4m 29s	remaining: 22m 41s
165:	learn: 0.3021784	total: 4m 30s	remaining: 22m 39s
166:	learn: 0.3013377	total: 4m 32s	remaining: 22m 37s
167:	learn: 0.3008916	total: 4m 33s	remaining: 22m 35s
168:	learn: 0.3007918	total: 4m 35s	remaining: 22m 32s
169:	learn: 0.2999691	total: 4m 36s	remaining: 22m 30s
170:	learn: 0.2995718	total: 4m 38s	remaining: 22m 28s
171:	learn: 0.2993900	total: 4m 39s	remaining: 22m 26s
172:	learn: 

305:	learn: 0.2352230	total: 8m 4s	remaining: 18m 18s
306:	learn: 0.2347954	total: 8m 5s	remaining: 18m 16s
307:	learn: 0.2344134	total: 8m 7s	remaining: 18m 14s
308:	learn: 0.2343264	total: 8m 8s	remaining: 18m 12s
309:	learn: 0.2338715	total: 8m 10s	remaining: 18m 11s
310:	learn: 0.2332638	total: 8m 11s	remaining: 18m 9s
311:	learn: 0.2331935	total: 8m 13s	remaining: 18m 7s
312:	learn: 0.2325574	total: 8m 14s	remaining: 18m 6s
313:	learn: 0.2323351	total: 8m 16s	remaining: 18m 4s
314:	learn: 0.2319269	total: 8m 17s	remaining: 18m 2s
315:	learn: 0.2313856	total: 8m 19s	remaining: 18m
316:	learn: 0.2309008	total: 8m 20s	remaining: 17m 59s
317:	learn: 0.2308292	total: 8m 22s	remaining: 17m 57s
318:	learn: 0.2302765	total: 8m 23s	remaining: 17m 55s
319:	learn: 0.2300196	total: 8m 25s	remaining: 17m 54s
320:	learn: 0.2299192	total: 8m 26s	remaining: 17m 52s
321:	learn: 0.2294030	total: 8m 28s	remaining: 17m 50s
322:	learn: 0.2293010	total: 8m 29s	remaining: 17m 48s
323:	learn: 0.2288416	t

454:	learn: 0.1905608	total: 11m 54s	remaining: 14m 16s
455:	learn: 0.1902036	total: 11m 56s	remaining: 14m 14s
456:	learn: 0.1899192	total: 11m 58s	remaining: 14m 13s
457:	learn: 0.1897456	total: 11m 59s	remaining: 14m 11s
458:	learn: 0.1894086	total: 12m 1s	remaining: 14m 9s
459:	learn: 0.1891153	total: 12m 2s	remaining: 14m 8s
460:	learn: 0.1890379	total: 12m 4s	remaining: 14m 6s
461:	learn: 0.1886451	total: 12m 5s	remaining: 14m 5s
462:	learn: 0.1882379	total: 12m 7s	remaining: 14m 3s
463:	learn: 0.1880260	total: 12m 8s	remaining: 14m 1s
464:	learn: 0.1878976	total: 12m 10s	remaining: 14m
465:	learn: 0.1875111	total: 12m 11s	remaining: 13m 58s
466:	learn: 0.1870993	total: 12m 13s	remaining: 13m 57s
467:	learn: 0.1868050	total: 12m 15s	remaining: 13m 55s
468:	learn: 0.1865380	total: 12m 16s	remaining: 13m 53s
469:	learn: 0.1862281	total: 12m 18s	remaining: 13m 52s
470:	learn: 0.1860688	total: 12m 19s	remaining: 13m 50s
471:	learn: 0.1858738	total: 12m 21s	remaining: 13m 49s
472:	lea

602:	learn: 0.1605150	total: 15m 42s	remaining: 10m 20s
603:	learn: 0.1603901	total: 15m 44s	remaining: 10m 19s
604:	learn: 0.1603671	total: 15m 45s	remaining: 10m 17s
605:	learn: 0.1602152	total: 15m 47s	remaining: 10m 15s
606:	learn: 0.1599880	total: 15m 48s	remaining: 10m 14s
607:	learn: 0.1597090	total: 15m 50s	remaining: 10m 12s
608:	learn: 0.1595520	total: 15m 51s	remaining: 10m 11s
609:	learn: 0.1593799	total: 15m 53s	remaining: 10m 9s
610:	learn: 0.1592448	total: 15m 54s	remaining: 10m 7s
611:	learn: 0.1590094	total: 15m 56s	remaining: 10m 6s
612:	learn: 0.1587536	total: 15m 58s	remaining: 10m 4s
613:	learn: 0.1585520	total: 15m 59s	remaining: 10m 3s
614:	learn: 0.1584027	total: 16m 1s	remaining: 10m 1s
615:	learn: 0.1580507	total: 16m 2s	remaining: 10m
616:	learn: 0.1577952	total: 16m 4s	remaining: 9m 58s
617:	learn: 0.1575876	total: 16m 5s	remaining: 9m 56s
618:	learn: 0.1574189	total: 16m 7s	remaining: 9m 55s
619:	learn: 0.1571832	total: 16m 8s	remaining: 9m 53s
620:	learn: 

752:	learn: 0.1403738	total: 19m 32s	remaining: 6m 24s
753:	learn: 0.1401697	total: 19m 34s	remaining: 6m 23s
754:	learn: 0.1399430	total: 19m 35s	remaining: 6m 21s
755:	learn: 0.1398067	total: 19m 37s	remaining: 6m 19s
756:	learn: 0.1395894	total: 19m 38s	remaining: 6m 18s
757:	learn: 0.1395560	total: 19m 40s	remaining: 6m 16s
758:	learn: 0.1394178	total: 19m 41s	remaining: 6m 15s
759:	learn: 0.1392325	total: 19m 43s	remaining: 6m 13s
760:	learn: 0.1390544	total: 19m 45s	remaining: 6m 12s
761:	learn: 0.1388379	total: 19m 46s	remaining: 6m 10s
762:	learn: 0.1387518	total: 19m 48s	remaining: 6m 9s
763:	learn: 0.1385337	total: 19m 49s	remaining: 6m 7s
764:	learn: 0.1383990	total: 19m 51s	remaining: 6m 5s
765:	learn: 0.1383903	total: 19m 52s	remaining: 6m 4s
766:	learn: 0.1382601	total: 19m 54s	remaining: 6m 2s
767:	learn: 0.1381457	total: 19m 55s	remaining: 6m 1s
768:	learn: 0.1381055	total: 19m 57s	remaining: 5m 59s
769:	learn: 0.1380308	total: 19m 58s	remaining: 5m 58s
770:	learn: 0.13

903:	learn: 0.1229237	total: 23m 24s	remaining: 2m 29s
904:	learn: 0.1228035	total: 23m 26s	remaining: 2m 27s
905:	learn: 0.1227833	total: 23m 27s	remaining: 2m 26s
906:	learn: 0.1226705	total: 23m 29s	remaining: 2m 24s
907:	learn: 0.1225204	total: 23m 30s	remaining: 2m 22s
908:	learn: 0.1224620	total: 23m 32s	remaining: 2m 21s
909:	learn: 0.1223131	total: 23m 33s	remaining: 2m 19s
910:	learn: 0.1221631	total: 23m 35s	remaining: 2m 18s
911:	learn: 0.1220739	total: 23m 36s	remaining: 2m 16s
912:	learn: 0.1219793	total: 23m 38s	remaining: 2m 15s
913:	learn: 0.1218501	total: 23m 39s	remaining: 2m 13s
914:	learn: 0.1218149	total: 23m 41s	remaining: 2m 12s
915:	learn: 0.1216445	total: 23m 42s	remaining: 2m 10s
916:	learn: 0.1215832	total: 23m 44s	remaining: 2m 8s
917:	learn: 0.1214418	total: 23m 45s	remaining: 2m 7s
918:	learn: 0.1212718	total: 23m 47s	remaining: 2m 5s
919:	learn: 0.1212007	total: 23m 48s	remaining: 2m 4s
920:	learn: 0.1211369	total: 23m 50s	remaining: 2m 2s
921:	learn: 0.1

In [44]:
# Display the cross-validation results stored on the first element of
# grid_search_l2_3 (the `cv_results_` attribute, keyed by metric name).
# NOTE(review): presumably a fitted sklearn GridSearchCV over l2_leaf_reg,
# as produced by run_iterative_grid_search -- confirm against that helper.
grid_search_l2_3[0].cv_results_



{'mean_fit_time': array([10017.28428283,  9531.09591208,  8695.2385787 ]),
 'std_fit_time': array([ 44.11833357, 618.89988422,  38.99075754]),
 'mean_score_time': array([2.15261936, 2.70873861, 1.09661598]),
 'std_score_time': array([0.89260111, 1.22814233, 0.98711121]),
 'param_l2_leaf_reg': masked_array(data=[3000, 10000, 30000],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'l2_leaf_reg': 3000},
  {'l2_leaf_reg': 10000},
  {'l2_leaf_reg': 30000}],
 'split0_test_score': array([0.9005, 0.865 , 0.817 ]),
 'split1_test_score': array([0.901 , 0.8705, 0.824 ]),
 'split2_test_score': array([0.9065, 0.872 , 0.823 ]),
 'split3_test_score': array([0.91  , 0.864 , 0.8285]),
 'split4_test_score': array([0.909 , 0.8785, 0.8405]),
 'mean_test_score': array([0.9054, 0.87  , 0.8266]),
 'std_test_score': array([0.00396737, 0.00524404, 0.00785748]),
 'rank_test_score': array([1, 2, 3]),
 'split0_train_score': array([0.976625, 0.92925 , 0.85

In [None]:
# Fix the L2 leaf regularisation, then tune only the `iterations` parameter.
# NOTE(review): 1000 lies below the [3000, 10000, 30000] grid inspected in the
# previous cell, where the smallest value scored best -- presumably an
# extrapolation of that trend; confirm.
params['l2_leaf_reg'] = 1000
params_grid['iterations'] = [10000, 3000, 1000]
grid_search_iterations, param_iterations = run_iterative_grid_search(order_of_tuning = [['iterations']])

# Persist both results. The context managers guarantee each pickle file is
# flushed and closed even if dumping raises, instead of leaking the handle.
with open(DIRECTORY + 'grid_search_iterations_11_notebook.p', 'wb') as pickle_file:
    pickle.dump(grid_search_iterations, pickle_file)
with open(DIRECTORY + 'param_iterations_11_notebook.p', 'wb') as pickle_file:
    pickle.dump(param_iterations, pickle_file)

Tuning hyperparameters ['iterations']
Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [None]:
# Display the cross-validation results stored on the first element of
# grid_search_iterations (returned by the `iterations` tuning run above).
# NOTE(review): presumably a fitted sklearn GridSearchCV -- confirm against
# run_iterative_grid_search.
grid_search_iterations[0].cv_results_