# Generate submission based on a bag of CatBoost classifiers
### 31-01-2019

In [1]:
## Variables specific for competition
import os

ID = 'card_id'
TARGET = 'target'

# Root folder of the competition files. The default is the original local path;
# set the ELO_BASE_DIRECTORY environment variable to run the notebook on another machine.
BASE_DIRECTORY = os.environ.get('ELO_BASE_DIRECTORY',
                                'C:/Users/judit/Documents/learning/kaggle/Elo_201812/')
if not BASE_DIRECTORY.endswith(('/', '\\')):
    # every path below is built by plain string concatenation, so a trailing separator is required
    BASE_DIRECTORY += '/'

RAW_DIRECTORY = BASE_DIRECTORY + 'rawdata/'
DIRECTORY = BASE_DIRECTORY + 'data/'
HIST_TRANS_FILE = RAW_DIRECTORY + 'historical_transactions.csv'
MERCHANTS_FILE = RAW_DIRECTORY + 'merchants.csv'
NEW_MERCH_TRANS_FILE = RAW_DIRECTORY + 'new_merchant_transactions.csv'
TRAIN_FILE = RAW_DIRECTORY + 'train.csv'
TEST_FILE = RAW_DIRECTORY + 'test.csv'
SAMPLE_SUBMISSION_FILE = RAW_DIRECTORY + 'sample_submission.csv'

SUBMISSION_DIRECTORY = BASE_DIRECTORY + 'submissions/'

In [2]:
## Variables specific for notebook
# Pickle files read or written under DIRECTORY
MODELS_FILE = '{}models_12_notebook.p'.format(DIRECTORY)
PREDICTIONS_CLASS_FILE = '{}predictions_class_12_notebook_1.p'.format(DIRECTORY)
PREDICTIONS_PROBA_FILE = '{}predictions_proba_12_notebook_1.p'.format(DIRECTORY)
LOCALTEST_SET_FILE = '{}localtest_set_12_notebook.p'.format(DIRECTORY)

# Submission files are named SUBMIT_FILENAME + str(NUM) + '.csv'
NUM = 0
SUBMIT_FILENAME = '{}submit_190131_'.format(SUBMISSION_DIRECTORY)

# Pre-computed aggregate tables
HIST_AGG_FILE_1 = '{}hist_agg_20.pkl'.format(DIRECTORY)   # only authorized, domestic, last 3 months
HIST_AGG_FILE_2 = '{}hist_agg_28.pkl'.format(DIRECTORY)   # only authorized, non-domestic, last 3 months
NEW_AGG_FILE_1 = '{}new_agg_3.pkl'.format(DIRECTORY)      # only authorized, domestic, whole period
NEW_AGG_FILE_2 = '{}new_agg_5.pkl'.format(DIRECTORY)      # only authorized, non-domestic, whole period

# Names of the columns that are treated as categorical.
# The list is assembled from three groups, in this order:
#   1. the raw card-level columns,
#   2. the "mode" aggregates of the transaction tables (4 aggregate families x 7 fields),
#   3. the merchant attributes joined on each of the 4 "most frequent merchant" keys (4 keys x 6 fields).
_card_level_columns = ['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3']

_mode_fields = ['city_id', 'category_3', 'merchant_category_id', 'merchant_id',
                'category_2', 'state_id', 'subsector_id']
_mode_name_patterns = ['hist_{}_mode_auth_dom_lag2m',
                       'hist_{}_mode_auth_nondom_lag2m',
                       'new_{}_mode_dom_all',
                       'new_{}_mode_nondom_all']

_merchant_fields = ['merchant_group_id', 'merchant_category_id', 'subsector_id',
                    'city_id', 'state_id', 'category_2']
_merchant_keys = ['hist_merchant_id_mode_auth_dom_lag2m',
                  'hist_merchant_id_mode_auth_nondom_lag2m',
                  'new_merchant_id_mode_dom_all',
                  'new_merchant_id_mode_nondom_all']

categorical_features = (
    _card_level_columns
    + [pattern.format(field) for pattern in _mode_name_patterns for field in _mode_fields]
    + [field + '_' + key for key in _merchant_keys for field in _merchant_fields]
)

In [3]:
from catboost import CatBoostRegressor, CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle
import random
import time
# Seed Python's built-in `random` module only (NumPy's generator is not seeded here).
random.seed(1)
# Wall-clock reference; the elapsed time is printed near the end of the notebook.
start_time = time.time()

In [4]:
## Useful functions
def reduce_mem_usage(df, verbose = True):
    '''
    Downcast the numeric columns of df to narrower dtypes and return df.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64 are touched.
    For each of them the narrowest candidate dtype is chosen whose limits lie strictly
    outside the column's min and max (so a value equal to a dtype limit moves to the next
    wider dtype). Float columns may be narrowed down to float16, which lowers precision.

    df is modified in place (its columns are reassigned) and the same object is returned.
    If verbose is True, the memory footprint before and after is printed.
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Starting memory usage: {:5.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        lowest = df[col].min()
        highest = df[col].max()

        if str(col_type).startswith('int'):
            # integers: first candidate that fits wins; if none fits the column is left unchanged
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # floats: first candidate that fits wins; otherwise fall back to float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem
        print('Reduced memory usage: {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def mode(series):
    '''
    Return the most frequent value of a pandas Series.

    When several values tie, the first one returned by Series.mode() is used.
    Returns np.nan when Series.mode() is empty (for example for an empty series).
    '''
    # compute the modes once instead of once for the length check and once for the lookup
    modes = series.mode()
    if len(modes) > 0:
        return modes.iloc[0]
    return np.nan

def label_encoder(train_df, test_df = None, valid_df = None, localtest_df = None,
                  prefix = '_labelenc_', suffix = '',
                  target = 'is_outlier',
                  cols_and_encodings = None):
    '''
    Calculates label encodings based on train_df.
    Can be used both for classification and regression problems.

    For every encoded column the target values of train_df are aggregated per distinct
    value of that column, and the aggregates are left-joined onto train_df and onto each
    of the optional frames that was supplied.

    Parameters:
        train_df : DataFrame containing the encoded columns and the target column.
        test_df, valid_df, localtest_df : optional DataFrames which receive the same encodings;
            a frame that is None is skipped and returned as None.
        prefix, suffix : strings used in the new column names. A new column is called
            <col><prefix><target>_<aggregation name><suffix>.
        target : name of the target column in train_df.
        cols_and_encodings : dictionary.
            Keys: features that we want to encode (usually categorical features with only
            a few possible values).
            Values: list of aggregation functions which will be applied on target values.
            If None, 'feature_1' is encoded with mean and median and 'feature_2' with mode.

    Returns:
        (train_df, test_df, valid_df, localtest_df) with the encoding columns added.
    '''
    if cols_and_encodings is None:
        # built at call time so that no mutable dictionary is shared between calls
        cols_and_encodings = {'feature_1' : ['mean', 'median'],
                              'feature_2' : [mode]}

    for col, enc_list in cols_and_encodings.items():
        agg_df = train_df.groupby(col).agg({target : enc_list})
        # the aggregated columns are (target, aggregation) pairs; flatten them into plain names
        agg_df.columns = [col + prefix + '_'.join(colname).strip() + suffix for colname in agg_df.columns.values]
        agg_df = agg_df.reset_index()
        train_df = train_df.merge(agg_df, how = 'left', on = col)
        # "if frame:" raises ValueError for a DataFrame, so compare with None explicitly
        if test_df is not None:
            test_df = test_df.merge(agg_df, how = 'left', on = col)
        if valid_df is not None:
            valid_df = valid_df.merge(agg_df, how = 'left', on = col)
        if localtest_df is not None:
            localtest_df = localtest_df.merge(agg_df, how = 'left', on = col)

    return train_df, test_df, valid_df, localtest_df

In [5]:
# Pre-computed aggregate tables, read from their pickle files
hist_agg_1, hist_agg_2, new_agg_1, new_agg_2 = [
    pd.read_pickle(path)
    for path in (HIST_AGG_FILE_1, HIST_AGG_FILE_2, NEW_AGG_FILE_1, NEW_AGG_FILE_2)
]

# Raw competition files
merch = pd.read_csv(MERCHANTS_FILE)
test = pd.read_csv(TEST_FILE, parse_dates = ["first_active_month"])

test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04-01,C_ID_0ab67a22ab,3,3,1
1,2017-01-01,C_ID_130fd0cbdd,2,3,0
2,2017-08-01,C_ID_b709037bc5,5,1,1
3,2017-12-01,C_ID_d27d835a9f,2,1,0
4,2015-12-01,C_ID_2b5e3df5c2,5,1,1


In [6]:
def test_data_transformations_and_merges():
    '''
    Build the feature table for the test set and return it as a DataFrame.

    This function is similar to function data_transformations_and_merges() in notebook 12,
    but no 'is_outlier' column is created.

    Reads the globals hist_agg_1, hist_agg_2, new_agg_1, new_agg_2 and categorical_features.
    Side effect: the globals test and merch are converted (ids to integers, letter codes to
    numbers) and rebound to their memory-reduced versions.

    NOTE(review): the id conversion slices strings, so calling this function a second time
    on the already converted globals fails. Reload test and merch before re-running.
    '''
    global test, merch

    def hex_id_to_int(raw_id):
        # skip the first 5 characters and read the remainder as a hexadecimal number
        return int(raw_id[5:], 16)

    def recode(codes):
        # values missing from the lookup table (including NaN) become np.nan
        return lambda value : codes.get(value, np.nan)

    yes_no_codes = {'Y' : 1, 'N' : 0}
    range_codes = {'A' : 1, 'B' : 2, 'C' : 3, 'D' : 4, 'E' : 5}

    # card_id is a hexadecimal number. Convert it into decimal.
    test['card_id'] = test['card_id'].apply(hex_id_to_int)
    test = reduce_mem_usage(df = test, verbose = True)

    # merchant_id is a hexadecimal number. Convert it into decimal.
    merch['merchant_id'] = merch['merchant_id'].apply(hex_id_to_int)
    # convert categorical features into numerical ones
    for column, codes in (('category_1', yes_no_codes),
                          ('most_recent_sales_range', range_codes),
                          ('most_recent_purchases_range', range_codes),
                          ('category_4', yes_no_codes)):
        merch[column] = merch[column].apply(recode(codes))
    merch = reduce_mem_usage(df = merch, verbose = True)

    # merges: first the card-level aggregates ...
    df = test
    for aggregate in (hist_agg_1, hist_agg_2, new_agg_1, new_agg_2):
        df = df.merge(aggregate, how = 'left', on = 'card_id')

    # ... then the merchant attributes, once per "most frequent merchant" column
    merchant_keys = ['hist_merchant_id_mode_auth_dom_lag2m',
                     'hist_merchant_id_mode_auth_nondom_lag2m',
                     'new_merchant_id_mode_dom_all',
                     'new_merchant_id_mode_nondom_all']
    for key in merchant_keys:
        df = df.merge(merch, how = 'left', left_on = key, right_on = 'merchant_id',
                      suffixes = ('', '_' + key))

    # The columns of the first merchant merge did not clash with anything, so they carry
    # no suffix yet; give them the suffix of the first key.
    first_suffix = '_' + merchant_keys[0]
    df = df.rename(columns = {colname : colname + first_suffix for colname in merch.columns})
    df = df.drop(['merchant_id_' + key for key in merchant_keys], axis = 1)

    # Prepare data for CatBoostClassifier
    # impute string for missing categorical values (catboost can't handle np.nan in categorical features)
    df[categorical_features] = df[categorical_features].fillna('nan')

    # NOTE(review): the integer codes are derived from the values present in this frame only;
    # verify that they match the encoding used when the models were trained.
    for col in categorical_features:
        df[col] = df[col].astype('category').cat.codes

    return df

In [7]:
# Build the model input table for the test set (this also converts the global test and merch frames).
df = test_data_transformations_and_merges()

Starting memory usage:  4.72 MB
Reduced memory usage:  2.24 MB (52.5% reduction)
Starting memory usage: 56.18 MB
Reduced memory usage: 21.39 MB (61.9% reduction)


In [8]:
def make_list_of_predictions(models_list, test_set):
    '''
    Run every model of models_list on test_set.

    Each model must provide predict() and predict_proba(). A progress line is printed
    per model.

    Returns a tuple (predictions_class, predictions_proba): two lists holding, in model
    order, the results of predict(test_set) and predict_proba(test_set).
    '''
    predictions_class, predictions_proba = [], []
    for idx, model in enumerate(models_list):
        print('Predict for model', idx)
        labels = model.predict(test_set)
        probabilities = model.predict_proba(test_set)
        predictions_class.append(labels)
        predictions_proba.append(probabilities)
    return predictions_class, predictions_proba

In [9]:
# Load the list of trained models. MODELS_FILE is DIRECTORY + 'models_12_notebook.p'.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
with open(MODELS_FILE, 'rb') as models_file:      # the context manager closes the file handle
    models = pickle.load(models_file)

predictions_class, predictions_proba = make_list_of_predictions(models_list = models, test_set = df)

Predict for model 0
Predict for model 1
Predict for model 2
Predict for model 3
Predict for model 4
Predict for model 5
Predict for model 6
Predict for model 7
Predict for model 8
Predict for model 9
Predict for model 10
Predict for model 11
Predict for model 12
Predict for model 13
Predict for model 14
Predict for model 15
Predict for model 16
Predict for model 17
Predict for model 18
Predict for model 19
Predict for model 20
Predict for model 21
Predict for model 22
Predict for model 23
Predict for model 24
Predict for model 25
Predict for model 26
Predict for model 27
Predict for model 28
Predict for model 29
Predict for model 30
Predict for model 31


In [10]:
# Matrix of predictions for probabilities of the positive class:
# stack the per-model predict_proba results, keep index 1 of the last axis,
# then transpose so that rows are samples and columns are models.
all_preds = np.asarray(predictions_proba)[..., 1].T
all_preds.shape

(123623, 32)

In [11]:
# Per-model weights for the weighted vote.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
with open(DIRECTORY + 'best_catboost_weights_13_notebook.p', 'rb') as weights_file:
    weights = pickle.load(weights_file)           # the context manager closes the file handle
thr = 0.9                                         # decision threshold on the weighted probability

# Weighted average of the model probabilities for every row
weighted_prediction = np.sum(np.multiply(all_preds, weights) / np.sum(weights), axis = 1)
# 1 where the weighted probability exceeds the threshold, 0 elsewhere (always a 1-D array)
preds_class = (weighted_prediction > thr).astype(int)

In [12]:
# Show the thresholded 0/1 class predictions
preds_class

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# Wall-clock seconds elapsed since start_time was recorded
end_time = time.time()
print('Elapsed time:', end_time - start_time)

Elapsed time: 1321.0123896598816


In [14]:
# Number of positive predictions (the entries are 0 or 1, so the total is the count of ones)
preds_class.sum()

64

In [15]:
# preds_class holds integers, and writing a float into an integer array silently truncates
# it (-33.219281 would be stored as -33). Convert to float before inserting the value.
preds_class = preds_class.astype(float)
preds_class[preds_class == 1] = -33.219281

In [16]:
# Fill the sample submission with the predictions.
# NOTE(review): values are assigned by position, so this relies on the sample submission
# listing the cards in the same order as the test file.
submission = pd.read_csv(SAMPLE_SUBMISSION_FILE).assign(target = preds_class)
submission.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0
1,C_ID_130fd0cbdd,0
2,C_ID_b709037bc5,0
3,C_ID_d27d835a9f,0
4,C_ID_2b5e3df5c2,0


In [17]:
# Running number appended to the submission file name
NUM = 0
submission_path = '{}{}.csv'.format(SUBMIT_FILENAME, NUM)
submission.to_csv(submission_path, index = False)