# Elo Merchant Categories
### Competition on Kaggle
## Catboost regression -- second quick and dirty: outlier classification + regression
### 23-12-2018

In [1]:
## Variables specific for competition

ID = 'card_id'                                            
TARGET = 'target'    

RAW_DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/rawdata/'  
DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/data/'
HIST_TRANS_FILE = RAW_DIRECTORY + 'historical_transactions.csv'
MERCHANTS_FILE = RAW_DIRECTORY + 'merchants.csv'
NEW_MERCH_TRANS_FILE = RAW_DIRECTORY + 'new_merchant_transactions.csv'
TRAIN_FILE = RAW_DIRECTORY + 'train.csv'    
TEST_FILE = RAW_DIRECTORY +'test.csv'
SAMPLE_SUBMISSION_FILE = RAW_DIRECTORY + 'sample_submission.csv'

SUBMISSION_DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/submissions/'
NUM = 3
SUBMIT_FILENAME = SUBMISSION_DIRECTORY + 'submit_181223_'

In [2]:
from catboost import CatBoostRegressor, CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss, confusion_matrix
import random
random.seed(1)

In [3]:
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [4]:
test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [5]:
train['start_year'] = train['first_active_month'].apply(lambda x : int(x[:4]))
train['start_month'] = train['first_active_month'].apply(lambda x : int(x[5:]))
train['num_months'] = (2018 - train['start_year']) * 12 + (13 - train['start_month'])

test['start_year'] = test['first_active_month'].map(lambda x : int(str(x)[:4]) if pd.notnull(x) else x)
test['start_month'] = test['first_active_month'].map(lambda x : int(str(x)[5:]) if pd.notnull(x) else x)
test['num_months'] = (2018 - test['start_year']) * 12 + (13 - test['start_month'])

train['outlier'] = train['target'].apply(lambda x : int(x < -30))

train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,start_year,start_month,num_months,outlier
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283,2017,6,19,0
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913,2017,1,24,0
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056,2016,8,29,0
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495,2017,9,16,0
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749,2017,11,14,0


In [6]:
train['outlier'].sum()

2207

In [7]:
train.shape

(201917, 10)

In [8]:
train.shape[0] / train['outlier'].sum()

91.48935206162211

In [9]:
# create training, validation and local test sets
X = train.drop(['first_active_month', 'start_year', 'start_month', 'target'], axis = 1)
y = train.target

# split into training set (70%), validation set (15%) and local test set (15%)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size = 0.7, random_state = 1,
                                                                stratify = X[['feature_1', 'feature_2', 'feature_3', 'outlier']])
X_validation, X_localtest, y_validation, y_localtest = train_test_split(X_validation, y_validation, train_size = 0.5, 
                                                                        random_state = 1,
                                                                        stratify = X_validation[['feature_1', 'feature_2', 
                                                                                                 'feature_3', 'outlier']])

# y_class is the target for the classification problem of outlier/not-outlier
y_class_train = X_train.outlier.copy()
y_class_validation = X_validation.outlier.copy()
y_class_localtest = X_localtest.outlier.copy()
X_train = X_train.drop('outlier', axis = 1)
X_validation = X_validation.drop('outlier', axis = 1)
X_localtest = X_localtest.drop('outlier', axis = 1)



In [10]:
y_class_train.sum(), y_class_validation.sum(), y_class_localtest.sum()

(1544, 332, 331)

### Step 1: classification to find outliers

In [11]:
def train_catboost_classifier(iterations = 300,
                              depth = 2,
                              learning_rate = 0.01,
                              loss_function = 'Logloss',
                              class_weights = [0.1, 9],
                              categorical_features_indices = np.array([0, 1, 2]),
                              early_stopping_rounds = 30):
    random.seed(1)
    print('========================================================================')
    print('Categorical_features_indices =', categorical_features_indices)
    print('Model parameters:')
    print('Iterations =', iterations, ', depth =', depth, ', learning_rate =', learning_rate, ', loss_function =', loss_function)
    print('class_weights =', class_weights)
    model = CatBoostClassifier(iterations = iterations, 
                               depth = depth, 
                               learning_rate = learning_rate, 
                               loss_function = loss_function, 
                               class_weights = class_weights,
                               early_stopping_rounds = early_stopping_rounds)
    model.fit(X_train.drop('card_id', axis = 1), 
              y_class_train, 
              cat_features = categorical_features_indices, 
              eval_set = (X_validation.drop('card_id', axis = 1), y_validation),
              silent = True)
    print('Best iteration:', model.get_best_iteration())
    localtest_class_pred_proba = model.predict_proba(X_localtest.drop('card_id', axis = 1))
    threshold = np.sort(localtest_class_pred_proba[:, 0])[330]
    localtest_class_pred = localtest_class_pred_proba[:, 0] <= threshold
    conf_mat = confusion_matrix(y_class_localtest, localtest_class_pred)
    print('Accuracy:', accuracy_score(y_class_localtest,localtest_class_pred))
    print('Confusion matrix:')
    print(conf_mat)

In [12]:
depth_grid = [2, 4, 6, 8, 10]
learning_rate_grid = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3]
class_weights_grid = [[0.1, 1], [0.1, 5], [0.1, 10], [0.1, 20], [0.1, 30]]
categorical_features_indices_grid = [np.array([0, 1, 2]), np.array([])]

for d in depth_grid:
    for lr in learning_rate_grid:
        for cw in class_weights_grid:
            for cfi in categorical_features_indices_grid:
                train_catboost_classifier(iterations = 300,
                                          depth = d,
                                          learning_rate = lr,
                                          loss_function = 'Logloss',
                                          class_weights = cw,
                                          categorical_features_indices = np.array([0, 1, 2]))

Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.001 , loss_function = Logloss
class_weights = [0.1, 1]
Best iteration: 0
Accuracy: 0.7815636555731643
Confusion matrix:
[[23572  6385]
 [  231   100]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.001 , loss_function = Logloss
class_weights = [0.1, 1]
Best iteration: 0
Accuracy: 0.7815636555731643
Confusion matrix:
[[23572  6385]
 [  231   100]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.001 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.758122028526149
Confusion matrix:
[[22845  7112]
 [  214   117]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.001 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.758122028526149
Confusion matrix:
[[228

Best iteration: 0
Accuracy: 0.595648441627047
Confusion matrix:
[[17852 12105]
 [  142   189]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.01 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 52
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.01 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 52
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.01 , loss_function = Logloss
class_weights = [0.1, 30]
Best iteration: 75
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.01 , loss_function = Logl

Best iteration: 0
Accuracy: 0.7815636555731643
Confusion matrix:
[[23572  6385]
 [  231   100]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.3 , loss_function = Logloss
class_weights = [0.1, 1]
Best iteration: 0
Accuracy: 0.7815636555731643
Confusion matrix:
[[23572  6385]
 [  231   100]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.3 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.758122028526149
Confusion matrix:
[[22845  7112]
 [  214   117]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.3 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.758122028526149
Confusion matrix:
[[22845  7112]
 [  214   117]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 2 , learning_rate = 0.3 , loss_function = Logloss
class_w

Best iteration: 0
Accuracy: 0.6678222398309561
Confusion matrix:
[[20060  9897]
 [  164   167]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.003 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 139
Accuracy: 0.9783742736397253
Confusion matrix:
[[29620   337]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.003 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 139
Accuracy: 0.9783742736397253
Confusion matrix:
[[29620   337]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.003 , loss_function = Logloss
class_weights = [0.1, 30]
Best iteration: 242
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.003 , loss_functio

Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.1 , loss_function = Logloss
class_weights = [0.1, 1]
Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.1 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.6678222398309561
Confusion matrix:
[[20060  9897]
 [  164   167]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.1 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.6678222398309561
Confusion matrix:
[[20060  9897]
 [  164   167]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 4 , learning_rate = 0.1 , loss_function = Logloss
class

Best iteration: 0
Accuracy: 0.6678222398309561
Confusion matrix:
[[20060  9897]
 [  164   167]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.001 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 299
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.001 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 299
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.001 , loss_function = Logloss
class_weights = [0.1, 30]
Best iteration: 299
Accuracy: 0.9777469624933968
Confusion matrix:
[[29603   354]
 [  320    11]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.001 , loss_functio

Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.03 , loss_function = Logloss
class_weights = [0.1, 1]
Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.03 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.9001254622292657
Confusion matrix:
[[27214  2743]
 [  282    49]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.03 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.9001254622292657
Confusion matrix:
[[27214  2743]
 [  282    49]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.03 , loss_function = Logloss
c

Best iteration: 0
Accuracy: 0.6678222398309561
Confusion matrix:
[[20060  9897]
 [  164   167]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.3 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 1
Accuracy: 0.9277931854199684
Confusion matrix:
[[28049  1908]
 [  279    52]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.3 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 1
Accuracy: 0.9277931854199684
Confusion matrix:
[[28049  1908]
 [  279    52]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.3 , loss_function = Logloss
class_weights = [0.1, 30]
Best iteration: 3
Accuracy: 0.9333069202324353
Confusion matrix:
[[28220  1737]
 [  283    48]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 6 , learning_rate = 0.3 , loss_function = Logloss
cl

Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.01 , loss_function = Logloss
class_weights = [0.1, 1]
Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.01 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.9001254622292657
Confusion matrix:
[[27214  2743]
 [  282    49]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.01 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.9001254622292657
Confusion matrix:
[[27214  2743]
 [  282    49]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.01 , loss_function = Logloss
c

Best iteration: 0
Accuracy: 0.6678222398309561
Confusion matrix:
[[20060  9897]
 [  164   167]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.1 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 3
Accuracy: 0.9333069202324353
Confusion matrix:
[[28220  1737]
 [  283    48]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.1 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 3
Accuracy: 0.9333069202324353
Confusion matrix:
[[28220  1737]
 [  283    48]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.1 , loss_function = Logloss
class_weights = [0.1, 30]
Best iteration: 8
Accuracy: 0.9785063391442155
Confusion matrix:
[[29624   333]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 8 , learning_rate = 0.1 , loss_function = Logloss
cl

Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.003 , loss_function = Logloss
class_weights = [0.1, 1]
Best iteration: 0
Accuracy: 0.7803420496566297
Confusion matrix:
[[23533  6424]
 [  229   102]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.003 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.9001254622292657
Confusion matrix:
[[27214  2743]
 [  282    49]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.003 , loss_function = Logloss
class_weights = [0.1, 5]
Best iteration: 0
Accuracy: 0.9001254622292657
Confusion matrix:
[[27214  2743]
 [  282    49]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.003 , loss_function = L

Best iteration: 0
Accuracy: 0.6678222398309561
Confusion matrix:
[[20060  9897]
 [  164   167]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.03 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 15
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.03 , loss_function = Logloss
class_weights = [0.1, 20]
Best iteration: 15
Accuracy: 0.9783412572636028
Confusion matrix:
[[29619   338]
 [  318    13]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.03 , loss_function = Logloss
class_weights = [0.1, 30]
Best iteration: 32
Accuracy: 0.978770470153196
Confusion matrix:
[[29634   323]
 [  320    11]]
Categorical_features_indices = [0 1 2]
Model parameters:
Iterations = 300 , depth = 10 , learning_rate = 0.03 , loss_function = 

In [13]:
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_FILE)
sample_submission.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0
1,C_ID_130fd0cbdd,0
2,C_ID_b709037bc5,0
3,C_ID_d27d835a9f,0
4,C_ID_2b5e3df5c2,0


In [14]:
sample_submission['target'].sum()

0

In [15]:
sample_submission.shape

(123623, 2)

In [16]:
train.shape

(201917, 10)

In [18]:
train_pred_of_zeros = np.zeros(train.shape[0])
train_rmse_of_zeros = np.sqrt(mean_squared_error(train['target'], train_pred_of_zeros))
train_rmse_of_zeros

3.8705589161316296

sample submission rmse on the leaderboard: 3.847

In [19]:
train_no_outliers = train[train['target'] > -30]
train_no_outliers.shape

(199710, 10)

In [20]:
train_no_outliers_pred_of_zeros = np.zeros(train_no_outliers.shape[0])
train_no_outliers_rmse_of_zeros = np.sqrt(mean_squared_error(train_no_outliers['target'], train_no_outliers_pred_of_zeros))
train_no_outliers_rmse_of_zeros

1.718066151175359

In [21]:
train_no_outliers['target'].describe()

count    199710.000000
mean         -0.030879
std           1.717793
min         -17.608147
25%          -0.844513
50%          -0.007783
75%           0.777331
max          17.965068
Name: target, dtype: float64

In [22]:
train['target'].describe()

count    201917.000000
mean         -0.393636
std           3.850500
min         -33.219281
25%          -0.883110
50%          -0.023437
75%           0.765453
max          17.965068
Name: target, dtype: float64