# Elo Competition
## Isolation Forest
### Outlier Detection
#### 03-02-2019

In [1]:
## Variables specific for competition

# Key column and regression target of the train/test tables.
ID = 'card_id'
TARGET = 'target'

# All locations hang off a single project root, so only this line differs between machines.
BASE_DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/'

RAW_DIRECTORY = BASE_DIRECTORY + 'rawdata/'
DIRECTORY = BASE_DIRECTORY + 'data/'
SUBMISSION_DIRECTORY = BASE_DIRECTORY + 'submissions/'

# Raw competition files.
HIST_TRANS_FILE = RAW_DIRECTORY + 'historical_transactions.csv'
MERCHANTS_FILE = RAW_DIRECTORY + 'merchants.csv'
NEW_MERCH_TRANS_FILE = RAW_DIRECTORY + 'new_merchant_transactions.csv'
TRAIN_FILE = RAW_DIRECTORY + 'train.csv'
TEST_FILE = RAW_DIRECTORY + 'test.csv'
SAMPLE_SUBMISSION_FILE = RAW_DIRECTORY + 'sample_submission.csv'

In [2]:
## Variables specific for notebook
NUM = 0
# Common prefix of every submission file written by this notebook; NUM and '.csv' are appended later.
SUBMIT_FILENAME = '{}iso_forest_submit_190104_'.format(SUBMISSION_DIRECTORY)

# Pickled aggregate feature tables, one per transaction subset.
HIST_AGG_FILE_1 = '{}hist_agg_20.pkl'.format(DIRECTORY)       # only authorized, domestic, last 3 months
HIST_AGG_FILE_2 = '{}hist_agg_28.pkl'.format(DIRECTORY)       # only authorized, non-domestic, last 3 months
NEW_AGG_FILE_1 = '{}new_agg_3.pkl'.format(DIRECTORY)          # only authorized, domestic, whole period
NEW_AGG_FILE_2 = '{}new_agg_5.pkl'.format(DIRECTORY)          # only authorized, non-domestic, whole period

# Non-numeric / identifier-like columns. They are excluded from mean imputation and dropped
# before the model is trained. feature_1, feature_2 and feature_3 are deliberately NOT listed.

# "Mode" columns: the same seven fields for each of the four aggregate tables.
_mode_fields = ['city_id', 'category_3', 'merchant_category_id', 'merchant_id',
                'category_2', 'state_id', 'subsector_id']
_mode_templates = ['hist_{}_mode_auth_dom_lag2m',
                   'hist_{}_mode_auth_nondom_lag2m',
                   'new_{}_mode_dom_all',
                   'new_{}_mode_nondom_all']

# Merchant attributes, suffixed with the merchant-mode column they were joined through.
_merchant_fields = ['merchant_group_id', 'merchant_category_id', 'subsector_id',
                    'city_id', 'state_id', 'category_2']
_merchant_suffixes = ['_hist_merchant_id_mode_auth_dom_lag2m',
                      '_hist_merchant_id_mode_auth_nondom_lag2m',
                      '_new_merchant_id_mode_dom_all',
                      '_new_merchant_id_mode_nondom_all']

categorical_features = (
    ['first_active_month', 'card_id']
    + [template.format(field) for template in _mode_templates for field in _mode_fields]
    + [field + suffix for suffix in _merchant_suffixes for field in _merchant_fields]
)

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import random
import time
random.seed(1)
start_time = time.time()

  from numpy.core.umath_tests import inner1d


In [4]:
## Useful functions
def reduce_mem_usage(df, verbose = True):
    '''
    Downcasts the numeric columns of df, in place, to the narrowest dtype that fits.

    A candidate dtype is accepted only if the column's min and max lie strictly inside
    its representable range. Integer columns try int8, int16, int32, int64 and are left
    untouched if none fits; float columns try float16, then float32, and otherwise become
    float64. Only columns whose dtype is int16/32/64 or float16/32/64 are considered.

    NOTE: float16/float32 hold fewer significant digits, so downcast values are rounded.

    Returns the same (modified) df; prints memory usage before/after when verbose.
    '''
    bytes_per_mb = 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Starting memory usage: {:5.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            new_type = next((t for t in int_candidates
                             if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max), None)
            if new_type is not None:
                df[col] = df[col].astype(new_type)
        else:
            # floats always end up re-cast; float64 is the fallback
            new_type = next((t for t in float_candidates
                             if c_min > np.finfo(t).min and c_max < np.finfo(t).max), np.float64)
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem
        print('Reduced memory usage: {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def mode(series):
    '''Returns the first value reported by series.mode(), or NaN when there is none.'''
    modes = series.mode()
    return np.nan if len(modes) == 0 else modes.iloc[0]

def label_encoder(train_df, test_df = None, valid_df = None, localtest_df = None,
                  prefix = '_labelenc_', suffix = '',
                  target = 'is_outlier',
                  cols_and_encodings = None):
    '''
    Adds target-based encodings, computed on train_df only, to every given frame.
    Can be used both for classification and regression problems.

    For each encoded column, the target is aggregated per distinct value of that column
    in train_df and the result is left-merged onto each frame. Values of the column that
    do not occur in train_df therefore get NaN.

    Parameters:
        train_df : DataFrame holding the encoded columns and the target column.
        test_df, valid_df, localtest_df : optional DataFrames receiving the same encodings.
        prefix, suffix : strings placed around '<target>_<aggregation>' in new column names,
            i.e. new columns are called <col><prefix><target>_<aggregation><suffix>.
        target : name of the column in train_df that is aggregated.
        cols_and_encodings : dictionary.
            Keys: features that we want to encode (usually categorical features with only a
            few possible values).
            Values: list of aggregation functions which will be applied on target values.
            None (default) means {'feature_1' : ['mean', 'median'], 'feature_2' : [mode]}.

    Returns:
        (train_df, test_df, valid_df, localtest_df); frames that were not supplied stay None.
    '''
    if cols_and_encodings is None:
        # built lazily so that no mutable object is shared between calls
        cols_and_encodings = {'feature_1' : ['mean', 'median'],
                              'feature_2' : [mode]}

    for col, enc_list in cols_and_encodings.items():
        agg_df = train_df.groupby(col).agg({target : enc_list})
        # flatten the (target, aggregation) column MultiIndex into plain names
        agg_df.columns = [col + prefix + '_'.join(colname).strip() + suffix for colname in agg_df.columns.values]
        agg_df = agg_df.reset_index()
        train_df = train_df.merge(agg_df, how = 'left', on = col)
        # `if frame:` raises "truth value of a DataFrame is ambiguous"; compare with None instead
        if test_df is not None:
            test_df = test_df.merge(agg_df, how = 'left', on = col)
        if valid_df is not None:
            valid_df = valid_df.merge(agg_df, how = 'left', on = col)
        if localtest_df is not None:
            localtest_df = localtest_df.merge(agg_df, how = 'left', on = col)

    return train_df, test_df, valid_df, localtest_df

In [5]:
# Pickled aggregate feature tables (historical and new-merchant transactions).
hist_agg_1, hist_agg_2 = (pd.read_pickle(path) for path in (HIST_AGG_FILE_1, HIST_AGG_FILE_2))
new_agg_1, new_agg_2 = (pd.read_pickle(path) for path in (NEW_AGG_FILE_1, NEW_AGG_FILE_2))

# Raw competition tables; first_active_month is parsed as a date in train and test.
merch = pd.read_csv(MERCHANTS_FILE)
date_columns = ["first_active_month"]
train = pd.read_csv(TRAIN_FILE, parse_dates = date_columns)
test = pd.read_csv(TEST_FILE, parse_dates = date_columns)

train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [6]:
def _hex_id_to_int(series):
    '''Drops the first 5 characters of each id (e.g. 'C_ID_') and parses the rest as a hexadecimal integer.'''
    return series.apply(lambda s : int(s[5:], 16))


def _impute_numeric_means(agg_df):
    '''Fills, in place, the NaNs of every non-categorical column of agg_df with that column's mean.'''
    num_cols_with_nans = [x for x in agg_df.columns[agg_df.isna().any()] if x not in categorical_features]
    if num_cols_with_nans:
        agg_df[num_cols_with_nans] = agg_df[num_cols_with_nans].fillna(agg_df[num_cols_with_nans].mean())


def data_transformations_and_merges_eliminating_nans(df = train, merch_df = merch.copy(), hist_agg_1_df = hist_agg_1, 
                                                     hist_agg_2_df = hist_agg_2, new_agg_1_df = new_agg_1, 
                                                     new_agg_2_df = new_agg_2, istrain = True):
    '''
    Builds the numeric model input for one card table (train or test).

    Steps: convert the hexadecimal ids to integers, one-hot encode the categorical merchant
    columns, mean-impute the numeric columns of the four aggregate tables, left-merge the
    aggregates on card_id, left-merge the merchant table once per "mode merchant" column,
    and finally drop every column listed in categorical_features.

    Parameters:
        df : card table with a 'card_id' column.
        merch_df : merchant table with a 'merchant_id' column. The default is one copy of
            `merch` created when the function is defined and shared by all calls.
        hist_agg_1_df, hist_agg_2_df, new_agg_1_df, new_agg_2_df : aggregate feature tables
            keyed by card_id.
        istrain : kept for backward compatibility. It used to tell the function whether
            merchant_id still had to be converted; this is now detected from the column dtype,
            so either value is safe.

    Side effects: df['card_id'], merch_df['merchant_id'] and the imputed columns of the four
    aggregate tables are modified in place on the objects passed in; reduce_mem_usage also
    downcasts df in place.

    Returns the merged, downcast DataFrame.
    '''
    # card_id is a hexadecimal number. Convert it into decimal (skipped if already numeric,
    # which makes re-running on the same frame safe).
    if not pd.api.types.is_numeric_dtype(df['card_id']):
        df['card_id'] = _hex_id_to_int(df['card_id'])
    df = reduce_mem_usage(df = df, verbose = True)

    # merchant_id is a hexadecimal number. Convert it into decimal. The conversion is written
    # back into merch_df, so a shared default copy is converted only once.
    if not pd.api.types.is_numeric_dtype(merch_df['merchant_id']):
        merch_df['merchant_id'] = _hex_id_to_int(merch_df['merchant_id'])
    # convert categorical features into numerical ones using one-hot encoding (to avoid nan values)
    merch_cat_feats = ['category_1', 'most_recent_sales_range', 'most_recent_purchases_range', 'category_4']
    merch_df = pd.get_dummies(merch_df, columns = merch_cat_feats)

    # impute mean for missing values in numerical columns of hist_agg_1, hist_agg_2, new_agg_1, new_agg_2
    # (always on the frames passed in, never on the notebook-level globals)
    for agg_df in (hist_agg_1_df, hist_agg_2_df, new_agg_1_df, new_agg_2_df):
        _impute_numeric_means(agg_df)

    # merges
    for agg_df in (hist_agg_1_df, hist_agg_2_df, new_agg_1_df, new_agg_2_df):
        df = df.merge(agg_df, how = 'left', on = 'card_id')

    # The merchant table is joined once per "mode merchant" column. On a name clash the
    # right-hand (merchant) column gets the suffix, so only the first join keeps plain names.
    merchant_key_cols = ['hist_merchant_id_mode_auth_dom_lag2m',
                         'hist_merchant_id_mode_auth_nondom_lag2m',
                         'new_merchant_id_mode_dom_all',
                         'new_merchant_id_mode_nondom_all']
    for key_col in merchant_key_cols:
        df = df.merge(merch_df, how = 'left', left_on = key_col, right_on = 'merchant_id',
                      suffixes = ('', '_' + key_col))
    # give the still-unsuffixed merchant columns of the first join their suffix as well
    first_suffix = '_' + merchant_key_cols[0]
    df = df.rename(columns = {colname: colname + first_suffix for colname in merch_df.columns})
    df = df.drop(columns = ['merchant_id_' + key_col for key_col in merchant_key_cols])

    # drop categorical features as Isolation Forest can't handle them
    df = df.drop(columns = categorical_features)

    df = reduce_mem_usage(df = df, verbose = True)

    return df

In [None]:
train_df = data_transformations_and_merges_eliminating_nans(df = train)

# Binary outlier flag: 1 where the target is below -30, 0 elsewhere.
train_df['is_outlier'] = (train_df['target'] < -30).astype('int64')

# Set the regression target aside and remove it from the feature frame.
train_target = train_df['target'].copy()
train_df = train_df.drop(columns = 'target')

# split into training set (80%) and local test set (20%),
# stratified on the three card features together with the outlier flag
stratify_columns = ['feature_1', 'feature_2', 'feature_3', 'is_outlier']
X_train, X_localtest, y_train, y_localtest = train_test_split(
    train_df, train_target, train_size = 0.8, random_state = 1,
    stratify = train_df[stratify_columns])

# Move the outlier flag out of the feature frames into separate label series.
y_train_label = X_train['is_outlier'].copy()
y_localtest_label = X_localtest['is_outlier'].copy()
X_train = X_train.drop(columns = 'is_outlier')
X_localtest = X_localtest.drop(columns = 'is_outlier')

Starting memory usage:  9.24 MB
Reduced memory usage:  4.04 MB (56.2% reduction)
Starting memory usage: 596.37 MB


In [None]:
# istrain = False: the default merch_df is a single copy of `merch` shared by every call, and
# its merchant_id column was already converted to integers by the training call above,
# so no second conversion is needed here.
X_test = data_transformations_and_merges_eliminating_nans(df = test, istrain = False)

In [None]:
# (rows, columns) of the training, local test and test feature frames
tuple(frame.shape for frame in (X_train, X_localtest, X_test))

In [None]:
# Persist the prepared sets. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as its object has been written.
prepared_sets = [('localtrain_set_16_notebook.p', (X_train, y_train, y_train_label)),
                 ('localtest_set_16_notebook.p', (X_localtest, y_localtest, y_localtest_label)),
                 ('test_set_16_notebook.p', X_test),
                 ('full_train_set_16_notebook.p', (train_df, train_target))]
for pickle_name, pickle_payload in prepared_sets:
    with open(DIRECTORY + pickle_name, 'wb') as pickle_file:
        pickle.dump(pickle_payload, pickle_file)

In [None]:
# Smallest target in the training split; used further down as the value written for
# rows predicted to be outliers.
# NOTE(review): y_train went through reduce_mem_usage, which downcasts floats (to float16
# when the values fit), so this is presumably a rounded value rather than the exact raw
# target - confirm that is acceptable for the submission.
outlier_target = y_train.min()
y_train_label.shape, outlier_target

In [None]:
# (rows, columns) of the training and local test feature frames
tuple(frame.shape for frame in (X_train, X_localtest))

In [None]:
# Give all three feature frames a fresh 0..n-1 index (the old index is discarded).
X_train, X_localtest, X_test = (frame.reset_index(drop = True)
                                for frame in (X_train, X_localtest, X_test))

In [None]:
# Isolation Forest ----
seed = 1
# share of labelled outliers in the training split, passed to the model as contamination
contamination_ratio = y_train_label.sum() / y_train_label.shape[0]
print('Contamination:', contamination_ratio)

# training the model
clf = IsolationForest(n_estimators = 100, max_samples = 0.02, contamination = contamination_ratio, n_jobs = -1, 
                      random_state = seed, verbose = 10)
clf.fit(X_train.values)

# IsolationForest.predict returns -1 for outliers and +1 for inliers,
# but our labels are 1 for outliers, 0 for inliers: map -1 -> 1 and +1 -> 0.
y_pred_train = (1 - clf.predict(X_train.values)) / 2
y_pred_localtest = (1 - clf.predict(X_localtest.values)) / 2

In [None]:
# accuracy on localtest
localtest_accuracy = accuracy_score(y_localtest_label, y_pred_localtest)
print("Accuracy on localtest:", localtest_accuracy)

In [None]:
# inliers in localtest: restrict labels and predictions to the rows labelled 0
inlier_mask = y_localtest_label == 0
inlier_accuracy = accuracy_score(y_localtest_label[inlier_mask], y_pred_localtest[inlier_mask])
print("Accuracy on inliers:", inlier_accuracy)

In [None]:
# outliers in localtest: restrict labels and predictions to the rows labelled 1
# (the printed label previously said "inliers" although the outlier rows are scored)
print("Accuracy on outliers:", accuracy_score(y_localtest_label[y_localtest_label == 1], 
                                              y_pred_localtest[y_localtest_label == 1]))

In [None]:
# scikit-learn convention: rows are the true labels, columns the predicted labels
print('Confusion matrix:')
confusion_matrix(y_localtest_label, y_pred_localtest)

### Predict for test set

In [None]:
# IsolationForest.predict returns -1 for outliers and +1 for inliers; map to 1 = outlier, 0 = inlier
y_pred_test = (1 - clf.predict(X_test.values)) / 2

In [None]:
# Number of positive predictions (entries equal to 1)
y_pred_test.sum()

In [None]:
# rows flagged with 1 receive the outlier target value; every other entry keeps its value
y_pred_test = np.where(y_pred_test == 1, outlier_target, y_pred_test)

In [None]:
# Start from the sample submission and overwrite its target with the predictions
# (values are assigned by position, so the rows must be in the same order as X_test).
submission = pd.read_csv(SAMPLE_SUBMISSION_FILE).assign(target = y_pred_test)
submission.head()

In [None]:
# submission number 0: model trained on the un-normalized features
NUM = 0
submission.to_csv('{}{}.csv'.format(SUBMIT_FILENAME, NUM), index = False)

## Normalize features before training Isolation Forest

In [None]:
def normalize(np_array):
    '''
    Normalizes a numpy 2d array columnwise: subtracts each column's mean and divides by
    its standard deviation.

    Columns with zero standard deviation (constant columns) are only centered, so they
    come out as zeros instead of the NaNs a 0/0 division would produce.
    '''
    x = np_array - np_array.mean(axis = 0)
    std = np_array.std(axis = 0)
    safe_std = np.where(std == 0, 1, std)   # leave constant columns undivided
    return x / safe_std

In [None]:
# Standardize every split with the TRAINING mean and standard deviation. Scaling each split
# with its own statistics would put the local test and test sets on a different scale than
# the one the model is fitted on.
train_values = X_train.values
train_mean = train_values.mean(axis = 0)
train_std = train_values.std(axis = 0)
train_std = np.where(train_std == 0, 1, train_std)   # constant columns: avoid 0/0 -> NaN

norm_train = (train_values - train_mean) / train_std
norm_localtest = (X_localtest.values - train_mean) / train_std
norm_test = (X_test.values - train_mean) / train_std

In [None]:
# Isolation Forest ----
seed = 1

# training the model (same settings as before, fitted on the normalized features)
clf_norm = IsolationForest(n_estimators = 100, max_samples = 0.02, contamination = contamination_ratio, n_jobs = -1, 
                           random_state = seed, verbose = 10)
clf_norm.fit(norm_train)

# IsolationForest.predict returns -1 for outliers and +1 for inliers,
# but our labels are 1 for outliers, 0 for inliers: map -1 -> 1 and +1 -> 0.
y_pred_train = (1 - clf_norm.predict(norm_train)) / 2
y_pred_localtest = (1 - clf_norm.predict(norm_localtest)) / 2

In [None]:
# accuracy on localtest
localtest_accuracy = accuracy_score(y_localtest_label, y_pred_localtest)
print("Accuracy on localtest:", localtest_accuracy)

In [None]:
# inliers in localtest: restrict labels and predictions to the rows labelled 0
inlier_mask = y_localtest_label == 0
inlier_accuracy = accuracy_score(y_localtest_label[inlier_mask], y_pred_localtest[inlier_mask])
print("Accuracy on inliers:", inlier_accuracy)

In [None]:
# outliers in localtest: restrict labels and predictions to the rows labelled 1
# (the printed label previously said "inliers" although the outlier rows are scored)
print("Accuracy on outliers:", accuracy_score(y_localtest_label[y_localtest_label == 1], 
                                              y_pred_localtest[y_localtest_label == 1]))

In [None]:
# scikit-learn convention: rows are the true labels, columns the predicted labels
print('Confusion matrix:')
confusion_matrix(y_localtest_label, y_pred_localtest)

### Predict for normalized test set

In [None]:
# Use clf_norm, the model fitted on normalized features (clf was fitted on the raw ones).
# IsolationForest.predict returns -1 for outliers and +1 for inliers; map to 1 = outlier, 0 = inlier
y_pred_test = (1 - clf_norm.predict(norm_test)) / 2

In [None]:
# Number of positive predictions (entries equal to 1)
y_pred_test.sum()

In [None]:
# rows flagged with 1 receive the outlier target value; every other entry keeps its value
y_pred_test = np.where(y_pred_test == 1, outlier_target, y_pred_test)

In [None]:
# Start from the sample submission and overwrite its target with the predictions
# (values are assigned by position, so the rows must be in the same order as X_test).
submission = pd.read_csv(SAMPLE_SUBMISSION_FILE).assign(target = y_pred_test)
submission.head()

In [None]:
# submission number 1: model trained on the normalized features
NUM = 1
submission.to_csv('{}{}.csv'.format(SUBMIT_FILENAME, NUM), index = False)