In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Basics
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import re

# Data Preprocessing and Model Training
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost.sklearn import XGBClassifier

np.random.seed(1)

## Merging Dataset

In [0]:
wd = '/content/drive/My Drive/Colab Notebooks/Kaggle/IEEE_FDS/data/'
# train_transaction = pd.read_csv(wd+'train_transaction.csv', index_col='TransactionID')
# test_transaction = pd.read_csv(wd+'test_transaction.csv', index_col='TransactionID')

# train_identity = pd.read_csv(wd+'train_identity.csv', index_col='TransactionID')
# test_identity = pd.read_csv(wd+'test_identity.csv', index_col='TransactionID')

# sample_submission = pd.read_csv(wd+'sample_submission.csv', index_col='TransactionID')

# train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
# test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
# del train_transaction, test_transaction, train_identity, test_identity

# train.to_csv(wd+'train_merged.csv', index=False)
# test.to_csv(wd+'test_merged.csv')

# del train, test, sample_submission

## Read Dataset

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/Kaggle/IEEE_FDS/data/train_merged.csv'
df = pd.read_csv(path) # original shape: 590540, 433
print(df.shape)

(590540, 433)


In [0]:
## Function to reduce the memory size of the DataFrame
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (
                start_mem - end_mem) / start_mem))
    return df


df = reduce_mem_usage(df)

Mem. usage decreased to 643.72 Mb (67.0% reduction)


In [0]:
# List of columns to drop
many_null_cols = [col for col in df.columns if df[col].isnull().sum() / df.shape[0] > 0.9]
big_top_value_cols = [col for col in df.columns if df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
cols_to_drop = list(set(many_null_cols + big_top_value_cols))
cols_to_drop.remove('isFraud')
print("Columns to be dropped during Column Transformer. {}".format(len(cols_to_drop)))

Columns to be dropped during Column Transformer. 66


In [0]:
y = df['isFraud']
X = df.drop('isFraud', axis=1)
del df

In [0]:
# Borrowed from https://www.kaggle.com/dimartinot
def clean_inf_nan(df):
    return df.replace([np.inf, -np.inf], np.nan)


# Cleaning infinite values to NaN
X = clean_inf_nan(X)

In [0]:
def manual_feature_engineering(X):
    # Change 'id_33' to screen_size (simplify by its size)
    X['ScreenSize'] = X['id_33'].str.split('x', n=2, expand=True)[0].astype(np.float32) * \
                       X['id_33'].str.split('x', n=2, expand=True)[1].astype(np.float32)
    del X['id_33']

    # Transform DeviceInfo column which has too many categorical values
    X['zz_di_Windows'] = X['DeviceInfo'].str.contains('Windows')
    X['zz_di_RV'] = X['DeviceInfo'].str.contains('rv')
    X['zz_di_iOS'] = X['DeviceInfo'].str.contains('iOS')
    X['zz_di_MacOS'] = X['DeviceInfo'].str.contains('MacOS')
    X['zz_di_Trident'] = X['DeviceInfo'].str.contains('Trident')
    X['zz_di_SAMSUNG'] = X['DeviceInfo'].str.contains('SAMSUNG')
    X['zz_di_SM'] = X['DeviceInfo'].str.contains('SM-')
    X['zz_di_Huawei'] = X['DeviceInfo'].str.contains('Huawei', flags=re.IGNORECASE, regex=True)
    X['zz_di_Moto'] = X['DeviceInfo'].str.contains('Moto')
    del X['DeviceInfo']

    # New columns related to user agent
    X['zz_ua_ie_PC'] = (X['id_31'].str.contains('ie'))
    X['zz_ua_safari'] = (X['id_31'].str.contains('safari'))
    X['zz_ua_chrome'] = (X['id_31'].str.contains('chrome'))
    X['zz_ua_edge'] = X['id_31'].str.contains('edge')
    X['zz_ua_ff'] = X['id_31'].str.contains('firefox')
    X['zz_ua_tab'] = X['id_31'].str.contains('tablet')
    X['zz_ua_opera'] = X['id_31'].str.contains('opera')
    X['zz_ua_android'] = X['id_31'].str.contains('android')
    X['zz_ua_mobile'] = X['id_31'].str.contains('mobile')
    del X['id_31']
    return X


X = manual_feature_engineering(X)

zz = [col for col in X.columns if 'zz_' in col]

# Replace {True, False, NaN} to {1, 0, -1} in the columns created above
def bool_to_int(df):
    return df.replace(True, 1).replace(False, 0).fillna(-1).astype(np.int8)

X.loc[:, zz] = bool_to_int(X.loc[:, zz])

In [0]:
# List of categorical features
cat_features = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
                'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12',
                'id_15', 'id_16', 'id_28', 'id_29', 'id_30',
                'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
                'DeviceType']
# List of numerical features
num_features = list(set(X.columns) - set(cols_to_drop) - set(cat_features) - set(zz))

# Define Column Transformer
num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=-9)),
                                  ('scaler', RobustScaler())
                                 ])
cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                  ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])
ctf = ColumnTransformer(transformers=[("numerical", num_transformer, num_features),
                                               ("categorical", cat_transformer, cat_features),
                                               ("zz", "passthrough", zz),
                                               ("cols_to_drop", "drop", cols_to_drop)],
                        verbose=True)

ctf.fit(X=X)

print("Shape before ColumnTransformer. | {}".format(X.shape))
X = ctf.transform(X)
print("Shape after ColumnTransformer. | {}".format(X.shape))

[ColumnTransformer] ..... (1 of 3) Processing numerical, total=  15.0s
[ColumnTransformer] ... (2 of 3) Processing categorical, total=   5.6s
[ColumnTransformer] ............ (3 of 3) Processing zz, total=   0.0s
Shape before ColumnTransformer. | (590540, 448)
Shape after ColumnTransformer. | (590540, 632)


## Model

In [0]:
# XGBoost Model
xgb_model = XGBClassifier(scale_pos_weight=(1 - y.mean()), 
                          tree_method='gpu_hist')
param_grid = {
        "model__early_stopping_rounds": [3, 5, 7],
        "model__max_depth": [4, 5, 6],
        "model__min_child_weight": [4, 5, 6],
        "model__n_estimators": [20, 25, 30]
          }
gs = GridSearchCV(estimator=xgb_model, param_grid=param_grid, verbose=2,
                  n_jobs=1, cv=5, scoring="roc_auc")

# Sampling for machine's memory reason (only for hyperparameter search)
idx = np.random.choice(X.shape[0], 100000, replace=False)
idx = sorted(idx)
samp_X = X[idx, :]
samp_y = y[idx]

start_time = time.time()
gs.fit(samp_X, samp_y)
elapsed_time = time.time() - start_time
print("Time Elapsed: " , time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
# Check out best parameters and score
print(gs.best_params_)
print(gs.best_score_)  # before: 0.85

# Retraining with the best parameters learned
xgb_model.set_params(**gs.best_params_)
xgb_model.fit(X, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20, total=   2.9s
[CV] model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV]  model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20, total=   2.6s
[CV] model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20 
[CV]  model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20, total=   2.8s
[CV] model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20 
[CV]  model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20, total=   2.7s
[CV] model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20 
[CV]  model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=20, total=   2.5s
[CV] model__early_stopping_rounds=3, model__max_depth=4, model__min_child_weight=4, model__n_estimators=25 
[CV]  model__early_stopping_rounds=3, model__max_depth=4, model__min_child_w

[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed: 18.2min finished


Time Elapsed:  00:18:14
{'model__early_stopping_rounds': 3, 'model__max_depth': 4, 'model__min_child_weight': 4, 'model__n_estimators': 20}
0.860143586232301


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, model__early_stopping_rounds=3,
              model__max_depth=4, model__min_child_weight=4,
              model__n_estimators=20, n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=0.9650099908558268, seed=None,
              silent=None, subsample=1, tree_method='gpu_hist', verbosity=1)

In [0]:
test_path = '/content/drive/My Drive/Colab Notebooks/Kaggle/IEEE_FDS/data/test_merged.csv'
test = pd.read_csv(test_path)
print(test.shape)

(506691, 433)


In [0]:
test_X = test.drop(['TransactionID'], axis=1)
test_X = reduce_mem_usage(test_X)
test_X = clean_inf_nan(test_X)
test_X = manual_feature_engineering(test_X)
test_X.loc[:, zz] = bool_to_int(test_X.loc[:, zz])
test_X = ctf.transform(test_X)

Mem. usage decreased to 559.57 Mb (66.5% reduction)


In [0]:
# Make prediction on the test set
test_y_pred = xgb_model.predict(test_X)

In [0]:
submission = pd.DataFrame({'TransactionID':test['TransactionID'],'isFraud':test_y_pred})

In [0]:
submission.to_csv(wd+'submission.csv', index=False)