https://www.kaggle.com/xhlulu/ieee-fraud-xgboost-with-gpu-fit-in-40s

In [None]:
import re
import pandas as pd
import numpy as np
import multiprocessing
import gc
from tqdm import tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    For every column whose dtype is one of int16/int32/int64/float16/
    float32/float64, the column's min and max are compared against the
    representable range of each candidate dtype (narrowest first) and the
    column is cast to the first candidate that contains both. Bounds are
    inclusive, so a column whose extreme value sits exactly on a dtype limit
    (e.g. max == 127) still gets the narrow type.

    Columns of any other dtype (object, bool, int8, ...) are left untouched.
    Float columns that fit no narrower type (including all-NaN columns, whose
    min/max are NaN) are cast to float64.

    NOTE(review): casting to float16 keeps the value range but reduces
    precision; confirm this is acceptable for every downstream consumer.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print the resulting size and percentage saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        is_int = str(col_type)[:3] == 'int'
        if is_int:
            candidates = (np.int8, np.int16, np.int32, np.int64)
            limits_of = np.iinfo
        else:
            candidates = (np.float16, np.float32)
            limits_of = np.finfo
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No candidate fits: floats fall back to float64, integer columns
            # (only possible when min/max are NaN) keep their dtype.
            if not is_int:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read the sample submission and the raw transaction / identity tables.
sub = pd.read_csv('../input/sample_submission.csv')
train_tr = pd.read_csv('../input/train_transaction.csv')
train_id = pd.read_csv('../input/train_identity.csv')
test_tr = pd.read_csv('../input/test_transaction.csv')
test_id = pd.read_csv('../input/test_identity.csv')

# Left joins on TransactionID keep every transaction row; transactions with
# no identity record get NaN in the identity columns.
train = train_tr.merge(train_id, on='TransactionID', how='left')
test = test_tr.merge(test_id, on='TransactionID', how='left')

# The source tables are no longer needed once merged; release them.
del train_tr, train_id, test_tr, test_id
gc.collect()

# Downcast numeric columns to cut the memory footprint.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
def select_cols_to_drop(df):
    """Return the names of low-information columns in ``df``.

    Three criteria are evaluated for every column:
      * at most one distinct non-null value,
      * more than 90% of the rows are null,
      * the most frequent value (NaN counted as a value) covers more than
        90% of the rows.

    The result is the three per-criterion lists concatenated in that order,
    so a column matching several criteria appears several times.
    """
    n_rows = df.shape[0]
    constant, mostly_missing, dominated = [], [], []
    for name in df.columns:
        series = df[name]
        if series.nunique() <= 1:
            constant.append(name)
        if series.isnull().sum() / n_rows > 0.9:
            mostly_missing.append(name)
        # value_counts sorts by frequency, so the first entry is the top share.
        top_share = series.value_counts(dropna=False, normalize=True).values[0]
        if top_share > 0.9:
            dominated.append(name)
    return constant + mostly_missing + dominated

In [None]:
# Union of the drop candidates found in either frame.
train_candidates = select_cols_to_drop(train)
test_candidates = select_cols_to_drop(test)
cols_to_drop = list(set(train_candidates) | set(test_candidates))
# The target must survive; list.remove raises ValueError if 'isFraud' was
# not among the candidates.
cols_to_drop.remove('isFraud')

train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [None]:
import re

def select_columns_regexp(df, regexp):
    """Return the column names of ``df`` that contain a match for ``regexp``.

    Matching uses search semantics, so the pattern may match anywhere in the
    name unless it is anchored. Column order is preserved.
    """
    matches = re.compile(regexp).search
    return list(filter(matches, df.columns))

In [None]:
# Group feature columns by naming pattern (patterns are unanchored searches).
# NOTE(review): r'id\d+' does not match names of the form 'id_01'; confirm
# the identity column naming if id_cols is expected to be non-empty.
card_cols, C_cols, D_cols, V_cols, M_cols, id_cols = (
    select_columns_regexp(train, pattern)
    for pattern in (
        r'card\d+',
        r'C\d+',
        r'D\d+',
        r'V\d+',
        r'M\d+',
        r'id\d+',
    )
)

In [None]:
# count encoding
for card_col in card_cols:
    # Frequency of every value (NaN included) over train and test combined.
    # It is identical for both frames, so compute it once per column instead
    # of re-concatenating and re-counting for each assignment.
    full_counts = pd.concat(
        [train[card_col], test[card_col]], ignore_index=True
    ).value_counts(dropna=False)
    train[f'{card_col}_count_full'] = train[card_col].map(full_counts)
    test[f'{card_col}_count_full'] = test[card_col].map(full_counts)

# The raw card columns are replaced by their count-encoded versions.
train = train.drop(card_cols, axis=1)
test = test.drop(card_cols, axis=1)

In [None]:
# Add the natural log of the transaction amount to both frames.
# NOTE(review): assumes TransactionAmt is strictly positive - confirm.
for frame in (train, test):
    frame['TransactionAmtLog'] = np.log(frame['TransactionAmt'])

In [None]:
def add_dt_info(df):
    """Add 'hour', 'minute', 'second' and 'dayofweek' columns to ``df`` in place.

    'TransactionDT' is interpreted as a number of seconds and converted with
    ``pd.to_datetime(..., unit='s')``; the new columns are the corresponding
    datetime components. Nothing is returned.
    """
    components = pd.to_datetime(df['TransactionDT'], unit='s').dt
    for part in ('hour', 'minute', 'second', 'dayofweek'):
        df[part] = getattr(components, part)

In [None]:
# Add the time-component columns to both frames (in-place mutation).
for frame in (train, test):
    add_dt_info(frame)

In [None]:
# Order the training rows chronologically once and derive both the feature
# matrix and the target from the same sorted frame, instead of sorting twice.
train = train.sort_values('TransactionDT')
X_train = train.drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y_train = train['isFraud']

# Keep the test rows in their loaded order. Predictions are later written
# into the submission frame positionally (sub['isFraud'] = y_pred), and the
# default sort is not stable, so sorting here could permute rows that share a
# TransactionDT and attach predictions to the wrong TransactionID.
# NOTE(review): assumes sample_submission.csv lists transactions in the same
# order as test_transaction.csv - confirm.
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1)

del train, test
gc.collect()

In [None]:
from sklearn.preprocessing import LabelEncoder

# fill na in float columns
# Series.fillna returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the frame unchanged.
float_dtypes = ['float16', 'float32', 'float64']
for col in X_train.columns:
    if X_train[col].dtype in float_dtypes:
        # Compute the train mean once, in float64 so that summing a
        # half-precision column cannot overflow, and reuse it for both frames.
        col_mean = X_train[col].astype('float64').mean()
        # Columns without a finite mean (e.g. all-NaN) are left for the
        # sentinel fill below.
        if np.isfinite(col_mean):
            X_train[col] = X_train[col].fillna(col_mean)
            X_test[col] = X_test[col].fillna(col_mean)

# fill in -999 for categorical columns
# (every NaN still present after the float imputation gets the sentinel)
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

# Label-encode object columns; the encoder is fitted on the train and test
# values together so that transform never meets an unseen label.
for col in tqdm_notebook(X_train.columns):
    if X_train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(X_train[col].astype(str).values) + list(X_test[col].astype(str).values))
        X_train[col] = le.transform(X_train[col].astype(str).values)
        X_test[col] = le.transform(X_test[col].astype(str).values)

In [None]:
# Seed shared by the model configuration.
SEED = 42

# Keyword arguments passed to xgb.XGBClassifier for every fold.
model_params = {
    'objective': 'binary:logistic',
    'n_estimators': 10000,
    'max_depth': 9,
    'learning_rate': 0.05,
    'reg_alpha': 0.15,
    # Was misspelled 'reg_lamdba', which is not the L2 regularisation
    # parameter name, so the intended value was never applied.
    'reg_lambda': 0.85,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    # Sentinel written by the preprocessing step for missing values.
    'missing': -999,
    'random_state': SEED,
    'tree_method': 'gpu_hist',
}

[sklearn.model_selection.TimeSeriesSplit — scikit-learn 0.21.3 documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)

In [None]:
import time
from datetime import datetime, timedelta
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# folds = TimeSeriesSplit(n_splits=5)
# random_state only has an effect together with shuffle=True. Without
# shuffling it changes nothing, and recent scikit-learn releases raise a
# ValueError when it is passed, so it is omitted; the folds stay unshuffled.
folds = StratifiedKFold(n_splits=5)

aucs = []
feature_importances = pd.DataFrame()

# Test predictions, averaged over the fold models.
y_pred = np.zeros(X_test.shape[0])

training_start_time = time.time()
for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    start_time = time.time()
    print('Training on fold {}'.format(fold + 1))

    # The held-out fold drives early stopping and supplies the reported AUC.
    fit_params = {
        'eval_set': [(X_train.iloc[val_idx], y_train.iloc[val_idx])],
        'eval_metric': 'auc',
        'verbose': 100,
        'early_stopping_rounds': 100,
    }

    model = xgb.XGBClassifier(**model_params)
    model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx], **fit_params)
    # Each fold contributes an equal share of the final test prediction.
    y_pred += model.predict_proba(X_test)[:, 1] / folds.n_splits

    feature_importances['fold_{}'.format(fold + 1)] = model.feature_importances_
    aucs.append(model.best_score)

    print('Fold {} finished in {}'.format(fold + 1, str(timedelta(seconds=time.time() - start_time))))

print('-' * 30)
print('Training has finished.')
print('Total training time is {}'.format(str(timedelta(seconds=time.time() - training_start_time))))
print('Mean AUC:', np.mean(aucs))
print('-' * 30)

# Best iteration of the last fold's model only.
best_iteration = model.best_iteration

In [None]:
# Average each feature's importance across the fold models.
mean_feature_importances = feature_importances.mean(axis=1).values

# Positions of the 30 largest mean importances, largest first.
descending_order = np.argsort(mean_feature_importances)[::-1]
top_n = descending_order[:30]

fig, ax = plt.subplots(figsize=(10, 12))
sns.barplot(
    x=mean_feature_importances[top_n],
    y=X_train.columns[top_n],
    ax=ax,
)
ax.set_title('Top 30 Features')

In [None]:
# Timestamped file name, so each run writes its own submission file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
fname = f'submission_{timestamp}.csv'

# y_pred is a plain array, so it is attached to the rows by position.
sub = sub.assign(isFraud=y_pred)
sub.to_csv(fname, index=False)

In [None]:
from IPython.display import FileLink
# Left as the cell's final bare expression so the notebook displays the
# resulting link to the written submission file.
FileLink(fname)