## Kernel
### [LGB Single model [LB 0.9419] | Kaggle](https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419)

### [Data Description (Details and Discussion) | Kaggle : latest-593951](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#latest-593951)

In [3]:
import re
import pandas as pd
import numpy as np
import multiprocessing
import gc
from tqdm import tqdm_notebook
import seaborn as sns

In [4]:
train_id = pd.read_csv('../input/train_identity.csv')
train_tr = pd.read_csv('../input/train_transaction.csv')
test_id = pd.read_csv('../input/test_identity.csv')
test_tr = pd.read_csv('../input/test_transaction.csv')
sub = pd.read_csv('../input/sample_submission.csv')

train = pd.merge(train_tr, train_id, on='TransactionID', how='left')
test = pd.merge(test_tr, test_id, on='TransactionID', how='left')

del test_id, test_tr, train_id, train_tr
gc.collect()

56

In [5]:
def select_cols_to_drop(df):
    one_value_cols = [col for col in df.columns if df[col].nunique() <= 1]
    many_null_cols = [col for col in df.columns if df[col].isnull().sum() / df.shape[0] > 0.9]
    big_top_value_cols = [col for col in df.columns if df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
    return one_value_cols + many_null_cols + big_top_value_cols

In [6]:
cols_to_drop = list(set(select_cols_to_drop(train) + select_cols_to_drop(test)))
cols_to_drop.remove('isFraud')

train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [7]:
train['TransactionAmtLog'] = np.log(train['TransactionAmt'])
test['TransactionAmtLog'] = np.log(test['TransactionAmt'])

In [8]:
def add_dt_info(df):
    dt = pd.to_datetime(df['TransactionDT'], unit='s').dt
    df['hour'] = dt.hour
    df['minute'] = dt.minute
    df['second'] = dt.second
    df['dayofweek'] = dt.dayofweek

In [9]:
add_dt_info(train)
add_dt_info(test)

In [10]:
from sklearn.preprocessing import LabelEncoder

for col in tqdm_notebook(train.columns):
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))

HBox(children=(IntProgress(value=0, max=357), HTML(value='')))




In [11]:
X_train = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y_train = train.sort_values('TransactionDT')['isFraud']
X_test = test.sort_values('TransactionDT').drop(['TransactionDT', 'TransactionID'], axis=1)

In [12]:
def select_columns_regexp(df, regexp):
    return [c for c in df.columns if re.search(regexp, c)]

In [13]:
card_cols = select_columns_regexp(train, r'card\d+')
C_cols = select_columns_regexp(train, r'C\d+')
D_cols = select_columns_regexp(train, r'D\d+')
V_cols = select_columns_regexp(train, r'V\d+')
M_cols = select_columns_regexp(train, r'M\d+')
id_cols = select_columns_regexp(train, r'id\d+')

In [14]:
SEED = 42

model_params = {
    'max_depth': 8,
    'min_child_weight': 0.03,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.4,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.02,
    "boosting_type": 'goss',
    "metric": 'auc',
    "verbosity": -1,
    'reg_alpha': 0.4,
    'reg_lambda': 0.6,
    'random_state': SEED
}

[sklearn.model_selection.TimeSeriesSplit — scikit-learn 0.21.3 documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)

In [15]:
import time
from datetime import datetime, timedelta
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import lightgbm as lgbm

folds = TimeSeriesSplit(n_splits=5)

aucs = list()
feature_importances = pd.DataFrame()
feature_importances['feature'] = X_train.columns

training_start_time = time.time()
for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    start_time = time.time()
    print('Training on fold {}'.format(fold + 1))
    
    dtrain = lgbm.Dataset(X_train.iloc[train_idx], label=y_train.iloc[train_idx])
    dval = lgbm.Dataset(X_train.iloc[val_idx], label=y_train.iloc[val_idx])
    
    fit_params = {
        'num_boost_round': 10000,
        'valid_sets': [dval],
        'verbose_eval': 100,
        'early_stopping_rounds': 100
    }
    
    clf = lgbm.train(model_params, dtrain, **fit_params)
    
    feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
    aucs.append(clf.best_score['valid_0']['auc'])
    
    print('Fold {} finished in {}'.format(fold + 1, str(timedelta(seconds=time.time() - start_time))))

print('-' * 30)
print('Training has finished.')
print('Total training time is {}'.format(str(timedelta(seconds=time.time() - training_start_time))))
print('Mean AUC:', np.mean(aucs))
print('-' * 30)

Training on fold 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.874304
[200]	valid_0's auc: 0.886387
[300]	valid_0's auc: 0.892016
[400]	valid_0's auc: 0.895179
[500]	valid_0's auc: 0.896146
[600]	valid_0's auc: 0.8974
[700]	valid_0's auc: 0.898473
[800]	valid_0's auc: 0.898316
Early stopping, best iteration is:
[720]	valid_0's auc: 0.898533


KeyError: 'auc'

In [None]:
clf = lgbm.LGBMClassifier(**model_params, num_boost_round=clf.best_iteration)
clf.fit(X_train, y_train)

In [None]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
fname = f'submission_{timestamp}.csv'

sub['isFraud'] = clf.predict_proba(X_test)[:, 1]
sample_submission.to_csv(fname, index=False)

In [None]:
from IPython.display import FileLink
FileLink(fname)