In [3]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
%matplotlib inline
import seaborn as sns

RSEED = 50

# Load Data

In [4]:
# Load sample training data
df_train_transac = pd.read_csv('./data/train_transaction.csv')
df_train_identity = pd.read_csv('./data/train_identity.csv')
df_train = pd.merge(df_train_transac,df_train_identity,on='TransactionID',how='left')

In [5]:
# Load sample training data
df_test_transac = pd.read_csv('./data/test_transaction.csv')
df_test_identity = pd.read_csv('./data/test_identity.csv')
df_test = pd.merge(df_test_transac,df_test_identity,on='TransactionID',how='left')

In [6]:
# combine train and test
df_total = df_train.append(df_test,sort=False)

In [7]:
df_total.shape

(1097231, 434)

# Feature Engineer

In [8]:
# clean Pemail
df_total['P_emaildomain'] = df_total['P_emaildomain'].str.split('.',expand=True)[0]

In [9]:
# clean R_emaildomain
df_total['R_emaildomain'] = df_total['R_emaildomain'].str.split('.',expand=True)[0]

In [13]:
df_total['id_30'] = df_total['id_30'].str.split(' ',expand=True)[0]

In [10]:
def clean_id31(df):
    df['id_31'] = df['id_31'].str.replace("([0-9\.])", "")
    df['id_31'][df['id_31'].str.contains('chrome', regex=False)==True] = 'chrome'
    df['id_31'][df['id_31'].str.contains('Samsung', regex=False)==True] = 'Samsung'
    df['id_31'][df['id_31'].str.contains('samsung', regex=False)==True] = 'Samsung'
    df['id_31'][df['id_31'].str.contains('firefox', regex=False)==True] = 'firefox'
    df['id_31'][df['id_31'].str.contains('safari', regex=False)==True] = 'safari'
    df['id_31'][df['id_31'].str.contains('opera', regex=False)==True] = 'opera'
    df['id_31'] = df['id_31'].str.replace(" ", "")
    return df
    

In [11]:
df_total = clean_id31(df_total)

In [40]:
def label_encoder(df, categorical_columns=None):
    """Encode categorical values as integers (0,1,2,3...) with pandas.factorize. """
    # if categorical_colunms are not given than treat object as categorical features
    if not categorical_columns:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    for col in categorical_columns:
        df[col], uniques = pd.factorize(df[col])
    return df, categorical_columns

In [41]:
'''
to_encode = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 
             'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 
             'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 
             'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
             '''

df_total,colname = label_encoder(df_total, categorical_columns=None)

# Train Model

## Train Test Split

In [42]:
features_train = df_total[df_total['isFraud'].notnull()]
features_test = df_total[df_total['isFraud'].isnull()]

In [43]:
print(features_train.shape)
print(features_test.shape)

(590540, 434)
(506691, 434)


## Prepare Train Set

In [44]:
labels_train = features_train['isFraud']
features_train = features_train.drop(columns = ['isFraud', 'TransactionID'])

In [46]:
categorical = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_emaildomain','R_emaildomain','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo']
ids = [ 'id_%s'%(i) for i in range(12,39)]

In [48]:
categorical = categorical + ids

In [53]:
# Create a  lgb training set
train_set = lgb.Dataset(features_train, label = labels_train.values,
                        categorical_feature=categorical)

## Cross Validate

In [54]:
# Find default hyperparameters
model = lgb.LGBMClassifier()
params = model.get_params()

In [55]:
cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc', 
                        early_stopping_rounds = 100, seed = RSEED, nfold = 5)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


In [63]:
print('Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))


Cross Validation ROC AUC: 0.92753 with std: 0.00128.


In [58]:
model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=RSEED)
model.fit(features_train, labels_train.values)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=50, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Feature Importance

In [64]:
fi = pd.DataFrame({'feature': features_train.columns, 
                   'importance': model.feature_importances_})
fi = fi.sort_values('importance', ascending = False)

In [66]:
fi[fi.importance > 0]

Unnamed: 0,feature,importance
9,addr1,129
4,card2,124
3,card1,123
1,TransactionAmt,118
0,TransactionDT,102
30,D2,83
13,P_emaildomain,80
27,C13,79
43,D15,66
7,card5,65


## Predict

In [59]:
id_test = features_test['TransactionID']
features_test = features_test.drop(columns = ['isFraud', 'TransactionID'])

In [60]:
# Make predictions on the testing data
preds = model.predict_proba(features_test)[:, 1]
submission = pd.DataFrame({'TransactionID': id_test, 
                                'isFraud': preds})

In [62]:
submission.to_csv('./data/sub_baseline.csv', index = False)