In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore') 
import numpy as np

import missingno as msno
from tqdm import tqdm_notebook

In [2]:
# Read the four CSV inputs (identity and transaction tables for the train and
# test splits); the file order below matches the order of the names on the left.
train_iden, train_tran, test_iden, test_tran = map(
    pd.read_csv,
    (
        "train_identity.csv",
        "train_transaction.csv",
        "test_identity.csv",
        "test_transaction.csv",
    ),
)

In [3]:
# Columns kept for modelling: the target, three categorical features, and the
# numeric transaction / card / address / C / D / V features listed below.
train_sample = train_tran[[
    'isFraud',
    'ProductCD', 'card4', 'card6',  # categorical features (one-hot encoded later)
    'TransactionDT', 'TransactionAmt',
    'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2',
    'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',
    'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14',
    'D1', 'D10', 'D15',

    'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23',
    'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34',

    'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105',
    'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114',
    'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123',
    'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133',
    'V134', 'V135', 'V136', 'V137',

    'V279', 'V280', 'V284', 'V285', 'V286', 'V287', 'V290', 'V291', 'V292',
    'V293', 'V294', 'V295', 'V297', 'V298', 'V299', 'V303', 'V304', 'V305', 'V306',
]]

# Fill missing numeric values with the column mean. numeric_only=True is needed
# because the frame still contains the string-valued categorical columns: without
# it, recent pandas versions raise a TypeError instead of silently skipping them.
# Missing values in the categorical columns are left as NaN here.
sample = train_sample.fillna(train_sample.mean(numeric_only=True))

In [4]:
def dummy_data(data, columns):
    """One-hot encode the given columns of a DataFrame.

    For each name in ``columns`` (processed in order) the column is removed
    and its indicator columns, named ``<column>_<value>``, are appended on the
    right. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to encode.
    columns : iterable
        Labels of the columns to replace with indicator columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame (``data`` itself when ``columns`` is empty).
    """
    encoded = data
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        remaining = encoded.drop(name, axis=1)
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded

In [5]:
# Replace the three categorical columns of the training frame with one-hot
# indicator columns. Note: this rebinds `sample`, so the cell cannot be re-run
# without first re-running the cell that creates `sample`.
dummy_columns = ['ProductCD', 'card4', 'card6']
sample = dummy_data(data=sample, columns=dummy_columns)

In [6]:
# Display the column index of the encoded training frame to check the one-hot columns.
sample.columns

Index(['isFraud', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3',
       'card5', 'addr1', 'addr2', 'C1',
       ...
       'ProductCD_S', 'ProductCD_W', 'card4_american express',
       'card4_discover', 'card4_mastercard', 'card4_visa', 'card6_charge card',
       'card6_credit', 'card6_debit', 'card6_debit or credit'],
      dtype='object', length=124)

In [7]:
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import * 
import imblearn.under_sampling as usam

Using TensorFlow backend.


In [8]:
# Separate the target from the predictors.
Y = sample['isFraud']
X = sample.drop(columns='isFraud')
feature_names = list(X.columns)  # column labels as a plain Python list

# Hold out 10% of the rows. shuffle=False keeps the original row order, so the
# held-out part is the tail of the frame (random_state has no effect without shuffling).
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X,
    Y,
    test_size=0.1,
    shuffle=False,
    random_state=13,
)

In [9]:
# Balance the classes by randomly discarding majority-class rows from the
# training split. The sampler is seeded so that the resampled set (and therefore
# the trained model) is the same on every run.
rus = usam.RandomUnderSampler(random_state=13)
# fit_resample replaces the deprecated fit_sample alias, which current
# imbalanced-learn releases no longer provide.
undersampled_X, undersampled_Y = rus.fit_resample(Train_X, Train_Y)

In [10]:
import lightgbm as lgb

# Wrap the undersampled training split and the held-out split as LightGBM datasets.
train_ds = lgb.Dataset(data=undersampled_X, label=undersampled_Y)
test_ds = lgb.Dataset(data=Test_X, label=Test_Y)

In [11]:
# Parameter reference: https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {'learning_rate': 0.01,
          'max_depth': 16,
          'boosting': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'is_training_metric': True,
          'num_leaves': 144,
          'feature_fraction': 0.9,
          'bagging_fraction': 0.7,
          'bagging_freq': 5,
          'seed': 13,
          'device': 'cpu'}

# Train for at most 1000 rounds, stopping once the validation AUC has not improved
# for 100 rounds. Early stopping and logging are passed as callbacks because the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train in LightGBM 4.
# NOTE(review): test_ds drives early stopping AND is the set scored below, so the
# printed AUC is an optimistic estimate.
model = lgb.train(
    params,
    train_ds,
    num_boost_round=1000,
    valid_sets=[test_ds],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=1000),
    ],
)

# Score the held-out split with the best iteration found by early stopping.
predicted = model.predict(Test_X, num_iteration=model.best_iteration)
print(roc_auc_score(Test_Y, predicted))

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[605]	valid_0's auc: 0.917992
0.9179918785194267


In [12]:
# `sklearn.externals.joblib` was removed from scikit-learn; use the standalone
# joblib package and fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the trained model, then reload it from disk so that the following
# cells use the saved artefact.
joblib.dump(model, 'under_sampling_model.pkl')
# joblib files are pickles: only load files produced by a trusted source
# (here, the file written on the line above).
model = joblib.load('under_sampling_model.pkl')



In [14]:
# Select exactly the feature columns used for training (everything in
# train_sample except the target), in the same order, instead of repeating the
# long column list by hand.
feature_columns = [col for col in train_sample.columns if col != 'isFraud']
test_sample = test_tran[feature_columns]

# Impute missing numeric values with the TRAINING column means so the test rows
# are preprocessed with the same statistics as the training rows. numeric_only=True
# skips the string-valued categorical columns, which would otherwise make recent
# pandas versions raise a TypeError.
train_means = train_sample[feature_columns].mean(numeric_only=True)
sample = test_sample.fillna(train_means)

In [15]:
# One-hot encode the categorical columns of the test frame.
dummy_columns = ['ProductCD', 'card4', 'card6']
sample = dummy_data(sample, dummy_columns)

# The categories present in the test data need not match those seen in training.
# Align the encoded frame with the training feature list: indicator columns that
# are missing are added as all-zero columns, columns unknown to the model are
# dropped, and the column order is made identical to the training order.
sample = sample.reindex(columns=feature_names, fill_value=0)

In [16]:
# Score the preprocessed test frame with the reloaded model; this rebinds
# `predicted`, replacing the held-out-split predictions computed earlier.
predicted = model.predict(sample)

In [17]:
# Build the submission table: one row per test transaction, holding its ID and
# the model's prediction for that row.
submission = test_tran[['TransactionID']].assign(isFraud=predicted)
# Show the number of rows as a sanity check.
len(submission)

506691

In [18]:
# Write the submission table to submission.csv without the DataFrame index column.
submission.to_csv('submission.csv', index = False)