In [1]:
import sys
sys.path.insert(0, '../')

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import random
import catboost as cb
import lightgbm as lgb
import gc
import pickle
from tqdm import tqdm
from columns import count_cols, card_cols, match_cols
from feature import make_categorical_feature_v3, make_categorical_feature_v4
from utils import df_size, reduce_mem_usage_sd
from columns import nonull_vxxx_cols
from catboost import CatBoostClassifier,Pool
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from utils import feature_test

In [3]:
seed = 10
np.random.seed(seed)
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)

### read data

In [4]:
sample_submission = pd.read_csv('/data/ieee/sample_submission.csv.zip', index_col='TransactionID', compression='zip')

In [None]:
if False:
#     train_transaction = pd.read_csv('/data/ieee/train_transaction.csv.zip', index_col='TransactionID', compression='zip')
#     test_transaction = pd.read_csv('/data/ieee/test_transaction.csv.zip', index_col='TransactionID', compression='zip')
#     train_identity = pd.read_csv('/data/ieee/train_identity.csv.zip', index_col='TransactionID', compression='zip')
#     test_identity = pd.read_csv('/data/ieee/test_identity.csv.zip', index_col='TransactionID', compression='zip')
    train_transaction = pd.read_csv('/data/ieee/train_transaction_sample.csv', index_col='TransactionID')
    test_transaction = pd.read_csv('/data/ieee/test_transaction_sample.csv', index_col='TransactionID')
    train_identity = pd.read_csv('/data/ieee/train_identity_sample.csv', index_col='TransactionID')
    test_identity = pd.read_csv('/data/ieee/test_identity_sample.csv', index_col='TransactionID')
    
    train_df = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
    test_df = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
    del train_transaction, train_identity, test_transaction, test_identity
    ### do only this one
    train_df, test_df = make_categorical_feature_v4(train_df, test_df)
else:
    train_df = pd.read_csv('/data/ieee/train_preprocessed_v5.csv')
    test_df = pd.read_csv('/data/ieee/test_preprocessed_v5.csv')

In [None]:
train_df.shape, test_df.shape

#### additional feature engineering

In [7]:
train_df['os_productcd'] = train_df['os'] + '_' + train_df['ProductCD']
test_df['os_productcd'] = test_df['os'] + '_' + test_df['ProductCD']

In [8]:
# global_fraud_rate = train_df.isFraud.mean()
# global_fraud_entropy = train_df['isFraud'].value_counts(normalize=True).apply(lambda x: x*np.log(1/x)).sum()
# global_fraud_rate, global_fraud_entropy

In [9]:
# majority_of_fraud_df = []
# for c in str_categorical_features + int_categorical_features + merged_features:
#     majority_idx = train_df.groupby(c)['isFraud'].mean().sort_values(ascending=False) > (global_fraud_rate*3)
#     rate = (majority_idx).mean()
#     unique = (majority_idx).shape[0]
#     majority_of_fraud_df.append({'column':c, 'unique':unique, 'majority':rate})
# majority_of_fraud_df = pd.DataFrame(majority_of_fraud_df)[['column','unique','majority']]

#### reduce dataframe size

In [10]:
df_size(train_df)

Counter({'float64': 524, 'int64': 16, 'object': 32})

In [11]:
%%time
# From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
# WARNING! THIS CAN DAMAGE THE DATA 

train_df = reduce_mem_usage_sd(train_df)
test_df = reduce_mem_usage_sd(test_df)

100%|██████████| 572/572 [03:51<00:00,  2.47it/s]


Mem. usage decreased from 3327.71 Mb to 1829.08 Mb (45.0% reduction)


100%|██████████| 571/571 [03:12<00:00,  2.97it/s]


Mem. usage decreased from 2859.12 Mb to 1615.31 Mb (43.5% reduction)
CPU times: user 4min 9s, sys: 3min 26s, total: 7min 35s
Wall time: 7min 33s


In [12]:
df_size(train_df)

Counter({'float16': 312,
         'float32': 150,
         'float64': 48,
         'int16': 15,
         'int32': 2,
         'int8': 13,
         'object': 32})

In [13]:
# train_df.to_csv('/data/ieee/train_preprocessed_v6.csv.zip', index=False, compression='zip')
# test_df.to_csv('/data/ieee/test_preprocessed_v6.csv.zip', index=False, compression='zip')

In [14]:
def get_too_many_null_features(data):
    many_null_cols = [col for col in data.columns if data[col].isnull().mean() > 0.9]
    return many_null_cols

In [15]:
too_many_null_features = get_too_many_null_features(train_df)
too_many_null_features

['dist2',
 'D7',
 'id_07',
 'id_08',
 'id_18',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'card1_mean_id_03',
 'card1_std_id_03',
 'card1_mean_id_04',
 'card1_std_id_04',
 'card2_mean_id_04',
 'card2_std_id_04',
 'addr1_mean_id_03',
 'addr1_std_id_03',
 'addr1_mean_id_04',
 'addr1_std_id_04',
 'addr2_mean_id_03',
 'addr2_std_id_03',
 'addr2_mean_id_04',
 'addr2_std_id_04']

In [16]:
train_df = train_df.drop(columns=too_many_null_features)
test_df = test_df.drop(columns=too_many_null_features)
train_df.shape, test_df.shape

((590540, 546), (506691, 545))

In [17]:
def get_too_many_repeated_values(data):
    big_top_value_cols = [col for col in data.columns if data[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
    return big_top_value_cols

In [18]:
too_many_repeated_value_features = get_too_many_repeated_values(train_df)
if 'isFraud' in too_many_repeated_value_features:
    too_many_repeated_value_features.remove('isFraud')
too_many_repeated_value_features[:5]

['C3', 'V98', 'V101', 'V102', 'V103']

In [19]:
too_many_repeated_value_features

['C3',
 'V98',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V129',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V281',
 'V284',
 'V286',
 'V290',
 'V293',
 'V295',
 'V296',
 'V297',
 'V298',
 'V299',
 'V300',
 'V301',
 'V305',
 'V309',
 'V311',
 'V316',
 'V318',
 'V319',
 'V320',
 'V321']

In [20]:
train_df = train_df.drop(columns=too_many_repeated_value_features)
test_df = test_df.drop(columns=too_many_repeated_value_features)
train_df.shape, test_df.shape

((590540, 492), (506691, 491))

In [21]:
train_df.columns.tolist()

['isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70

In [22]:
train_df.shape

(590540, 492)

In [59]:
from columns import nonull_vxxx_cols, delta_cols, match_cols, count_cols

features = [c for c in train_df.columns if 'mean' in c or 'std' in c or 'weight' in c]
features += ['card1','card2','card3','card5'] + ['ProductCD','DeviceType'] + ['P_emaildomain','R_emaildomain']
features += ['card1_card2','card1_productcd','card1_id_31','card1_id_15','card4_DeviceType','weekday_hour','os_productcd']
features += ['card1_count','card2_count','ProductCD_count']
features += ['os','browser','device']
features += ['hour', 'weekday']
features += ['addr1','addr2']
features += [c for c in nonull_vxxx_cols if c in train_df.columns]
features += [c for c in delta_cols if c in train_df.columns]
features += [c for c in match_cols if c in train_df.columns]
features += [c for c in count_cols if c in train_df.columns]
features += ['DeviceInfo', 'TransactionAmt']
len(features), sorted(features)

(189,
 ['C1',
  'C10',
  'C11',
  'C12',
  'C13',
  'C14',
  'C2',
  'C4',
  'C5',
  'C6',
  'C7',
  'C8',
  'C9',
  'D1',
  'D10',
  'D11',
  'D12',
  'D13',
  'D14',
  'D15',
  'D2',
  'D3',
  'D4',
  'D5',
  'D6',
  'D8',
  'D9',
  'DeviceInfo',
  'DeviceType',
  'DeviceType_mean_D10',
  'DeviceType_mean_D15',
  'DeviceType_mean_D4',
  'DeviceType_mean_id_03',
  'DeviceType_mean_id_04',
  'DeviceType_std_D10',
  'DeviceType_std_D15',
  'DeviceType_std_D4',
  'DeviceType_std_id_03',
  'DeviceType_std_id_04',
  'M1',
  'M2',
  'M3',
  'M4',
  'M5',
  'M6',
  'M7',
  'M8',
  'M9',
  'P_emaildomain',
  'ProductCD',
  'ProductCD_count',
  'R_emaildomain',
  'TransactionAmt',
  'V279',
  'V280',
  'V285',
  'V287',
  'V291',
  'V292',
  'V294',
  'V302',
  'V303',
  'V304',
  'V306',
  'V307',
  'V308',
  'V310',
  'V312',
  'V317',
  'addr1',
  'addr1_mean_D10',
  'addr1_mean_D15',
  'addr1_mean_D4',
  'addr1_mean_TransactionAmt',
  'addr1_mean_id_02',
  'addr1_std_D10',
  'addr1_std_D15

In [60]:
train_df[[c for c in match_cols if c in train_df.columns]].head(5)

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,1,1,1,2,0,1,-1,-1,-1
1,-1,-1,-1,0,1,1,-1,-1,-1
2,1,1,1,0,0,0,0,0,0
3,-1,-1,-1,0,1,0,-1,-1,-1
4,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [61]:
str_categorical_features = ['card1_card2','card1_productcd','card1_id_31','card1_id_15','card4_DeviceType','weekday_hour','os_productcd'] + ['os','browser','device'] + ['ProductCD','DeviceType','DeviceInfo'] + ['P_emaildomain','R_emaildomain']
int_categorical_features = ['card1','card2','card3','card5'] + ['hour', 'weekday'] + ['addr1','addr2'] + ['hour', 'weekday'] 

In [62]:
# it should not print anything
for c in str_categorical_features+int_categorical_features:
    if not c in features:
        print(c, c in features)

In [63]:
pd.set_option('max_column', None)

In [58]:
feature_test(
    train_df,
    features,
    cat_cols=str_categorical_features)

ROC AUC score with 174 columns: 0.9316


In [64]:
not_used_columns = [c for c in train_df.columns if c not in features]
if 'TransactionDT' in not_used_columns:
    not_used_columns.remove('TransactionDT')
if 'isFraud' in not_used_columns:
    not_used_columns.remove('isFraud')
sorted(not_used_columns), '{} columns to remove.'.format(len(not_used_columns))

(['V1',
  'V10',
  'V100',
  'V11',
  'V12',
  'V126',
  'V127',
  'V128',
  'V13',
  'V130',
  'V131',
  'V138',
  'V139',
  'V14',
  'V140',
  'V141',
  'V142',
  'V143',
  'V144',
  'V145',
  'V146',
  'V147',
  'V148',
  'V149',
  'V15',
  'V150',
  'V151',
  'V152',
  'V153',
  'V154',
  'V155',
  'V156',
  'V157',
  'V158',
  'V159',
  'V16',
  'V160',
  'V161',
  'V162',
  'V163',
  'V164',
  'V165',
  'V166',
  'V167',
  'V168',
  'V169',
  'V17',
  'V170',
  'V171',
  'V172',
  'V173',
  'V174',
  'V175',
  'V176',
  'V177',
  'V178',
  'V179',
  'V18',
  'V180',
  'V181',
  'V182',
  'V183',
  'V184',
  'V185',
  'V186',
  'V187',
  'V188',
  'V189',
  'V19',
  'V190',
  'V191',
  'V192',
  'V193',
  'V194',
  'V195',
  'V196',
  'V197',
  'V198',
  'V199',
  'V2',
  'V20',
  'V200',
  'V201',
  'V202',
  'V203',
  'V204',
  'V205',
  'V206',
  'V207',
  'V208',
  'V209',
  'V21',
  'V210',
  'V211',
  'V212',
  'V213',
  'V214',
  'V215',
  'V216',
  'V217',
  'V218',
  'V21

In [68]:
train_df.drop(columns=not_used_columns, inplace=True)
test_df.drop(columns=not_used_columns, inplace=True)

In [69]:
vxxx_float_columns = ['V279','V280','V285','V291','V293','V294','V295','V306','V307','V308','V309','V310','V311','V312','V316','V317','V318','V319','V320','V321']

In [66]:
gc.collect()

318

In [65]:
# this should not print anything
set(features+str_categorical_features+int_categorical_features).difference(set(train_df.columns.tolist()))

set()

In [28]:
for c in str_categorical_features:
    train_df[c] = train_df[c].astype('category')
    test_df[c] = test_df[c].astype('category')

In [35]:
train_df.to_csv('/data/ieee/train_preprocessed_v5_oversampled.csv.zip', index=False, compression='zip')
test_df.to_csv('/data/ieee/test_preprocessed_v5_oversampled.csv.zip', index=False, compression='zip')

#### apply negative downsampling

In [67]:
print(train_df.shape)
negative_sampled_df = pd.concat([train_df[train_df['isFraud']==1], train_df[train_df['isFraud']==0].sample(frac=0.2, random_state=0)]).sample(frac=1, random_state=0)
print(negative_sampled_df.shape)

(590540, 492)
(134638, 492)


#### train xgb classifier

In [69]:
x_train = train_df[features].copy()
y_train = train_df.isFraud
x_test = test_df[features].copy()

In [70]:
# Label Encoding: label-encoding is necessary for xgboost
for c in tqdm(features):
    if str(x_train[c].dtype)=='object' or str(x_train[c].dtype)=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values) + list(x_test[c].values))
        x_train[c] = lbl.transform(list(x_train[c].values))
        x_test[c] = lbl.transform(list(x_test[c].values))

100%|██████████| 189/189 [00:15<00:00, 12.09it/s] 


In [71]:
folds = 3
kf = KFold(n_splits = folds, shuffle = True, random_state=100)

In [72]:
%%time

pred_1 = np.zeros(x_test.shape[0])

for i, (tr_idx, val_idx) in enumerate(kf.split(x_train, y_train)):
    clf = xgb.XGBClassifier(
        n_estimators=800,
        max_depth=15,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.9,
        tree_method='gpu_hist'
    )
    
    # train classifier
    x_tr = x_train.iloc[tr_idx, :]
    y_tr = y_train.iloc[tr_idx]
    clf.fit(x_tr, y_tr)
    
    # test classifier
    x_te = x_train.iloc[val_idx, :]
    y_te = y_train.iloc[val_idx]
    y_pred = clf.predict(x_te)
    try:
        print('ROC: {:.4f}'.format(roc_auc_score(y_te, y_pred)))
    except:
        print('ROC is not defined for one-class value')
          
    pred_1 += clf.predict_proba(x_test)[:,1]/folds
    #del clf

gc.collect()

ROC: 0.7757
ROC: 0.7862
ROC: 0.7906
CPU times: user 4min 39s, sys: 1min 15s, total: 5min 55s
Wall time: 5min 54s


In [48]:
pickle.dump(pred_1, open('pred_1.pkl','wb'))

### write submission

In [49]:
sub2_filename = 'submission_2867.csv'  # sub2_filename is always the best submission
sub2_df = pd.read_csv(sub2_filename)
df = pd.DataFrame({
    'm1': pred_1,
    #'m2': pred_2,
    #'m3': pred_3,
    'y': sub2_df.isFraud
})

In [50]:
df.corr()

Unnamed: 0,m1,y
m1,1.0,0.944193
y,0.944193,1.0


In [16]:
#sample_submission.loc[:, 'isFraud'] = (pred_1 + pred_2 + pred_3)/3
sample_submission.loc[:, 'isFraud'] = (pred_1 + pred_2)/2
#test_df.loc[:, 'isFraud'] = (w1/w)*pred_1 + (w2/w)*pred_2 + (w3/w)*pred_3

In [17]:
sample_submission.shape

(506691, 1)

In [28]:
np.random.seed(np.random.randint(1, 10000))
randidx = str(np.random.randint(0, 10000)).zfill(4)
randidx

'7981'

In [29]:
sample_submission = sample_submission.reset_index()

In [30]:
submission = sample_submission[['TransactionID','isFraud']]

In [31]:
submission.head(5)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.003122
1,3663550,0.017639
2,3663551,0.004293
3,3663552,0.003372
4,3663553,0.001773


In [32]:
submission.to_csv('submission_{}.csv'.format(randidx), index=False)

### see difference

In [33]:
sub1_filename = 'submission_7981.csv'  # sub1_filename is always the latest submission
sub2_filename = 'submission_2867.csv'  # sub2_filename is always the best submission

In [34]:
sub1_df = pd.read_csv(sub1_filename)
sub2_df = pd.read_csv(sub2_filename)
sub1_df.shape, sub2_df.shape

((506691, 2), (506691, 2))

In [35]:
(sub1_df.isFraud - sub2_df.isFraud).apply(lambda x: x**2).mean()

0.005067019416986563

#### 낙서장

In [11]:
df = pd.DataFrame({
    'm1': pred_1,
    'm2': pred_2,
    'm3': pred_3,
    'y': sub2_df.isFraud
})

In [13]:
df.corr()

Unnamed: 0,m1,m2,m3,y
m1,1.0,0.910568,0.878668,0.872243
m2,0.910568,1.0,0.903126,0.892816
m3,0.878668,0.903126,1.0,0.928377
y,0.872243,0.892816,0.928377,1.0


In [None]:
cate