# Solution to kaggle HSE Fraud Detection

https://www.kaggle.com/c/hse-2021-fraud-detection#

Author : Kirill Safonov HSE-NES BAE'24

Начало решения взято из baseline : https://colab.research.google.com/drive/1pg1xZa8koJeUaZA35e7g1ISGqNlHDHZW

In [2]:
# from baseline
# скачиваем файлы
'''
!wget --no-check-certificate 'https://www.dropbox.com/s/5iuef7c9ljj84t6/train_transaction.csv?dl=0' -O train_transaction.csv
!wget --no-check-certificate 'https://www.dropbox.com/s/cmy01z5fw7ohlmd/train_identity.csv?dl=0' -O train_identity.csv
!wget --no-check-certificate 'https://www.dropbox.com/s/7thqkuxnwsa7njj/test_transaction.csv?dl=0' -O test_transaction.csv
!wget --no-check-certificate 'https://www.dropbox.com/s/b40nvbb9e2usd5w/test_identity.csv?dl=0' -O test_identity.csv
!wget --no-check-certificate 'https://www.dropbox.com/s/arkyoz0bel8z4d2/sample_submission.csv?dl=0' -O sample_submission.csv
''';

In [4]:
# from baseline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

sns.set()

%matplotlib inline

In [5]:
def reduce_mem_usage(df):
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in tqdm(df.columns):
        if df[col].dtype != object:  # Exclude strings

            # make variables for Int, max and min
            IsInt = False
            col_max_value = df[col].max()
            col_min_value = df[col].min()

            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all(): 
                NAlist.append(col)
                df[col].fillna(col_min_value - 1, inplace=True)

            # test if column can be converted to an integer
            col_as_int = df[col].fillna(0).astype(np.int64)
            diff = (df[col] - col_as_int)
            diff = diff.sum()
            if np.abs(diff) < 0.01:
                IsInt = True

            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if col_min_value >= 0:
                    if col_max_value < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif col_max_value < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif col_max_value < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if col_min_value > np.iinfo(np.int8).min and col_max_value < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif col_min_value > np.iinfo(np.int16).min and col_max_value < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif col_min_value > np.iinfo(np.int32).min and col_max_value < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif col_min_value > np.iinfo(np.int64).min and col_max_value < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)    

            # Make float datatypes 32 bit
            else:
                df[col] = df[col].astype(np.float32)

    return df, NAlist

In [6]:
INPUT_DIR = '.'

train_transaction = pd.read_csv(os.path.join(INPUT_DIR, 'train_transaction.csv'))
train_identity = pd.read_csv(os.path.join(INPUT_DIR, 'train_identity.csv'))
test_transaction = pd.read_csv(os.path.join(INPUT_DIR, 'test_transaction.csv'))
test_identity = pd.read_csv(os.path.join(INPUT_DIR, 'test_identity.csv'))
sample_submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))

df_train = train_transaction.merge(train_identity, how='left', on='TransactionID')
del train_transaction, train_identity
df_train, df_train_NAlist = reduce_mem_usage(df_train)

df_test = test_transaction.merge(test_identity, how='left', on='TransactionID')
del test_transaction, test_identity
df_test, df_test_NAlist = reduce_mem_usage(df_test)

100%|██████████| 434/434 [01:09<00:00,  6.25it/s]
100%|██████████| 433/433 [00:29<00:00, 14.90it/s]


In [7]:
df_train['TransactionAmt_log'] = np.log1p(df_train['TransactionAmt'])
df_test['TransactionAmt_log'] = np.log1p(df_test['TransactionAmt'])

In [8]:
df_train['TransactionAmt_Cents'] = np.modf(df_train['TransactionAmt'])[0] * 100
df_test['TransactionAmt_Cents'] = np.modf(df_test['TransactionAmt'])[0] * 100

In [9]:
for col in ['P_emaildomain', 'R_emaildomain']:
    df_train[col + '_suffix'] = df_train[col].map(lambda x: str(x).split('.')[-1])
    df_test[col + '_suffix'] = df_test[col].map(lambda x: str(x).split('.')[-1])

df_train[['P_emaildomain', 'P_emaildomain_suffix', 'R_emaildomain', 'R_emaildomain_suffix']].tail(10)

Unnamed: 0,P_emaildomain,P_emaildomain_suffix,R_emaildomain,R_emaildomain_suffix
417549,gmail.com,com,,
417550,gmail.com,com,,
417551,att.net,net,,
417552,anonymous.com,com,,
417553,yahoo.com,com,,
417554,gmail.com,com,,
417555,gmail.com,com,,
417556,hotmail.com,com,hotmail.com,com
417557,gmail.com,com,,
417558,gmail.com,com,,


In [10]:
df_train['same_emaildomain'] = (df_train['P_emaildomain'] == df_train['R_emaildomain']).astype('uint8')
df_test['same_emaildomain'] = (df_test['P_emaildomain'] == df_test['R_emaildomain']).astype('uint8')
df_train[['P_emaildomain', 'R_emaildomain', 'same_emaildomain']].tail()

Unnamed: 0,P_emaildomain,R_emaildomain,same_emaildomain
417554,gmail.com,,0
417555,gmail.com,,0
417556,hotmail.com,hotmail.com,1
417557,gmail.com,,0
417558,gmail.com,,0


In [11]:
df_train['card3_card5'] = df_train['card3'].astype(str) + '_' + df_train['card5'].astype(str)
df_test['card3_card5'] = df_test['card3'].astype(str) + '_' + df_test['card5'].astype(str)
df_train[['card3', 'card5', 'card3_card5']].tail(10)

Unnamed: 0,card3,card5,card3_card5
417549,150,126,150_126
417550,150,166,150_166
417551,150,226,150_226
417552,150,224,150_224
417553,150,226,150_226
417554,150,166,150_166
417555,150,226,150_226
417556,200,226,200_226
417557,150,195,150_195
417558,150,166,150_166


In [12]:
for col in ['card1', 'card2']:
    card_freq = df_train[col].value_counts().to_dict()
    df_train['{}_cnt'.format(col)] = df_train[col].map(card_freq)
    df_test['{}_cnt'.format(col)] = df_test[col].map(card_freq)
df_train[['card1', 'card1_cnt', 'card2', 'card2_cnt']].head(10)

Unnamed: 0,card1,card1_cnt,card2,card2_cnt
0,13926,29,99,6563
1,2755,526,404,2371
2,4663,778,490,26366
3,18132,2970,567,4329
4,4497,9,514,10587
5,5937,6,555,29919
6,12308,163,360,10518
7,12695,4837,490,26366
8,2803,4291,100,5372
9,17399,1293,111,31710


In [13]:
new_cols = []
for col in ['card1', 'card2']:
    for agg_type in ['mean', 'median', 'min', 'max']:
        agg_col_name = 'TransactionAmt_{}_{}'.format(col, agg_type)
        card_agg = df_train.groupby(col)['TransactionAmt'].agg([agg_type]).rename({agg_type: agg_col_name}, axis=1)
        df_train = df_train.merge(card_agg, how='left', on=col)
        df_test = df_test.merge(card_agg, how='left', on=col)
        new_cols.append(agg_col_name)
df_train[['TransactionAmt', 'card1'] + new_cols[:4] + ['card2'] + new_cols[4:]].head(10)

Unnamed: 0,TransactionAmt,card1,TransactionAmt_card1_mean,TransactionAmt_card1_median,TransactionAmt_card1_min,TransactionAmt_card1_max,card2,TransactionAmt_card2_mean,TransactionAmt_card2_median,TransactionAmt_card2_min,TransactionAmt_card2_max
0,68.5,13926,368.656219,150.0,40.0,1343.140015,99,177.598236,78.5,5.0,3069.25
1,29.0,2755,243.561661,108.949997,10.0,6085.22998,404,229.298645,108.949997,5.0,6085.22998
2,59.0,4663,96.343185,59.0,12.5,994.0,490,132.193436,77.0,0.424,6450.970215
3,50.0,18132,122.751434,67.949997,6.0,3190.0,567,131.370285,77.949997,6.0,3190.0
4,50.0,4497,105.083328,108.949997,30.0,200.0,514,217.431381,102.0,5.0,31937.390625
5,49.0,5937,148.25,144.0,49.0,317.5,555,125.474594,68.094002,0.272,3594.949951
6,159.0,12308,107.105888,59.0,12.5,2161.0,360,98.937828,58.950001,5.0,3472.949951
7,422.5,12695,143.267792,85.0,7.97,3162.949951,490,132.193436,77.0,0.424,6450.970215
8,15.0,2803,145.065338,77.0,5.0,3511.949951,100,172.21048,92.0,5.0,5543.22998
9,117.0,17399,127.806297,67.949997,10.95,2775.0,111,148.041214,87.949997,3.5,4301.950195


In [14]:
df_train.drop('TransactionAmt', axis=1, inplace=True)
df_test.drop('TransactionAmt', axis=1, inplace=True)

In [15]:
for col in tqdm(df_train.columns.drop('isFraud')):
    if df_train[col].dtype == 'O':
        df_train[col] = df_train[col].fillna('unseen_category')
        df_test[col] = df_test[col].fillna('unseen_category')
        
        le = LabelEncoder()
        le.fit(list(df_train[col]) + list(df_test[col]))
        df_train[col] = le.transform(df_train[col])
        df_test[col] = le.transform(df_test[col])
        
        df_train[col] = df_train[col].astype('category')
        df_test[col] = df_test[col].astype('category')

    else:
        df_train[col] = df_train[col].fillna(-1)
        df_test[col] = df_test[col].fillna(-1)

# выделяем фолды
month_length = 3600 * 24 * 30
fold0_idx = df_train[df_train['TransactionDT'] < df_train['TransactionDT'].min() + month_length].index
fold1_idx = df_train[(df_train['TransactionDT'].min() + month_length <= df_train['TransactionDT']) & (df_train['TransactionDT'] < df_train['TransactionDT'].min() + 2 * month_length)].index
fold2_idx = df_train[(df_train['TransactionDT'].min() + 2 * month_length <= df_train['TransactionDT']) & (df_train['TransactionDT'] < df_train['TransactionDT'].min() + 3 * month_length)].index
fold3_idx = df_train[df_train['TransactionDT'].min() + 3 * month_length <= df_train['TransactionDT']].index
folds_idx = [fold0_idx, fold1_idx, fold2_idx, fold3_idx]

# выделяем идентификационный и временной признаки
df_train.drop(['TransactionID', 'TransactionDT'], axis=1, inplace=True)
df_test.drop(['TransactionID', 'TransactionDT'], axis=1, inplace=True)

100%|██████████| 448/448 [00:18<00:00, 23.64it/s] 


In [16]:
cat_features = []
for i,j in enumerate(df_train.columns[1:]):
    if isinstance(df_train[j].dtype,
                  pd.core.dtypes.dtypes.CategoricalDtype):
        cat_features.append(i)

In [19]:
from catboost import CatBoostClassifier, Pool

In [26]:
y_val

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [39]:
set(y_train)

{0, 1}

In [93]:
X_train = df_train.drop(fold1_idx, axis=0)
y_train = X_train['isFraud'].values
X_val = df_train.iloc[fold1_idx]
y_val = X_val['isFraud'].values
X_train = X_train.drop('isFraud', axis=1)
X_val = X_val.drop('isFraud', axis=1)


CatBoost_model = CatBoostClassifier(
    iterations=2000,
    early_stopping_rounds=200,
    task_type="GPU",
    eval_metric="AUC"
)

CatBoost_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    verbose=100
)


feature_importances = CatBoost_model.feature_importances_

y_pred = CatBoost_model.predict_proba(X_val)[:, 1]
score_fold = roc_auc_score(y_val, y_pred)
y_test_pred = CatBoost_model.predict(df_test)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.032661
0:	learn: 0.7479422	test: 0.7205881	best: 0.7205881 (0)	total: 212ms	remaining: 7m 3s
100:	learn: 0.8812045	test: 0.8708998	best: 0.8709112 (99)	total: 22s	remaining: 6m 54s
200:	learn: 0.8957749	test: 0.8821788	best: 0.8821788 (200)	total: 43.6s	remaining: 6m 29s
300:	learn: 0.9043829	test: 0.8882214	best: 0.8882214 (300)	total: 1m 4s	remaining: 6m 6s
400:	learn: 0.9106163	test: 0.8920618	best: 0.8921335 (399)	total: 1m 26s	remaining: 5m 45s
500:	learn: 0.9149205	test: 0.8948430	best: 0.8948430 (500)	total: 1m 48s	remaining: 5m 23s
600:	learn: 0.9184338	test: 0.8971157	best: 0.8971157 (600)	total: 2m 9s	remaining: 5m 1s
700:	learn: 0.9217918	test: 0.8993582	best: 0.8993880 (699)	total: 2m 31s	remaining: 4m 40s
800:	learn: 0.9247103	test: 0.9009711	best: 0.9009845 (798)	total: 2m 53s	remaining: 4m 19s
900:	learn: 0.9272652	test: 0.9021989	best: 0.9022448 (895)	total: 3m 14s	remaining: 3m 57s
1000:	learn: 0.9293230	test: 0.9034826	best: 0.9035263 (992)	tota

In [75]:
CatBoost_model.best_score_

{'learn': {'Logloss': 0.07009910014360372, 'AUC': 0.9265406131744385},
 'validation': {'Logloss': 0.09671724297461382, 'AUC': 0.9023482203483582}}

In [84]:
CatBoost_model.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [90]:
score_fold_1 = roc_auc_score(y_val,CatBoost_model.predict_proba(X_val)[:, 1])

In [91]:
score_fold_1

0.9023480046571873

In [63]:
from sklearn.metrics import accuracy_score

In [67]:
score_fold_1

0.6750999935521342

In [64]:
accuracy_score(y_pred_1, y_pred)

0.9999552567702099

In [62]:
y_pred_1

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [58]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [59]:
score_fold

0.6750999935521342

In [60]:
y_test_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [95]:
scores = []

feature_importances = pd.DataFrame()
feature_importances['feature'] = df_train.columns.drop('isFraud')

test_preds = []
test_preds_proba = []

for i in range(len(folds_idx)):
    X_train = df_train.drop(folds_idx[i], axis=0)
    y_train = X_train['isFraud'].values
    X_val = df_train.iloc[folds_idx[i]]
    y_val = X_val['isFraud'].values
    X_train = X_train.drop('isFraud', axis=1)
    X_val = X_val.drop('isFraud', axis=1)

    CatBoost_model = CatBoostClassifier(
        iterations=2000,
        task_type="GPU",
        eval_metric="AUC"
    )

    CatBoost_model.fit(
        X_train, y_train,
        cat_features=cat_features,
        eval_set=(X_val, y_val),
        early_stopping_rounds=200,
        verbose=200
    )
    
    feature_importances['fold_{}'.format(i)] = CatBoost_model.feature_importances_

    y_pred = CatBoost_model.predict_proba(X_val)[:, 1]
    score_fold = roc_auc_score(y_val, y_pred)
    scores.append(score_fold)
    y_test_pred = CatBoost_model.predict(df_test)
    y_test_pred_proba = CatBoost_model.predict_proba(df_test)[:, 1]
    test_preds.append(y_test_pred)
    test_preds_proba.append(y_test_pred_proba)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.033073
0:	learn: 0.7386703	test: 0.6584010	best: 0.6584010 (0)	total: 334ms	remaining: 11m 8s
200:	learn: 0.8983387	test: 0.8532302	best: 0.8532302 (200)	total: 38.2s	remaining: 5m 42s
400:	learn: 0.9142665	test: 0.8749225	best: 0.8749225 (400)	total: 1m 16s	remaining: 5m 3s
600:	learn: 0.9227056	test: 0.8841200	best: 0.8841200 (600)	total: 1m 54s	remaining: 4m 25s
800:	learn: 0.9289402	test: 0.8892760	best: 0.8892760 (800)	total: 2m 32s	remaining: 3m 48s
1000:	learn: 0.9336908	test: 0.8933517	best: 0.8933517 (1000)	total: 3m 10s	remaining: 3m 10s
1200:	learn: 0.9377566	test: 0.8958744	best: 0.8958744 (1200)	total: 3m 48s	remaining: 2m 32s
1400:	learn: 0.9410815	test: 0.8976216	best: 0.8976267 (1399)	total: 4m 27s	remaining: 1m 54s
1600:	learn: 0.9441493	test: 0.8998711	best: 0.8998711 (1600)	total: 5m 5s	remaining: 1m 16s
1800:	learn: 0.9469239	test: 0.9016124	best: 0.9016401 (1784)	total: 5m 44s	remaining: 38s
1999:	learn: 0.9493247	test: 0.9028190	best: 0.9028

In [96]:
scores

[0.9028928667043573,
 0.9122202446609706,
 0.9119328324606399,
 0.8986069626803953]

In [97]:
np.mean(scores)

0.9064132266265909

In [100]:
np.array(test_preds_proba).shape

(4, 172981)

In [103]:
final_pred = np.average(test_preds_proba, axis=0)
final_pred

array([0.18761382, 0.15179981, 0.26761248, ..., 0.04718227, 0.08400515,
       0.25145308])

In [104]:
sub = pd.DataFrame({'TransactionID': sample_submission['TransactionID'], 'isFraud': final_pred})
sub.head()

Unnamed: 0,TransactionID,isFraud
0,3404559,0.187614
1,3404560,0.1518
2,3404561,0.267612
3,3404562,0.116095
4,3404563,0.478346


In [105]:
sub.to_csv('submission_baseline.csv', index=False)