In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
root = '../data/ieee-fraud-detection'

for dirname, _, filenames in os.walk(root):
    for filename in filenames:
        path=os.path.join(dirname, filename)
        if 'train' in path:
            __training_path=path
        elif 'test' in path:
            __test_path=path

print(f'Training path:{__training_path}\nTest path:{__test_path}')

import sklearn; sklearn.show_versions()

import pandas as pd
tr_transaction = pd.read_csv(root + '/train_transaction.csv', index_col='TransactionID')
tr_identity = pd.read_csv(root + '/train_identity.csv', index_col='TransactionID')

#generate a single training dataset file
train = tr_transaction.merge(tr_identity, how='left', left_index=True, right_index=True)
del tr_identity
del tr_transaction
train.to_csv(root + "/train.csv", index=False)
del train
ts_identity = pd.read_csv(root + '/test_identity.csv', index_col='TransactionID')
ts_transaction = pd.read_csv(root + '/test_transaction.csv', index_col='TransactionID')

#generate a single test dataset file
test = ts_transaction.merge(ts_identity, how='left', left_index=True, right_index=True)
__test_dataset_submission_columns = ts_transaction.index
submission = pd.DataFrame(columns=['TransactionID'], data=__test_dataset_submission_columns)
del ts_identity
del ts_transaction
test.rename(columns={x: x.replace('-', '_') for x in test.columns}, inplace=True)
test.to_csv(root + "/test.csv", index=False)
del test
__training_path=root + "/train.csv"
__test_path=root + "/test.csv"

def __load__data(__training_path, __test_path, concat=False):
    """load data as input dataset
    params: __training_path: the training path of input dataset
    params: __test_path: the path of test dataset
    params: if it is True, then it will concatinate the training and test dataset as output
    returns: generate final loaded dataset as dataset, input and test
    """
    # LOAD DATA
    import pandas as pd
    __train_dataset = pd.read_csv(__training_path, delimiter=',' if __training_path.endswith('csv') else '\t')
    __test_dataset = pd.read_csv(__test_path, delimiter=',' if __training_path.endswith('csv') else '\t')
    return __train_dataset, __test_dataset
__train_dataset, __test_dataset = __load__data(__training_path, __test_path, concat=True)
__train_dataset.head()

# PREPROCESSING-1
from sklearn.impute import SimpleImputer
import numpy as np
_NUMERIC_COLS_WITH_MISSING_VALUES = ['id_24', 'V157', 'V256', 'V292', 'V122', 'V308', 'V332', 'V304', 'addr1', 'D5', 'id_08', 'V87', 'V259', 'V189', 'id_14', 'V176', 'id_19', 'V147', 'V61', 'V124', 'V251', 'V43', 'V186', 'V29', 'V298', 'V254', 'D6', 'V21', 'V70', 'id_01', 'V5', 'V72', 'V77', 'V272', 'V127', 'V211', 'V8', 'V240', 'V318', 'V151', 'id_05', 'V76', 'V115', 'V85', 'C7', 'V62', 'V237', 'D4', 'V167', 'V130', 'addr2', 'V242', 'dist2', 'V35', 'V179', 'V51', 'V213', 'V191', 'V153', 'V90', 'TransactionDT', 'V335', 'V59', 'V253', 'V20', 'V220', 'V1', 'V262', 'V286', 'V105', 'card5', 'V270', 'V284', 'V269', 'V123', 'id_03', 'V279', 'C2', 'TransactionAmt', 'V3', 'V73', 'V27', 'V71', 'V173', 'D14', 'V339', 'V174', 'V44', 'V324', 'V231', 'V146', 'id_06', 'V328', 'V248', 'id_10', 'V198', 'V75', 'V184', 'V131', 'V215', 'V67', 'id_04', 'V98', 'V195', 'V57', 'V178', 'C9', 'V316', 'V275', 'V297', 'C11', 'V320', 'V39', 'V296', 'V89', 'V30', 'V80', 'V14', 'V121', 'V135', 'V239', 'V50', 'C3', 'V64', 'V79', 'V168', 'V54', 'id_11', 'V78', 'V145', 'V34', 'V280', 'V261', 'id_25', 'C13', 'D10', 'id_26', 'V113', 'V129', 'V69', 'V100', 'V327', 'V249', 'V305', 'id_17', 'V202', 'V294', 'V329', 'V247', 'V260', 'V190', 'V306', 'V6', 'V317', 'V212', 'id_02', 'V182', 'V41', 'V282', 'V225', 'V11', 'id_07', 'V276', 'V58', 'card2', 'V206', 'V45', 'V22', 'V302', 'id_18', 'dist1', 'V148', 'V180', 'V199', 'V152', 'V48', 'V99', 'V230', 'V210', 'V4', 'V245', 'V235', 'V24', 'V103', 'V154', 'V227', 'V2', 'V216', 'V106', 'V46', 'V236', 'V338', 'V246', 'V309', 'V337', 'V111', 'id_09', 'V257', 'V12', 'C12', 'id_20', 'V138', 'V243', 'C10', 'V68', 'V326', 'V170', 'V49', 'V194', 'V188', 'V132', 'V158', 'V134', 'V330', 'V32', 'V60', 'V16', 'V33', 'D3', 'V136', 'V150', 'V203', 'V52', 'V222', 'V258', 'id_21', 'V37', 'V268', 'V219', 'V214', 'V149', 'V285', 'V156', 'V88', 'V287', 'V265', 'V226', 'V104', 'V144', 'V175', 'V204', 'V141', 'V155', 'V208', 'V161', 'V290', 'V84', 'V255', 'V23', 'V283', 'V300', 'V207', 'V17', 'V91', 'V197', 'V205', 'V164', 'V19', 'id_32', 'V114', 'V83', 'V172', 'V322', 'V218', 'V217', 'V55', 'V177', 'V53', 'V163', 'V31', 'V334', 'V196', 'V291', 'V126', 'V315', 'V13', 'V63', 'V118', 'V128', 'V193', 'V200', 'V108', 'V288', 'D15', 'V119', 'V102', 'C5', 'V26', 'V25', 'V289', 'V319', 'V229', 'V273', 'C1', 'D8', 'V312', 'V7', 'V281', 'V325', 'V228', 'V159', 'V74', 'V162', 'V36', 'V331', 'V95', 'V277', 'V65', 'V295', 'id_22', 'V264', 'V267', 'V97', 'V112', 'C4', 'V56', 'V169', 'V223', 'V133', 'V125', 'V323', 'V209', 'V120', 'V107', 'D9', 'V266', 'V310', 'V92', 'V38', 'V321', 'C8', 'V28', 'V224', 'C14', 'V140', 'V201', 'V10', 'V137', 'V314', 'V18', 'V307', 'V333', 'V274', 'V221', 'V47', 'D11', 'V15', 'V9', 'V81', 'V143', 'id_13', 'V233', 'V313', 'V160', 'card3', 'V82', 'V117', 'card1', 'V139', 'C6', 'D13', 'V101', 'V241', 'V181', 'V109', 'V171', 'V110', 'V278', 'V42', 'V185', 'V299', 'V311', 'V271', 'V93', 'V238', 'V244', 'V336', 'V66', 'V232', 'V301', 'V250', 'V263', 'V165', 'V252', 'V166', 'V116', 'V94', 'V192', 'D2', 'V40', 'V293', 'V86', 'V96', 'D1', 'V142', 'V303', 'V187', 'D7', 'D12', 'V183', 'V234']
for _col in _NUMERIC_COLS_WITH_MISSING_VALUES:
    __simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    __train_dataset[_col] = __simple_imputer.fit_transform(__train_dataset[_col].values.reshape(-1,1))[:,0]
    if _col in __test_dataset:
        __test_dataset[_col] = __simple_imputer.transform(__test_dataset[_col].astype(__train_dataset[_col].dtypes).values.reshape(-1,1))[:,0]

# PREPROCESSING-2
from sklearn.impute import SimpleImputer
import numpy as np
_STRING_COLS_WITH_MISSING_VALUES = ['M4', 'id_29', 'id_35', 'M9', 'id_15', 'id_27', 'M3', 'id_23', 'P_emaildomain', 'M5', 'DeviceInfo', 'id_31', 'id_38', 'id_36', 'id_30', 'M1', 'card6', 'id_28', 'ProductCD', 'id_37', 'id_16', 'DeviceType', 'M8', 'M2', 'id_34', 'id_12', 'card4', 'id_33', 'R_emaildomain', 'M6', 'M7']
for _col in _STRING_COLS_WITH_MISSING_VALUES:
    __simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    __train_dataset[_col] = __simple_imputer.fit_transform(__train_dataset[_col].values.reshape(-1,1))[:,0]
    if _col in __test_dataset:
        __test_dataset[_col] = __simple_imputer.transform(__test_dataset[_col].astype(__train_dataset[_col].dtypes).values.reshape(-1,1))[:,0]

# PREPROCESSING-3
from sklearn.preprocessing import OrdinalEncoder
_CATEGORICAL_COLS = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
_ohe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
__train_dataset[_CATEGORICAL_COLS] = pd.DataFrame(_ohe.fit_transform(__train_dataset[_CATEGORICAL_COLS]), columns=_CATEGORICAL_COLS)
__test_dataset[_CATEGORICAL_COLS] = pd.DataFrame(_ohe.transform(__test_dataset[_CATEGORICAL_COLS]), columns=_CATEGORICAL_COLS)


# DETACH TARGET
__feature_train = __train_dataset.drop(['isFraud'], axis=1)
__target_train =__train_dataset['isFraud']
__feature_test = __test_dataset
del __train_dataset
del __test_dataset

# PREPROCESSING-4
from sklearn.preprocessing import StandardScaler
__standard_scaler = StandardScaler()
__scaled_features = __standard_scaler.fit_transform(__feature_train.values)
__feature_train = pd.DataFrame(__scaled_features, index=__feature_train.index, columns=__feature_train.columns)
__scaled_features = __standard_scaler.transform(__feature_test.values)
__feature_test = pd.DataFrame(__scaled_features, index=__feature_test.index, columns=__feature_test.columns)

# PREPROCESSING-5
from imblearn.over_sampling import SMOTE
smote = SMOTE()
__feature_train, __target_train = smote.fit_resample(__feature_train, __target_train)

Training path:../data/ieee-fraud-detection\train_transaction.csv
Test path:../data/ieee-fraud-detection\test_transaction.csv

System:
    python: 3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:22:46) [MSC v.1916 64 bit (AMD64)]
executable: C:\Users\verdi\anaconda3\envs\anomaly_detection\python.exe
   machine: Windows-10-10.0.19044-SP0

Python dependencies:
          pip: 21.2.4
   setuptools: 58.0.4
      sklearn: 1.0
        numpy: 1.21.2
        scipy: 1.7.1
       Cython: None
       pandas: 1.3.4
   matplotlib: 3.4.3
       joblib: 1.0.1
threadpoolctl: 2.2.0

Built with OpenMP: True


In [5]:
__feature_test.to_csv("../data/processed/test_transactions_v2.csv", index=False)

In [8]:
# MODEL
import numpy as np
from catboost import CatBoostClassifier
__model = CatBoostClassifier()
__model.fit(__feature_train, __target_train)
__y_pred = __model.predict_proba(__feature_test)
if np.shape(__y_pred)[1] == 2:
    __y_pred = __y_pred[:, 1]

# SUBMISSION
submission = pd.concat([submission, pd.DataFrame(__y_pred, columns=['isFraud'])], axis=1)
submission.head()

# save submission file
submission.to_csv("kaggle_submission.csv", index=False)

Learning rate set to 0.208062
0:	learn: 0.5494018	total: 479ms	remaining: 7m 58s
1:	learn: 0.4696355	total: 835ms	remaining: 6m 56s
2:	learn: 0.4237728	total: 1.12s	remaining: 6m 12s
3:	learn: 0.3946183	total: 1.45s	remaining: 6m
4:	learn: 0.3570501	total: 1.78s	remaining: 5m 53s
5:	learn: 0.3306342	total: 2.08s	remaining: 5m 43s
6:	learn: 0.3073643	total: 2.37s	remaining: 5m 35s
7:	learn: 0.2781179	total: 2.71s	remaining: 5m 36s
8:	learn: 0.2688894	total: 3.02s	remaining: 5m 32s
9:	learn: 0.2514300	total: 3.3s	remaining: 5m 26s
10:	learn: 0.2454760	total: 3.55s	remaining: 5m 19s
11:	learn: 0.2358328	total: 3.81s	remaining: 5m 13s
12:	learn: 0.2273032	total: 4.14s	remaining: 5m 14s
13:	learn: 0.2227452	total: 4.42s	remaining: 5m 11s
14:	learn: 0.2192780	total: 4.71s	remaining: 5m 9s
15:	learn: 0.2134096	total: 4.99s	remaining: 5m 6s
16:	learn: 0.2099554	total: 5.3s	remaining: 5m 6s
17:	learn: 0.2073368	total: 5.58s	remaining: 5m 4s
18:	learn: 0.2021191	total: 5.89s	remaining: 5m 4s
19:

In [38]:
# Save data
(pd.concat((__feature_train, __target_train), axis=1)).to_csv("../data/processed/train_transactions_v2.csv", index=False)
__feature_test.to_csv("../data/processed/test_transactions_v2.csv", index=False)

In [4]:
def load_train_set():
    df_train = pd.read_csv("../data/processed/train_transactions_v2.csv")
    df_target = df_train["isFraud"]

    data = torch.Tensor(df_train.drop(columns=["isFraud"]).to_numpy())
    targets = torch.Tensor(df_target.to_numpy())
    return data, targets

In [5]:
# DeepSVDD
from src.tabular.models.OneClass import DeepSVDD
from src.tabular.trainers.DeepSVDDTrainer import DeepSVDDTrainer
from torch.utils.data import TensorDataset, DataLoader
import torch

n_features = __feature_train.shape[1]
model = DeepSVDD(in_features=n_features)
batch_size = 128
n_epochs = 200
lr = 1e-04


trainer = DeepSVDDTrainer(
    model=model,
    lr=lr,
    n_epochs=n_epochs,
    batch_size=batch_size,
    device='cuda'
)

print("Training %s with shape %s" % ("DeepSVDD", __feature_train.shape))

X_train, y_train = load_train_set()

ds = TensorDataset(X_train, y_train)
train_ldr = DataLoader(ds, batch_size=batch_size)

Training DeepSVDD with shape (1139754, 432)


In [6]:
trainer.train(train_ldr, None)
trainer.model.save("../models/deepsvdd_ieee_fraud_detection.pklz")

Initializing center c...
Center c initialized.
Started training


  6%|▌         | 556/8905 [00:07<01:56, 71.61it/s, epoch=1, loss=751.006]


KeyboardInterrupt: 

In [1]:
from src.tabular.models.OneClass import DeepSVDD
from src.tabular.trainers.DeepSVDDTrainer import DeepSVDDTrainer
from torch.utils.data import TensorDataset, DataLoader
import torch
import pandas as pd

df_test = pd.read_csv("../data/processed/test_transactions_v2.csv")
submission = pd.DataFrame(columns=['TransactionID'], data=df_test.index)

model = DeepSVDD(in_features=df_test.shape[1])
model = model.load("../models/deepsvdd_ieee_fraud_detection.pklz")
X_test = torch.Tensor(df_test.to_numpy())
del df_test


In [6]:
def init_center_c(model, train_loader: DataLoader, eps=0.1):
    """Initialize hypersphere center c as the mean from an initial forward pass on the data.
       Code taken from https://github.com/lukasruff/Deep-SVDD-PyTorch/blob/master/src/optim/deepSVDD_trainer.py"""
    n_samples = 0
    c = torch.zeros(model.rep_dim, device='cuda')

    model.eval()
    with torch.no_grad():
        for sample in train_loader:
            # get the inputs of the batch
            X, _ = sample
            X = X.to('cuda').float()
            outputs = model(X)
            n_samples += outputs.shape[0]
            c += torch.sum(outputs, dim=0)

    if c.isnan().sum() > 0:
        raise Exception("NaN value encountered during init_center_c")

    c /= n_samples

    # If c_i is too close to 0, set to +-eps. Reason: a zero unit can be trivially matched with zero weights.
    c[(abs(c) < eps) & (c < 0)] = -eps
    c[(abs(c) < eps) & (c > 0)] = eps

    return c

In [7]:
# Reset center c
# Loading training set
X_train, y_train = load_train_set()
ds = TensorDataset(X_train, y_train)
train_ldr = DataLoader(ds, batch_size=batch_size)
model.eval()
model.c = init_center_c(model, train_ldr)

NameError: name 'batch_size' is not defined

In [3]:
# Generate the scores
logits = model(X_test).to('cuda')
dist = torch.sum((logits - model.c) ** 2, dim=1)
# Convert the scores to probabilities
# This is a binary classification task, hence a sigmoid is appropriate
y_pred = (1 - torch.sigmoid(dist)).detach().cpu()
y_pred

NameError: name 'train_ldr' is not defined

In [38]:
# SUBMISSION
submission = pd.concat([submission, pd.DataFrame(y_pred, columns=['isFraud'])], axis=1)
submission.head()

# save submission file
submission.to_csv("dsvdd_kaggle_submission.csv", index=False)