# Fraud Detection

Classify transactions as fraud or non-fraud using an LSTM-based neural network. The data comes from https://github.com/IBM/TabFormer/tree/main/data/credit_card

## Authors

Natalie Jann [natalie.jann@ibm.com](mailto:natalie.jann@ibm.com)

Sebastian Lehrig [sebastian.lehrig1@ibm.com](mailto:sebastian.lehrig1@ibm.com)

Marvin Giessing [MARVING@de.ibm.com](mailto:MARVING@de.ibm.com)

## License

Apache-2.0 License

The following cells are "Raw"-formatted; change them to "Code" if you need to install the respective packages!

In [None]:
!mamba install -y pydot libgfortran5

In [None]:
!pip install --prefer-binary --no-cache-dir sklearn-pandas imbalanced-learn

In [None]:
!wget https://ibm.ent.box.com/v/tabformer-data/file/770766751708 -O transactions.tgz

In [None]:
# download data from https://ibm.ent.box.com/v/tabformer-data/file/770766751708 and upload here
!tar -xvf transactions.tgz

## 0.) Imports & Constants

In [None]:
import math
import os
import numpy as np
import pandas as pd
import re
from requests import get
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    FunctionTransformer,
    MinMaxScaler,
    LabelBinarizer,
)
from sklearn_pandas import DataFrameMapper

%load_ext lab_black

## 1.) Preprocess the dataset

In [None]:
# Download the raw transaction CSV. The body is streamed to disk in pieces so
# the whole file never has to sit in memory, and an HTTP error aborts the cell
# instead of silently saving an error page as "card_transaction.v1.csv".
with get(
    "https://ibm.box.com/shared/static/wamc5d0yve71jm46ntdp23q6xc001ew2.csv",
    stream=True,
) as f:
    f.raise_for_status()
    with open("card_transaction.v1.csv", "wb") as fd:
        for part in f.iter_content(chunk_size=1 << 20):
            fd.write(part)

# Per-chunk results are collected in lists and concatenated once at the end;
# concatenating inside the loop re-copies everything gathered so far.
nf_parts = []
f_parts = []
counts = []

with pd.read_csv("./card_transaction.v1.csv", chunksize=1_000_000) as reader:
    for chunk in reader:
        label = chunk["Is Fraud?"]
        # Keep a 5% random sample of the non-fraud rows and every fraud row.
        nf_parts.append(chunk[label == "No"].sample(frac=0.05))
        f_parts.append(chunk[label == "Yes"])
        vc = label.value_counts()
        # Look the counts up by label: positional access depends on which
        # class is more frequent and fails when a chunk lacks one class.
        counts.append({"No": vc.get("No", 0), "Yes": vc.get("Yes", 0)})

# pd.concat rejects an empty list, so fall back to an empty frame.
df_nf = pd.concat(nf_parts) if nf_parts else pd.DataFrame()
df_f = pd.concat(f_parts) if f_parts else pd.DataFrame()
# One row per chunk with the number of "No" and "Yes" labels it contained.
dist = pd.DataFrame(counts, columns=["No", "Yes"], dtype="int64")

df_nf.to_csv("./card_transactions_non-frauds.csv")
df_f.to_csv("./card_transactions_frauds.csv")
print(f"Ratio Fraud/Non-Fraud: {dist['Yes'].sum()/dist['No'].sum()}")
dist

In [None]:
import math
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    FunctionTransformer,
    MinMaxScaler,
    LabelBinarizer,
)
from sklearn_pandas import DataFrameMapper


def timeEncoder(X):
    """Merge the date and time-of-day columns into one integer column.

    X must provide "Year", "Month", "Day" and a "Time" column of
    colon-separated strings whose first two fields are used as hour and
    minute. The assembled datetimes are cast to int and returned as a
    single-column DataFrame.
    """
    clock = X["Time"].str.split(":", expand=True)
    parts = {
        "year": X["Year"],
        "month": X["Month"],
        "day": X["Day"],
        "hour": clock[0],
        "minute": clock[1],
    }
    stamps = pd.to_datetime(parts)
    return pd.DataFrame(stamps.astype(int))


def amtEncoder(X):
    """Turn amount strings into the natural log of their numeric value.

    The first character of every entry is dropped before the float
    conversion (presumably a currency symbol - verify against the data).
    Values below 1 are raised to 1, so their logarithm is 0. The result is
    returned as a single-column DataFrame.
    """

    def _drop_first_char(raw):
        return raw[1:]

    def _at_least_one(value):
        return max(1, value)

    numeric = X.apply(_drop_first_char).astype(float)
    floored = numeric.map(_at_least_one)
    logged = floored.map(math.log)
    return pd.DataFrame(logged)


def decimalEncoder(X, length=5):
    """Split integer codes into their base-10 digits, one column per digit.

    Column 0 holds the least significant digit, column 1 the next one, and
    so on for `length` columns. Digits beyond `length` are dropped.
    """
    digits = pd.DataFrame()
    remaining = X
    for place in range(length):
        # Peel off the current lowest digit, then shift right by one place.
        digits[place] = np.mod(remaining, 10)
        remaining = np.floor_divide(remaining, 10)
    return digits


def fraudEncoder(X):
    """Encode the fraud label as integers: 1 where X equals "Yes", else 0."""
    is_fraud = X == "Yes"
    flags = np.where(is_fraud, 1, 0)
    return flags.astype(int)


# Alternative input, kept for reference: the sampled fraud / non-fraud files.
# df_nf = pd.read_csv(f"{os.getenv('HOME')}/card_transactions_non-frauds.csv")
# df_f = pd.read_csv(f"{os.getenv('HOME')}/card_transactions_frauds.csv")
# tdf = pd.concat([df_nf, df_f])

# Load the first million raw rows, treat merchant names as strings, drop the
# unused columns and order the rows by user and card with a fresh index.
tdf = (
    pd.read_csv("./card_transaction.v1.csv", nrows=1_000_000)
    .astype({"Merchant Name": str})
    .drop(columns=["MCC", "Zip", "Merchant State"])
    .sort_values(by=["User", "Card"])
    .reset_index(drop=True)
)

# One (column selector, transformer pipeline) entry per output feature group.
feature_spec = [
    # Label: "Yes" -> 1, anything else -> 0.
    ("Is Fraud?", FunctionTransformer(fraudEncoder)),
    # High-cardinality categoricals: integer code -> decimal digits -> one-hot.
    (
        "Merchant Name",
        [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()],
    ),
    (
        "Merchant City",
        [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()],
    ),
    # Low-cardinality categoricals: fill missing values, then binarize.
    (["Use Chip"], [SimpleImputer(strategy="constant"), LabelBinarizer()]),
    (["Errors?"], [SimpleImputer(strategy="constant"), LabelBinarizer()]),
    # Timestamp and amount: encode to one numeric column, scale to [0, 1].
    (
        ["Year", "Month", "Day", "Time"],
        [FunctionTransformer(timeEncoder), MinMaxScaler()],
    ),
    ("Amount", [FunctionTransformer(amtEncoder), MinMaxScaler()]),
]

mapper = DataFrameMapper(feature_spec, input_df=True, df_out=True)
mapper.fit(tdf)
tdf = mapper.transform(tdf)

tdf.head()

In [None]:
# Show the class balance of the encoded label (1 = fraud, 0 = non-fraud).
tdf["Is Fraud?"].value_counts()

In [None]:
# Persist the encoded frame without the index column, so the CSV holds exactly
# the columns that the CREATE TABLE statement below is generated from.
tdf.to_csv("./preprocessed_transactions.csv", index=False)

## 2.) Construct the SQL CREATE statement

In [None]:
# Opening part of the PostgreSQL "CREATE TABLE" statement; one column
# definition per column of `tdf` is appended to it further down.
sql = "CREATE TABLE IF NOT EXISTS public.transactions ("


def get_dtype(d):
    """Return the PostgreSQL column type for a pandas/NumPy dtype.

    Args:
        d: A dtype object or its string name (e.g. ``"float64"``).

    Returns:
        The PostgreSQL type name; ``"varchar"`` for anything unrecognised.
    """
    pg_types = {
        # float64 needs the 8-byte "double precision"; "real" is a 4-byte
        # float and would round the stored values.
        "float64": "double precision",
        "float32": "real",
        "int64": "bigint",
        "int32": "integer",
        "int16": "smallint",
    }
    return pg_types.get(str(d), "varchar")


# Append one quoted column definition per column; "?" is removed from the
# column names. Every definition is followed by ", ", so the statement ends
# in ", ;" once the terminating semicolon is added.
column_defs = [
    f'"{name.replace("?", "")}" {get_dtype(kind)}, '
    for name, kind in zip(tdf.columns, tdf.dtypes)
]
sql = sql + "".join(column_defs) + ";"

In [None]:
# Replace the dangling ", ;" at the end of the statement with the closing
# parenthesis and semicolon, then display the finished statement.
dangling_separator = re.compile(r", ;$")
sql = dangling_separator.sub(");", sql)
sql

## 3.) Transfer the data and fill the database
The following cells are "Raw"-formatted; change them to "Code" and adapt them if you need to copy the file to PostgreSQL!

> Create `init_transactions.sql` with the statement stored in `sql` (see above) and add the following line to the end of the file:

```\copy public.transactions FROM './preprocessed_transactions.csv' WITH (FORMAT csv, HEADER true, DELIMITER ',');```