# Package and Bibliotheque importation

In [3]:
import requests
from io import BytesIO

from datetime import date, datetime, timedelta
import os
import math

import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Functions definition

In [5]:
def get_data_from_web(url):
    # Téléchargez le fichier depuis l'URL
    response = requests.get(url)

    if response.status_code == 200:
        # Créez un objet BytesIO à partir des données téléchargées
        data = BytesIO(response.content)

        # Utilisez la fonction read_pickle pour charger le fichier depuis BytesIO
        dataframe = pd.read_pickle(data)
        return  dataframe
    else:
        print("Échec du téléchargement du fichier.")

# Permettra le regroupement de tout les fichier en un dataFrames
def get_data_from_folder(folder):
    dataframes = []
    # Parcourez les fichiers du dossier
    for file in os.listdir(folder):
        if file.endswith(".pkl"):
            file_path = os.path.join(folder, file)
            df = pd.read_pickle(file_path)
            dataframes.append(df)
    combined_df = pd.concat(dataframes, ignore_index=True)

    return combined_df

## Calculat fraud risk

In [39]:
DAY_DELAY = 7

def get_count_risk_rolling_window(terminal_transactions, window_size, delay_period=DAY_DELAY):
    frauds_in_delay = terminal_transactions.rolling(
        str(delay_period) + "d", on="TX_DATETIME"
    )["TX_FRAUD"].sum()
    transactions_in_delay = terminal_transactions.rolling(
        str(delay_period) + "d", on="TX_DATETIME"
    )["TX_FRAUD"].count()

    frauds_until_window = terminal_transactions.rolling(
        str(delay_period + window_size) + "d", on="TX_DATETIME"
    )["TX_FRAUD"].sum()
    transactions_until_window = terminal_transactions.rolling(
        str(delay_period + window_size) + "d", on="TX_DATETIME"
    )["TX_FRAUD"].count()

    frauds_in_window = frauds_until_window - frauds_in_delay
    transactions_in_window = transactions_until_window - transactions_in_delay

    terminal_transactions["fraud_risk"] = (
            frauds_in_window / transactions_in_window
    ).fillna(0)

    return terminal_transactions



## Data Split function

In [45]:
def get_train_test_set(df, start_date_training, delta_train=7, delta_delay=DAY_DELAY, delta_test=7, random_state=0):
    # Obtenir les données de l'ensemble de formation
    train_df = df[
        (df["datetime"] >= start_date_training)
        & (df["datetime"] < start_date_training + timedelta(days=delta_train))
        ]

    # Obtenir les données de l'ensemble de test
    test_df = []

    # Remarque : les cartes dont on sait qu'elles ont été compromises après la période de retard sont retirées de l'ensemble de test.
    # C'est-à-dire que pour chaque jour de test, toutes les fraudes connues à (jour_de_test_période_de_retard) sont supprimées.

    # Tout d'abord, récupérer les clients frauduleux connus dans l'ensemble d'apprentissage.
    known_defrauded_customers = set(train_df[train_df["is_fraud"] == 1]["customer_id"])

    # Obtenir le jour de début relatif de l'ensemble d'apprentissage (plus facile que TX_DATETIME pour collecter les données de test)
    start_tx_time_days_training = train_df["day"].min()

    # Ensuite, pour chaque jour de l'ensemble de tests
    for day in range(delta_test):
        # Obtenir les données de test pour ce jour-là
        test_df_day = df[
            df["day"] == start_tx_time_days_training + delta_train + delta_delay + day
            ]

        # Les cartes compromises de cette journée de test, moins la période de retard, sont ajoutées à l'ensemble des clients frauduleux connus.
        test_df_day_delay_period = df[
            df["day"] == start_tx_time_days_training + delta_train + day - 1
            ]

        new_defrauded_customers = set(
            test_df_day_delay_period[test_df_day_delay_period["is_fraud"] == 1][
                "customer_id"
            ]
        )
        known_defrauded_customers = known_defrauded_customers.union(
            new_defrauded_customers
        )

        test_df_day = test_df_day[
            ~test_df_day["customer_id"].isin(known_defrauded_customers)
        ]

        test_df.append(test_df_day)

    test_df = pd.concat(test_df)

    # Trier les ensembles de données par ordre croissant de l'ID de la transaction
    train_df = train_df.sort_values("id")
    test_df = test_df.sort_values("id")

    return (train_df, test_df)

# Importation data

In [6]:
## Visualisation sur la composition d'un fichier
url = "https://github.com/Fraud-Detection-Handbook/simulated-data-raw/raw/main/data/2018-04-02.pkl"
transactions_df = get_data_from_web(url)

In [7]:
transactions_df.sample(10, random_state=0)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
11847,11847,2018-04-02 08:17:29,3420,5510,55.93,116249,1,0,0
17292,17292,2018-04-02 16:43:52,53,7030,13.63,146632,1,0,0
18669,18669,2018-04-02 20:53:53,2342,4358,27.41,161633,1,0,0
11677,11677,2018-04-02 07:57:48,2777,9759,4.3,115068,1,0,0
17843,17843,2018-04-02 17:55:37,2819,1635,91.61,150937,1,0,0
14165,14165,2018-04-02 11:49:15,1117,6067,21.39,128955,1,0,0
15711,15711,2018-04-02 14:01:43,3688,1962,28.62,136903,1,0,0
12841,12841,2018-04-02 09:53:52,4944,9157,92.83,122032,1,0,0
17260,17260,2018-04-02 16:38:58,3285,5928,73.05,146338,1,0,0
12652,12652,2018-04-02 09:35:30,4956,1871,51.94,120930,1,0,0


In [8]:
folder = "/Users/jeanmermozeffi/Desktop/Deep Learning/Credit Card Fraud Detection/simulated-data-raw-main/data"
transactions_df = get_data_from_folder(folder)

In [9]:
transactions_df.shape

(1754155, 9)

In [10]:
not_fraud_count, fraud_count = np.bincount(transactions_df["TX_FRAUD"])

total_count = not_fraud_count + fraud_count
print(
    (
        f"Data:\n"
        f"    Total: {total_count}\n"
        f"    Fraud: {fraud_count} ({100 * fraud_count / total_count:.2f}% du total)\n"
    )
)

Data:
    Total: 1754155
    Fraud: 14681 (0.84% du total)



In [12]:
df = pd.concat(
    [
        transactions_df[transactions_df["TX_FRAUD"] == 0].sample(1000, random_state=0),
        transactions_df[transactions_df["TX_FRAUD"] == 1].sample(1000, random_state=0),
    ]
)

fig = px.histogram(
    df,
    title="Transaction count for different amounts",
    x="TX_AMOUNT",
    color="TX_FRAUD",
    marginal="box",
)
fig.update_traces(opacity=0.75)
fig.update_layout(barmode="overlay")
fig.show()

# Feature engineering

In [13]:
cleaned_df = pd.DataFrame()

In [14]:
cleaned_df["amount"] = transactions_df["TX_AMOUNT"]
cleaned_df["is_fraud"] = transactions_df["TX_FRAUD"]
cleaned_df["is_weekend"] = transactions_df["TX_DATETIME"].dt.weekday >= 5
cleaned_df["is_night"] = transactions_df["TX_DATETIME"].dt.hour <= 6

In [16]:
cleaned_df.sample(10, random_state=0)

Unnamed: 0,amount,is_fraud,is_weekend,is_night
1733953,7.67,0,False,False
241598,11.27,0,False,False
1643723,51.81,0,False,False
1064505,165.6,0,False,False
1030163,23.8,0,False,False
109238,7.47,0,False,False
873682,55.61,0,False,True
335594,80.54,0,False,False
1589071,106.54,0,False,False
436905,10.98,0,False,False


In [26]:
# Comprehension du fénetrages
data = {
    'date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
    'valeur': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
}

In [27]:
pd.DataFrame(data)

Unnamed: 0,date,valeur
0,2023-01-01,10
1,2023-01-02,20
2,2023-01-03,30
3,2023-01-04,40
4,2023-01-05,50
5,2023-01-06,60
6,2023-01-07,70
7,2023-01-08,80
8,2023-01-09,90
9,2023-01-10,100


In [30]:
# Utilisez la méthode rolling pour calculer une moyenne mobile sur une fenêtre de 3 jours
df['moyenne_mobile'] = df['valeur'].rolling(window=3).mean()
print(df)

        date  valeur  moyenne_mobile
0 2023-01-01      10             NaN
1 2023-01-02      20             NaN
2 2023-01-03      30            20.0
3 2023-01-04      40            30.0
4 2023-01-05      50            40.0
5 2023-01-06      60            50.0
6 2023-01-07      70            60.0
7 2023-01-08      80            70.0
8 2023-01-09      90            80.0
9 2023-01-10     100            90.0


In [35]:
transactions_df = transactions_df.sort_values(by="TX_DATETIME")
transactions_df.sample(20, random_state=0)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
449348,1733953,2018-09-28 18:21:41,83,5339,140.08,15618101,180,0,0
519423,241598,2018-04-26 07:08:02,281,3845,100.52,2185682,25,0,0
1404203,1643723,2018-09-19 11:44:22,2141,3037,75.85,14816662,171,0,0
1466441,1064505,2018-07-21 00:49:43,4512,9804,66.85,9593383,111,0,0
214912,1030163,2018-07-17 11:03:21,3098,4279,14.42,9284601,107,0,0
915167,109238,2018-04-12 10:38:13,286,1669,17.0,988693,11,0,0
20127,873682,2018-07-01 05:02:13,2839,1753,54.92,7880533,91,0,0
123978,335594,2018-05-05 21:18:48,652,4909,133.03,3014328,34,0,0
179763,1589071,2018-09-13 16:11:17,3343,9734,81.1,14314277,165,0,0
244788,436905,2018-05-16 12:23:57,4812,3522,106.22,3932637,45,0,0


In [37]:
# Applications sur les données
cleaned_df["customer_num_transactions_1_day"] = transactions_df.groupby(
    "CUSTOMER_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]

cleaned_df["customer_num_transactions_7_day"] = transactions_df.groupby(
    "CUSTOMER_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]

cleaned_df["customer_num_transactions_30_day"] = transactions_df.groupby(
    "CUSTOMER_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]

cleaned_df["customer_avg_amount_1_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]

cleaned_df["customer_avg_amount_7_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]

cleaned_df["customer_avg_amount_30_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


Not prepending group keys to t

In [40]:
cleaned_df["terminal_num_transactions_1_day"] = transactions_df.groupby(
    "TERMINAL_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
cleaned_df["terminal_num_transactions_7_day"] = transactions_df.groupby(
    "TERMINAL_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
cleaned_df["terminal_num_transactions_30_day"] = transactions_df.groupby(
    "TERMINAL_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]

cleaned_df["terminal_fraud_risk_1_day"] = transactions_df.groupby("TERMINAL_ID").apply(
    lambda x: get_count_risk_rolling_window(x, 1, 7)
)["fraud_risk"]
cleaned_df["terminal_fraud_risk_7_day"] = transactions_df.groupby("TERMINAL_ID").apply(
    lambda x: get_count_risk_rolling_window(x, 7, 7)
)["fraud_risk"]
cleaned_df["terminal_fraud_risk_30_day"] = transactions_df.groupby("TERMINAL_ID").apply(
    lambda x: get_count_risk_rolling_window(x, 30, 7)
)["fraud_risk"]


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


Not prepending group keys to t

In [41]:
cleaned_df["day"] = transactions_df["TX_TIME_DAYS"]
cleaned_df["datetime"] = transactions_df["TX_DATETIME"]
cleaned_df["customer_id"] = transactions_df["CUSTOMER_ID"]
cleaned_df["id"] = transactions_df["TRANSACTION_ID"]

In [42]:
pd.concat(
    # Permet l'affichage de quelques transcations frauduleuse et non frauduleuse
    [
        cleaned_df[cleaned_df["is_fraud"] == 1].sample(5, random_state=0),
        cleaned_df[cleaned_df["is_fraud"] == 0].sample(5, random_state=0),
    ]
).sample(10, random_state=0)

Unnamed: 0,amount,is_fraud,is_weekend,is_night,customer_num_transactions_1_day,customer_num_transactions_7_day,customer_num_transactions_30_day,customer_avg_amount_1_day,customer_avg_amount_7_day,customer_avg_amount_30_day,terminal_num_transactions_1_day,terminal_num_transactions_7_day,terminal_num_transactions_30_day,terminal_fraud_risk_1_day,terminal_fraud_risk_7_day,terminal_fraud_risk_30_day,day,datetime,customer_id,id
658128,25.2,1,False,False,5.0,21.0,87.0,16.556,25.194762,25.351379,2.0,2.0,16.0,0.0,1.0,0.315789,80,2018-06-20 13:40:04,4986,773378
237618,57.26,0,True,False,4.0,28.0,117.0,33.1075,35.576429,33.270171,1.0,11.0,34.0,0.0,0.0,0.0,41,2018-05-12 16:15:47,4692,401127
320052,218.55,1,False,False,8.0,25.0,111.0,176.6775,81.9196,45.015495,2.0,9.0,34.0,0.0,0.0,0.0,169,2018-09-17 09:58:57,112,1623245
286832,40.87,0,False,False,5.0,23.0,95.0,30.934,31.351304,31.386421,2.0,6.0,24.0,0.0,0.0,0.0,103,2018-07-13 18:23:45,2782,996405
1012662,29.12,1,False,False,5.0,30.0,123.0,29.14,20.813667,19.03252,3.0,7.0,27.0,0.0,1.0,0.535714,88,2018-06-28 13:04:15,2424,849350
951448,22.16,0,True,False,6.0,36.0,112.0,13.935,17.2975,18.104643,4.0,12.0,37.0,0.0,0.0,0.0,62,2018-06-02 07:24:49,2355,596546
1499528,186.55,0,False,False,2.0,12.0,85.0,149.465,96.156667,92.545294,1.0,7.0,37.0,0.0,0.0,0.0,53,2018-05-24 11:46:44,1094,513249
1418862,2.25,1,False,True,3.0,20.0,90.0,5.86,6.9035,6.469444,6.0,11.0,39.0,0.0,0.0,0.0,177,2018-09-25 01:43:40,4844,1697041
407738,79.68,1,False,False,1.0,8.0,42.0,79.68,67.61875,68.59381,1.0,3.0,24.0,1.0,1.0,0.64,94,2018-07-04 12:09:10,1398,906685
1582131,19.2,0,False,True,3.0,17.0,74.0,50.213333,60.911765,59.624595,1.0,7.0,46.0,0.0,0.0,0.0,116,2018-07-26 05:44:25,681,1113178


# Slicing the dataset

In [46]:
train_df, test_df = get_train_test_set(cleaned_df, datetime(2018, 7, 25), delta_train=21)
train_df, val_df = get_train_test_set(train_df, datetime(2018, 7, 25))

In [47]:
train_df.shape, val_df.shape, test_df.shape

((67240, 20), (58264, 20), (50321, 20))

# Labels determinete

In [48]:
label_columns = ["is_fraud"]
feature_columns = [
    "amount",
    "is_weekend",
    "is_night",
    "customer_num_transactions_1_day",
    "customer_num_transactions_7_day",
    "customer_num_transactions_30_day",
    "customer_avg_amount_1_day",
    "customer_avg_amount_7_day",
    "customer_avg_amount_30_day",
    "terminal_num_transactions_1_day",
    "terminal_num_transactions_7_day",
    "terminal_num_transactions_30_day",
    "terminal_fraud_risk_1_day",
    "terminal_fraud_risk_7_day",
    "terminal_fraud_risk_30_day",
]

train_labels = np.array(train_df[label_columns])
val_labels = np.array(val_df[label_columns])
test_labels = np.array(test_df[label_columns])

train_features = np.array(train_df[feature_columns])
val_features = np.array(val_df[feature_columns])
test_features = np.array(test_df[feature_columns])

In [49]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

In [50]:
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

Training labels shape: (67240, 1)
Validation labels shape: (58264, 1)
Test labels shape: (50321, 1)
Training features shape: (67240, 15)
Validation features shape: (58264, 15)
Test features shape: (50321, 15)


# Le modèle

## Données déséquilibrées / Imbalanced Data

In [51]:
# Nos données sont déséquilibrées, Moins de 1 % de notre ensemble de données contient des transactions frauduleuses.
# calculons les poids pour chaque classe que nous transmettrons à Keras. Les poids rendent les étiquettes frauduleuses 120 fois « plus importantes » que les étiquettes non frauduleuses.

weight_for_not_fraud = (1.0 / not_fraud_count) * total_count / 2.0
weight_for_fraud = (1.0 / fraud_count) * total_count / 2.0

class_weight = {0: weight_for_not_fraud, 1: weight_for_fraud}

class_weight

{0: 0.5042199538481172, 1: 59.74235406307473}

## Structure du modele / Model structure
Taille du lot : 64
Époques : 40
Nombre de couches cachées : 2
Nœuds par couche cachée : 500
Probabilité de couche d'abandon : 0,2
Taux d'apprentissage : 0,001

In [52]:
output_bias = tf.keras.initializers.Constant(np.log([fraud_count / not_fraud_count]))

model = keras.Sequential(
    [
        keras.layers.Dense(
            500, activation="relu", input_shape=(train_features.shape[-1],)
        ),
        keras.layers.Dense(
            500, activation="relu", input_shape=(train_features.shape[-1],)
        ),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, activation="sigmoid", bias_initializer=output_bias),
    ]
)

In [53]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
        keras.metrics.AUC(name="auc"),
        keras.metrics.AUC(name="prc", curve="PR"),
    ],
)

In [54]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 500)               8000      
                                                                 
 dense_1 (Dense)             (None, 500)               250500    
                                                                 
 dropout (Dropout)           (None, 500)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 501       
                                                                 
Total params: 259001 (1011.72 KB)
Trainable params: 259001 (1011.72 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Model training / Entrainement du model

## callback

In [58]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_prc", verbose=1, patience=10, mode="max", restore_best_weights=True
)

In [59]:
BATCH_SIZE = 64
training_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=40,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels),
    class_weight=class_weight,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 12: early stopping


In [61]:
res = []

metrics_to_plot = [
    ("loss", "Loss"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("auc", "Area under ROC curve"),
    ("prc", "Area under PR curve"),
]
fig = make_subplots(rows=len(metrics_to_plot), cols=1)

for metric, name in metrics_to_plot:
    fig = go.Figure(
        data=[
            go.Scatter(
                x=training_history.epoch,
                y=training_history.history[metric],
                mode="lines",
                name="Training",
            ),
            go.Scatter(
                x=training_history.epoch,
                y=training_history.history["val_" + metric],
                mode="lines",
                line={"dash": "dash"},
                name="Validation",
            ),
        ]
    )
    fig.update_yaxes(title=name)
    fig.update_xaxes(title="Epoch")

    if (metric, name) == metrics_to_plot[0]:
        fig.update_layout(
            height=250, title="Training history", margin={"b": 0, "t": 50}
        )
    else:
        fig.update_layout(height=200, margin={"b": 0, "t": 0})
    fig.show()

# Model performance

In [62]:
train_predictions = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions = model.predict(test_features, batch_size=BATCH_SIZE)

predictions_df = pd.DataFrame(
    {"Prediction": train_predictions.ravel(), "Label": train_labels.ravel()}
)
predictions_df = pd.concat(
    [
        predictions_df[predictions_df["Label"] == 0].sample(5000, random_state=0),
        predictions_df[predictions_df["Label"] == 1].sample(500, random_state=0),
    ]
)
fig = px.histogram(
    predictions_df,
    x="Prediction",
    title="Prediction values",
    color="Label",
    marginal="box",
    labels={"0": "Legitimate", "1": "Fraudulent"},
)
fig.update_traces(opacity=0.75)
fig.update_layout(barmode="overlay")
fig.show()



In [63]:
def make_roc_df(name, predictions, labels):
    fp, tp, _ = sklearn.metrics.roc_curve(labels, predictions)
    return pd.DataFrame({"fp": fp * 100, "tp": tp * 100, "Dataset": name})


roc_df = pd.concat(
    [
        make_roc_df("Training", train_predictions, train_labels),
        make_roc_df("Test", test_predictions, test_labels),
    ]
)

fig = px.line(
    roc_df,
    title="ROC Curve",
    x="fp",
    y="tp",
    color="Dataset",
    labels={"fp": "False Positives (%)", "tp": "True Positives (%)"},
)
fig.update_yaxes(range=[60, 100])
fig.update_traces(line={"dash": "dash"}, selector={"name": "test"})
fig.show()