# Fraud Detection from Credit Card History, Machine learning algorithms
The following jupyter notebook contains a binary classification for fraud detection, from a credit card history. For this; we will explore the following three machine learning algorithms:
* Logistic Regression
* Decision Tree
* Linear Support Vector Machine

This jupyter notebook would showcase the following:
1. Confusion matrix for each of the models
2. Cross validation metrics (precision, recall, f1_score, accuracy_score).
3. Plot of probability distributions between real test data vs each models predictions.

In [1]:
import sys
import os
import pandas as pd
import copy
import matplotlib.pyplot as plt

# Get the root project path
root_project_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

#Append it to sys
sys.path.append(root_project_path)

#Import the necessary modules
from utils import DataLoader, CreditCardPreprocesser

  from .autonotebook import tqdm as notebook_tqdm


### Load the data

In [2]:
#Set the folder name and data folder
folder_name = "data"
data_holder_path = os.path.join(os.getcwd(), os.pardir)

#Set the folder name
data_loader = DataLoader(data_folder_name=folder_name,
    data_folder_path=data_holder_path)

#Get the data
df_data = data_loader.get_dataset()

### Get the preprocessed dataframe

In [3]:
#Create an instance of the Credit card processer
credit_card_processer = CreditCardPreprocesser(df_data=df_data)

#Obtain the df_preprocessed
df_preprocessed = credit_card_processer.fetch_preprocessed_dataframe()

In [4]:
df_preprocessed.shape

(1296675, 96)

In [5]:
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 96 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   cc_num                   1296675 non-null  int64  
 1   amt                      1296675 non-null  float64
 2   gender                   1296675 non-null  int64  
 3   zip                      1296675 non-null  int64  
 4   lat                      1296675 non-null  float64
 5   long                     1296675 non-null  float64
 6   city_pop                 1296675 non-null  int64  
 7   unix_time                1296675 non-null  int64  
 8   merch_lat                1296675 non-null  float64
 9   merch_long               1296675 non-null  float64
 10  is_fraud                 1296675 non-null  int64  
 11  merch_zipcode            1296675 non-null  float64
 12  transaction_year         1296675 non-null  int32  
 13  transaction_month        1296675 non-null 

### Select the X and Y target

In [4]:
X: pd.DataFrame = df_preprocessed[[col for col in df_preprocessed.columns if col != "is_fraud"]]
y: pd.DataFrame = df_preprocessed["is_fraud"]

In [7]:
y.value_counts()

is_fraud
0    1289169
1       7506
Name: count, dtype: int64

### Lets now oversample it using SMOTE

In [5]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

#Obtain the over sampled new values
X_smote, y_smote = smote.fit_resample(X.astype("float"), y)

In [7]:
y_smote.value_counts()

is_fraud
0    1289169
1    1289169
Name: count, dtype: int64

### Select the continous columns:

We will select the continous columns; where we are going to apply our `Standardscaler()` instance from scikit-learn to perform standardscaling on ONLY the continous features.

In [6]:
#List all the continous features
continous_features = ["cc_num", "amt", "zip", "lat", "long", "city_pop", "unix_time",
    "merch_lat", "merch_long", "merch_zipcode", "transaction_year", "transaction_month",
    "transaction_day", "transaction_hour", "transaction_minute", "transaction_second",
    "birth_year", "birth_month", "birth_day", "merchant_encoded", "merchant_freq",
    "first_encoded", "first_freq", "last_encoded", "last_freq", "street_encoded",
    "street_freq", "city_encoded", "city_freq", "job_encoded", "job_freq"]

#Select the continous and not continous 
X_smote_continous = X_smote[continous_features]
X_smote_discontinous = X_smote[[c for c in X_smote.columns if c not in continous_features]]

In [7]:
X_smote_discontinous.head(3)

Unnamed: 0,gender,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Implement the standardscaler to continous features

In [8]:
# Importing standard scaler
from sklearn.preprocessing import StandardScaler

#Create a standard scaler object and fit x_train
standard_scaler = StandardScaler()
standard_scaler.fit(X_smote_continous)

#Transform x_train and x_test
X_continous_scaled = standard_scaler.transform(X_smote_continous)

In [9]:
# Now lets make a pandas dataframe
X_continous_scaled = pd.DataFrame(X_continous_scaled,
    columns=X_smote_continous.columns)

In [10]:
X_continous_scaled.shape

(2578338, 31)

In [13]:
X_continous_scaled.head(3)

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,merch_zipcode,...,first_encoded,first_freq,last_encoded,last_freq,street_encoded,street_freq,city_encoded,city_freq,job_encoded,job_freq
0,-0.314146,-0.849665,-0.730733,-0.493568,0.635088,-0.285761,-1.755668,-0.502642,0.572527,-0.74898,...,-0.081514,1.395395,-0.301663,-0.634622,-0.348284,0.578598,-0.335459,0.201721,-0.255531,-0.059194
1,-0.316236,-0.553411,1.873809,2.008165,-2.005238,-0.296425,-1.755666,2.048246,-2.001306,-0.015091,...,-0.201122,0.425288,-0.22725,-0.472387,-0.348284,1.795535,-0.335459,1.620086,-0.243193,0.686028
2,-0.316207,-0.226389,1.286156,0.698217,-1.581124,-0.28366,-1.755666,0.882526,-1.571696,1.583852,...,-0.274487,-0.747764,-0.054028,0.176073,-0.348284,-1.273525,-0.335459,-1.224123,0.115153,-1.51415


### Create the actual datasets.

In [10]:
# These are the new datasets
X_data = pd.concat([X_continous_scaled, X_smote_discontinous], axis=1)
y_data = copy.copy(y_smote)

In [17]:
# Print the shapes
print(X_data.shape)
print(y_data.shape)

(2578338, 95)
(2578338,)


### Initialize the following classifiers
Now we will initialize each of the classifiers, and perform cross validation to obtain all the different metrics.

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [28]:
class LSTMModel:
    def __init__(self, input_shape, epochs=20, batch_size=64, learning_rate=0.001, patience=2):
        self.input_shape = input_shape
        self.epochs = epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.patience = patience

        # Use GPU if available, otherwise fallback to CPU
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize the LSTM model
        self.model = LSTMNet(input_size=input_shape[1], hidden_size=32, output_size=1).to(self.device)
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def prepare_data(self, X, y):
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.to_numpy()

        # Convert data to tensors (remain on CPU initially)
        X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(2)
        y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

        # Create DataLoader
        dataset = TensorDataset(X_tensor, y_tensor)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        return dataloader

    def fit(self, dataloader):
        for epoch in range(self.epochs):
            self.model.train()
            epoch_loss = 0
            for X_batch, y_batch in dataloader:
                # Move batch to GPU
                X_batch, y_batch = X_batch.to(self.device), y_batch.to(self.device)

                # Zero gradients
                self.optimizer.zero_grad()

                # Forward pass
                outputs = self.model(X_batch).squeeze()
                loss = self.criterion(outputs, y_batch.squeeze())

                # Backward pass and optimization
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()

            # Print training loss for the epoch
            print(f"Epoch {epoch + 1}/{self.epochs}, Training Loss: {epoch_loss:.4f}")

            # Clear GPU memory after each epoch
            torch.cuda.empty_cache()

        return self


    def predict(self, X):
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()

        # Convert data to tensor and move to GPU
        X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(2).to(self.device)
        self.model.eval()

        with torch.no_grad():
            predictions = self.model(X_tensor).squeeze().cpu().numpy()

        return (predictions > 0.5).astype(int)


class LSTMNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMNet, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # LSTM forward pass
        lstm_out, _ = self.lstm(x)
        # Use the last hidden state for classification
        last_hidden = lstm_out[:, -1, :]
        output = self.fc(last_hidden)
        return self.sigmoid(output)


In [29]:
ml_classifiers = {
    # "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    # "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    # "Linear Support Vector Machine": LinearSVC(C=1.0, max_iter=1000),
    "LSTM Classifier": LSTMModel(input_shape=(X_data.shape[1], 1), epochs=20, batch_size=128, learning_rate=0.001)
}

### Split the data
We would like to perform the following:
1. Split our data into train_validation and test; we would hold the test dataset for a final analysis.
2. Perform cross validation using train_validation; and perform shuffled folds to see its accuracies.
3. Keep the saved cross val scores; for later analysis.

In [30]:
# Obtain the data into train_val and test
x_train_val, x_test, y_train_val, y_test = train_test_split(X_data, y_data,
    test_size=0.2, shuffle=True, random_state=42)
print(x_train_val.shape)
print(x_test.shape)
print(y_train_val.shape)
print(y_test.shape)

(2062670, 95)
(515668, 95)
(2062670,)
(515668,)


### Perform Cross validation for all ML algorithms
Perform cross validation for each of the ML algorithms, to obtain its results.

In [32]:
# Create the holders for each metrics
ml_metrics = {}

for name, clf in ml_classifiers.items():
    print(f"\n==========={name}============ Starting")
    # Initialize variables for each
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # Perform 20 loops for all classifiers
    for i in range(20):
        print(f"Iteration: {i} - {name}")
        # Call the train test split
        x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, shuffle=True)

        # Handle LSTM separately
        if name == "LSTM Classifier":
            # Prepare the DataLoader for training
            dataloader = clf.prepare_data(x_train, y_train)
            clf.fit(dataloader)  # Removed val_data
            y_pred = clf.predict(x_val)
        else:
            # Fit other classifiers
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_val)

        # Append metrics
        accuracies.append(accuracy_score(y_val, y_pred))
        precisions.append(precision_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
        f1_scores.append(f1_score(y_val, y_pred))

        # Clear GPU memory for LSTM
        if name == "LSTM Classifier":
            del clf.model
            torch.cuda.empty_cache()

    # Add the metrics
    ml_metrics[name + "_accuracies"] = accuracies
    ml_metrics[name + "_precisions"] = precisions
    ml_metrics[name + "_recalls"] = recalls
    ml_metrics[name + "_f1_scores"] = f1_scores

    # Create a dataframe and save it into a dataframe
    df_results = pd.DataFrame(ml_metrics)
    df_results.to_csv('my_data_' + name.replace(" ", "_") + '.csv', index=False)
    print(f"\n==========={name}============ Ending")




Iteration: 0 - LSTM Classifier
Epoch 1/20, Training Loss: 8377.7847
Epoch 2/20, Training Loss: 4591.8136
Epoch 3/20, Training Loss: 3433.6091
Epoch 4/20, Training Loss: 3099.3454
Epoch 5/20, Training Loss: 3165.7602
Epoch 6/20, Training Loss: 4940.2036
Epoch 7/20, Training Loss: 6066.3642
Epoch 8/20, Training Loss: 6230.3784
Epoch 9/20, Training Loss: 4841.6575
Epoch 10/20, Training Loss: 4032.4728
Epoch 11/20, Training Loss: 4886.6333
Epoch 12/20, Training Loss: 6584.0046
Epoch 13/20, Training Loss: 6096.7374
Epoch 14/20, Training Loss: 6260.2776
Epoch 15/20, Training Loss: 6464.3083
Epoch 16/20, Training Loss: 6781.3651
Epoch 17/20, Training Loss: 7637.2723
Epoch 18/20, Training Loss: 7669.5679
Epoch 19/20, Training Loss: 8258.2390
Epoch 20/20, Training Loss: 8276.2994


OutOfMemoryError: CUDA out of memory. Tried to allocate 123.33 GiB. GPU 0 has a total capacity of 23.99 GiB of which 10.20 GiB is free. Of the allocated memory 7.21 GiB is allocated by PyTorch, and 4.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Data Visualization

Now that we have finally got the cross validation metrics, we will perform the last steps in order to see the different results of each of the ML models. This would be illustrated as:

1. Plot bar plot of accuracies, precision, recall, and f1 score for cross validation metrics of each of the models.

2. Next; after having the cross validation metrics; we previously have isolated a part of the data as test set. We are going to train a final model for each ML classifier, and obtain the test metrics as well. With these test and cross validation metrics; we are going to perform the following:
3. Plot bar plot of accuracies, precision, recall, and f1 score for test metrics of each of the models.
4. Generate confusion matrices for each of the test final models.
5. Create ROC curves for all models and overlay for comparison

In [None]:
from typing import List
def read_df_data(clfs_names:List[str]) -> pd.DataFrame:
    """
    Function that would retrieve the checkpoints cross validation
    values for accuracy, precision, recall, and f1 score.
    """
    #Create a holder to store the data
    actual_data = {}

    for name in clfs_names:
        #Read the dataframe
        df_data = pd.read_csv("my_data_" + name + ".csv")

        #Assign it to the actual data properly
        actual_data[name + "_accuracies"] = df_data[name + "_accuracies"].to_numpy()
        actual_data[name + "_precisions"] = df_data[name + "_precisions"].to_numpy()
        actual_data[name + "_recalls"] = df_data[name + "_recalls"].to_numpy()
        actual_data[name + "_f1_scores"] = df_data[name + "_f1_scores"].to_numpy()
    
    #Return the total dataframe.
    return pd.DataFrame(actual_data)

In [None]:
#Get the cross validation metrics
df_cross_val_metrics = read_df_data(clfs_names=ml_classifiers.keys())

print(ml_classifiers.keys())


In [None]:
df_cross_val_metrics.columns

In [56]:
df_cross_val_metrics.head(2)

Unnamed: 0,Logistic Regression_accuracies,Logistic Regression_precisions,Logistic Regression_recalls,Logistic Regression_f1_scores,Decition Tree Classifier_accuracies,Decition Tree Classifier_precisions,Decition Tree Classifier_recalls,Decition Tree Classifier_f1_scores,Linear Support Vector Machine_accuracies,Linear Support Vector Machine_precisions,Linear Support Vector Machine_recalls,Linear Support Vector Machine_f1_scores
0,0.899715,0.928023,0.866518,0.896217,0.998449,0.997649,0.999251,0.998449,0.885267,0.9217,0.842319,0.880223
1,0.899538,0.927577,0.866385,0.895937,0.998449,0.997624,0.999284,0.998453,0.884928,0.919574,0.843096,0.879676


### Plot the cross validation bar plots

In [None]:
# Create two lists for a new dataframe, models and metrics
models = ml_classifiers.keys()
metrics = ["accuracies", "precisions", "recalls", "f1_scores"]

#Lets do a df plot cross val
plot_cross_val = []
for model in models:
    model_data = [df_cross_val_metrics[f"{model}_{metric}"].mean() for metric in metrics]
    plot_cross_val.append(model_data)

# Convert to a Pandas DataFrame for plotting
df_plot_cross_val = pd.DataFrame(plot_cross_val, columns=metrics, index=models)

In [None]:
df_plot_cross_val.head(3)

### Plot the bar graph of cross val metrics

In [None]:
fig, axs = plt.subplots(1, figsize = (12, 6))
df_plot_cross_val.T.plot(ax=axs, kind="bar", legend=True, width = 0.8)

# Rotate x-axis ticks
axs.set_xticklabels(axs.get_xticklabels(), rotation=90, fontsize=10)

# Annotate bar values
for container in axs.containers:
    axs.bar_label(container, fmt='%.4f', fontsize=10, padding=3)

axs.set_ylim(0.83, 1.01)

### Train the final test models!
Even thought its obvious the decision tree classifier is the best model between all the ML models; now we will perform the final test models and retrieve the metrics for each of them.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Create the holders for each metrics
test_metrics = []
conf_matrices = []
class_reports = []
roc_auc_scores = []

for name, clf in ml_classifiers.items():
    print(f"\n==========={name}============ Starting")

    # Fit the model with x_train_val and the y_train_val dataset now!
    clf.fit(X=x_train_val, y=y_train_val)

    #Obtain the predictions for both
    y_pred = clf.predict(x_test)

    # Append them in order per row (accuracy, precision, recall, f1_score)
    test_metrics.append(accuracy_score(y_test, y_pred))
    test_metrics.append(precision_score(y_test, y_pred))
    test_metrics.append(recall_score(y_test, y_pred))
    test_metrics.append(f1_score(y_test, y_pred))

    # Append the confusion matrix
    conf_matrices.append(confusion_matrix(y_test, y_pred))
    class_reports.append(classification_report(y_test, y_pred))
    roc_auc_scores.append(roc_auc_score(y_test, y_pred))

    print(f"\n==========={name}============ Ending")

In [None]:
print(test_metrics)

In [None]:
# Convert this to a numpy array
import numpy as np
np_test_metrics = np.array(test_metrics)
np_test_metrics = np_test_metrics.reshape(3,4)

In [None]:
print(np_test_metrics)

In [71]:
#Create a dataframe of the model the test metrics
df_test_results = pd.DataFrame(np_test_metrics, columns=metrics, index=models)
print(df_test_results.head(4))

                               accuracies  precisions   recalls  f1_scores
Logistic Regression              0.900360    0.929433  0.867051   0.897159
Decition Tree Classifier         0.998751    0.998118  0.999393   0.998755
Linear Support Vector Machine    0.886353    0.922740  0.843939   0.881582


In [None]:
fig, axs = plt.subplots(1, figsize = (12, 6))
df_test_results.T.plot(ax=axs, kind="bar", legend=True, width = 0.8)

# Rotate x-axis ticks
axs.set_xticklabels(axs.get_xticklabels(), rotation=90, fontsize=10)

# Annotate bar values
for container in axs.containers:
    axs.bar_label(container, fmt='%.4f', fontsize=10, padding=3)

axs.set_ylim(0.83, 1.01)

### Print each of the classification reports

In [None]:
for i, name in enumerate(ml_classifiers):
    print(f"\n==========={name}============ class report")
    print(class_reports[i])

### Lets lot the confusion matrices now

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

for i, name in enumerate(ml_classifiers):
    cm = conf_matrices[i]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                display_labels=clf.classes_)
    disp.plot()
    plt.suptitle(name)

### Finally, print the ROC auc scores

In [None]:
for i, name in enumerate(ml_classifiers):
    print(f"{name} ROC_AUC_Scores: {roc_auc_scores[i]}")