## Inference on TabFormer Data
This notebook loads a pre-trained GNN (GraphSAGE) model and an XGBoost model and runs inference on raw data.

### Goals
* Outline the steps to transform new raw data before feeding it into the models.
* Simulate the use of trained models on new data during inference.

In [2]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Copy the dataset folder from the mounted Drive into /content.
# NOTE(review): later cells read from the relative path './data/TabFormer', which
# presumably assumes the working directory is /content — confirm.
!cp -r /content/drive/MyDrive/VNPT/data /content/

#### Import packages

In [4]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [5]:
import pandas as pd
import cudf
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
import os
import xgboost as xgb

##### Path to the pre-trained GraphSAGE and the XGBoost models

In [7]:
# Root folder of the TabFormer dataset; the trained models sit in its 'models' subfolder.
dataset_base_path = './data/TabFormer'
model_root_dir = os.path.join(dataset_base_path, 'models')

# GraphSAGE checkpoint and XGBoost model file, both located inside `model_root_dir`.
gnn_model_path, xgb_model_path = (
    os.path.join(model_root_dir, file_name)
    for file_name in ('node_embedder.pth', 'embedding_based_xgb_model.json')
)

#### Definition of the trained GraphSAGE model

In [8]:
class GraphSAGE(torch.nn.Module):
    """GraphSAGE network: a stack of ``SAGEConv`` layers followed by a linear output layer.

    Parameters
    ----------
    in_channels : int
        Input size of the first ``SAGEConv`` layer.
    hidden_channels : int
        Output size of every ``SAGEConv`` layer and input size of the linear layer ``fc``.
    out_channels : int
        Output size of the linear layer ``fc``.
    n_hops : int
        Number of ``SAGEConv`` layers. One layer is always created; ``n_hops - 1``
        further layers are appended.
    dropout_prob : float, default 0.25
        Dropout probability applied after every conv layer while the module is in
        training mode.
    """

    # Probability used by instances that carry no `dropout_prob` attribute, e.g. a
    # whole-module checkpoint that was pickled before the attribute was stored.
    # It equals the value that used to be hard-coded in `forward`.
    _FALLBACK_DROPOUT_PROB = 0.5

    def __init__(self, in_channels, hidden_channels, out_channels, n_hops, dropout_prob=0.25):
        super(GraphSAGE, self).__init__()

        # Keep the requested probability so that `forward` actually honours it
        self.dropout_prob = dropout_prob

        # list of conv layers
        self.convs = nn.ModuleList()
        # add first conv layer to the list
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        # add the remaining conv layers to the list
        for _ in range(n_hops - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))

        # output layer
        self.fc = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, return_hidden=False):
        """Run every conv layer (each followed by ReLU and dropout) over the input.

        Parameters
        ----------
        x : torch.Tensor
            Node features handed to the first conv layer.
        edge_index : torch.Tensor
            Edge index handed unchanged to every conv layer.
        return_hidden : bool, default False
            If True, return the output of the last conv stage (the node embeddings)
            instead of passing it through ``fc``.

        Returns
        -------
        torch.Tensor
            The hidden representation when ``return_hidden`` is True, otherwise ``fc``
            applied to it.
        """
        # Unpickled instances skip __init__, so the attribute may legitimately be absent
        dropout_p = getattr(self, 'dropout_prob', self._FALLBACK_DROPOUT_PROB)

        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            # No-op in eval mode because `training` is False there
            x = F.dropout(x, p=dropout_p, training=self.training)

        if return_hidden:
            return x
        else:
            return self.fc(x)

### Load the models

##### Load the pre-trained GraphSAGE model

In [10]:
# Load the GNN model used for generating node embeddings.
# NOTE(security): weights_only=False unpickles a complete module object, which can
# execute arbitrary code — only load checkpoints that come from a trusted source.
# map_location places the weights on the device available in this session instead of
# the device they were saved from, so the model and its inputs end up on the same device.
gnn_model = torch.load(
    gnn_model_path,
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    weights_only=False,
)
gnn_model.eval()  # Set the model to evaluation mode (dropout becomes a no-op)

GraphSAGE(
  (convs): ModuleList(
    (0): SAGEConv(74, 32, aggr=mean)
  )
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

##### Load the pre-trained XGBoost model

In [11]:
# Restore the XGBoost booster used for node classification from its saved model file
loaded_bst = xgb.Booster(model_file=xgb_model_path)

#### Define a function to evaluate the XGBoost model

In [12]:
from cuml.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import cupy as cp
from torch.utils.dlpack import to_dlpack

def evaluate_xgboost(bst, embeddings, labels, threshold=0.5):
    """
    Evaluates the XGBoost model on node embeddings and prints several metrics.

    Parameters:
    ----------
    bst : xgboost.Booster
        The trained XGBoost model to be evaluated.
    embeddings : torch.Tensor
        The input feature embeddings for transaction nodes.
    labels : torch.Tensor
        The target labels (Fraud or Non-fraud) of the transactions, with the same
        length as the number of rows in `embeddings`.
    threshold : float, default 0.5
        Predicted scores strictly above this value are counted as class 1.

    Returns:
    -------
    The confusion matrix computed by `cuml.metrics.confusion_matrix` (it is also printed).
    """

    # Convert embeddings to a cuDF DataFrame via DLPack. The tensor is detached first
    # so the export does not depend on whether it still carries autograd history.
    embeddings_cudf = cudf.DataFrame(cp.from_dlpack(to_dlpack(embeddings.detach())))

    # Create DMatrix for the test embeddings
    dtest = xgb.DMatrix(embeddings_cudf)

    # Predict scores with XGBoost and threshold them into hard labels
    preds = bst.predict(dtest)
    pred_labels = (preds > threshold).astype(int)

    # Move labels to CPU once; every metric below reuses this array
    labels_cpu = labels.cpu().numpy()

    # Compute evaluation metrics
    accuracy = accuracy_score(labels_cpu, pred_labels)
    precision = precision_score(labels_cpu, pred_labels, zero_division=0)
    recall = recall_score(labels_cpu, pred_labels, zero_division=0)
    f1 = f1_score(labels_cpu, pred_labels, zero_division=0)

    # ROC AUC is undefined when only one class is present; report NaN instead of
    # letting roc_auc_score raise and abort the whole evaluation.
    if len(set(labels_cpu.tolist())) > 1:
        roc_auc = roc_auc_score(labels_cpu, preds)
    else:
        roc_auc = float('nan')

    print(f"Performance of XGBoost model trained on node embeddings")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    conf_mat = confusion_matrix(labels_cpu, pred_labels)
    print('Confusion Matrix:', conf_mat)

    return conf_mat

___
### Evaluate the XGBoost model on untransformed test data (saved in the preprocessing notebook)

##### Read untransformed data

In [13]:
# Opt in to pandas' future behaviour of not silently downcasting dtypes
pd.options.future.no_silent_downcasting = True

# Read the untransformed test split stored in the dataset's 'xgb' folder
path_to_untransformed_data = os.path.join(dataset_base_path, 'xgb', 'untransformed_test.csv')
untransformed_df = pd.read_csv(path_to_untransformed_data)

# Preview the first five rows
untransformed_df.head()

Unnamed: 0,Amount,Errors,Card,Chip,City,Zip,MCC,Merchant,Fraud
0,488.0,XX,10,Chip Transaction,Rome,0.0,3684,-7807051024009846392,1
1,14.22,XX,10,Chip Transaction,Rome,0.0,5311,9057735476014445185,1
2,13.79,XX,10,Chip Transaction,Rome,0.0,4214,6098563624419731578,1
3,12.27,XX,10,Chip Transaction,Rome,0.0,5812,7069584154815291371,1
4,38.63,XX,10,Chip Transaction,Rome,0.0,5921,3017176960763408508,1


#### Load the data transformer and transform the data using the loaded transformer

In [15]:
!pip install category-encoders==2.6.4 -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/82.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
# NOTE(security): pickle.load can execute arbitrary code; only open a preprocessor
# file that comes from a trusted source.
preprocessor_path = os.path.join(dataset_base_path, 'preprocessor.pkl')
with open(preprocessor_path, 'rb') as f:
    loaded_transformer = pickle.load(f)

# Every column except the last one is handed to the pre-fitted transformer
transformed_data = loaded_transformer.transform(untransformed_df.iloc[:, :-1])

##### Evaluate the model on the transformed data

In [17]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Features: the transformed data as a float32 tensor on `device`
X = torch.tensor(transformed_data, dtype=torch.float32, device=device)
# Labels: the last column of the untransformed frame as an int64 tensor on `device`
y = torch.tensor(untransformed_df.iloc[:, -1].to_numpy(), dtype=torch.long, device=device)

In [18]:
# Generate node embeddings using the GNN model.
# No edges are supplied: the edge index is an empty 2 x 0 tensor, created with the
# int64 dtype that PyTorch Geometric documents for `edge_index`.
# torch.no_grad() keeps inference from recording an autograd graph, so the result
# does not require gradients.
with torch.no_grad():
    test_embeddings = gnn_model(
        X.to(device),
        torch.empty((2, 0), dtype=torch.long, device=device),
        return_hidden=True,
    )

In [19]:
# Score the XGBoost model on the test embeddings; the metrics are printed by the function.
# The trailing semicolon keeps the cell from echoing any return value.
evaluate_xgboost(bst=loaded_bst, embeddings=test_embeddings, labels=y);

Performance of XGBoost model trained on node embeddings
Accuracy: 0.9057
Precision: 0.4162
Recall: 0.4475
F1 Score: 0.4313
ROC AUC: 0.8883
Confusion Matrix: [[22715  1310]
 [ 1153   934]]


___
## Predictions on raw input
This section demonstrates how the trained models are used at inference time, starting from raw (untransformed) transactions.

##### Read raw data

In [20]:
# Load the example raw transactions
raw_file_path = os.path.join(dataset_base_path, 'xgb', 'example_transactions.csv')
data = pd.read_csv(raw_file_path)

# Discard the last column, then keep an untouched copy of the remaining raw columns
data = data.drop(columns=data.columns[-1])
original_data = data.copy()

### Transform raw data
* Perform the same set of transformations on the raw data as was done on the training data.

#### Rename columns before the data is fed into the pre-fitted data transformer

In [21]:
# Column names expected by the pre-fitted data transformer
COL_USER = 'User'
COL_CARD = 'Card'
COL_AMOUNT = 'Amount'
COL_MCC = 'MCC'
COL_TIME = 'Time'
COL_DAY = 'Day'
COL_MONTH = 'Month'
COL_YEAR = 'Year'

COL_MERCHANT = 'Merchant'
COL_STATE ='State'
COL_CITY ='City'
COL_ZIP = 'Zip'
COL_ERROR = 'Errors'
COL_CHIP = 'Chip'


# Map the raw CSV headers onto the names above. Reassigning the renamed frame
# (instead of `_ = data.rename(..., inplace=True)`) avoids in-place mutation and
# leaves IPython's special `_` output variable alone.
data = data.rename(columns={
    "Merchant Name": COL_MERCHANT,
    "Merchant State": COL_STATE,
    "Merchant City": COL_CITY,
    "Errors?": COL_ERROR,
    "Use Chip": COL_CHIP
})

#### Handle unknown values as was done for the training data

In [22]:
# Placeholders used for missing values, and the multiplier used when combining user and card ids
UNKNOWN_STRING_MARKER = 'XX'
UNKNOWN_ZIP_CODE = 0
MAX_NR_CARDS_PER_YEAR = 9

# Fill the gaps column by column: State and Errors get the string marker, Zip gets the zip placeholder
for column_name, fill_value in (
    (COL_STATE, UNKNOWN_STRING_MARKER),
    (COL_ERROR, UNKNOWN_STRING_MARKER),
    (COL_ZIP, UNKNOWN_ZIP_CODE),
):
    data[column_name] = data[column_name].fillna(fill_value)

#### Convert column type and remove "$" and "," as was done for the training data

In [23]:
# Amount: strip the literal "$" and convert to float.
# regex=False makes "$" a plain character regardless of the pandas version's default
# (as a regular expression "$" is the end-of-string anchor). Casting to str first lets
# the cell also run when the column is already numeric, e.g. when the cell is re-run.
data[COL_AMOUNT] = (
    data[COL_AMOUNT].astype('str').str.replace("$", "", regex=False).astype("float")
)

# State and Merchant are handled as strings
data[COL_STATE] = data[COL_STATE].astype('str')
data[COL_MERCHANT] = data[COL_MERCHANT].astype('str')

# Errors: remove every literal ","
data[COL_ERROR] = data[COL_ERROR].str.replace(",", "", regex=False)

#### Combine User and Card to generate unique numbers as was done for the training data

In [24]:
# Combine User and Card into one number: User * MAX_NR_CARDS_PER_YEAR + Card.
# The card index is read from `original_data` (the untouched copy of the raw input)
# rather than from `data[COL_CARD]`, which this very cell overwrites; re-running the
# cell therefore yields the same ids instead of adding the user offset again.
data[COL_CARD] = (
    data[COL_USER] * MAX_NR_CARDS_PER_YEAR + original_data[COL_CARD]
).astype('int')

##### Check if the transactions have unknown users or merchants

In [25]:
# Collect the merchants and (user, card) ids stored in the fitted encoder's ordinal
# mapping, i.e. the ones that were present in the training data.
ordinal_mappings = {
    enc['col']: enc['mapping']
    for enc in loaded_transformer.named_transformers_['binary'].named_steps['binary'].ordinal_encoder.mapping
}

# A column without an entry in the mapping yields an empty set
known_merchants = (
    set(ordinal_mappings[COL_MERCHANT].keys()) if COL_MERCHANT in ordinal_mappings else set()
)
known_cards = (
    set(ordinal_mappings[COL_CARD].keys()) if COL_CARD in ordinal_mappings else set()
)

In [26]:
# Flag each transaction whose combined (user, card) id is among the known ones
data['Is_card_known'] = data[COL_CARD].map(known_cards.__contains__)

In [27]:
# Flag each transaction whose merchant is among the known ones
data['Is_merchant_known'] = data[COL_MERCHANT].map(known_merchants.__contains__)

#### Use the same set of predictor columns as used for training

In [28]:
# Predictor columns, split by kind
numerical_predictors = [COL_AMOUNT]
nominal_predictors = [
    COL_ERROR,
    COL_CARD,
    COL_CHIP,
    COL_CITY,
    COL_ZIP,
    COL_MCC,
    COL_MERCHANT,
]

# Numerical predictors first, then the nominal ones
predictor_columns = [*numerical_predictors, *nominal_predictors]

##### Load the data transformer and transform the raw data

In [29]:
# NOTE(security): pickle.load can execute arbitrary code; only open a preprocessor
# file that comes from a trusted source.
preprocessor_path = os.path.join(dataset_base_path, 'preprocessor.pkl')
with open(preprocessor_path, 'rb') as f:
    loaded_transformer = pickle.load(f)

# Apply the pre-fitted transformer to the predictor columns of the raw data
transformed_data = loaded_transformer.transform(data[predictor_columns])

#### Run prediction

In [30]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Transformed features as a float32 tensor on `device`
X = torch.tensor(transformed_data, dtype=torch.float32, device=device)

In [31]:
# Generate node embeddings using the GraphSAGE model.
# No edges are supplied: the edge index is an empty 2 x 0 tensor, created with the
# int64 dtype that PyTorch Geometric documents for `edge_index`.
# torch.no_grad() keeps inference from recording an autograd graph, so the embeddings
# do not require gradients when they are exported through DLPack below.
with torch.no_grad():
    transaction_embeddings = gnn_model(
        X.to(device),
        torch.empty((2, 0), dtype=torch.long, device=device),
        return_hidden=True,
    )

# Hand the embeddings to cuDF via DLPack
embeddings_cudf = cudf.DataFrame(cp.from_dlpack(to_dlpack(transaction_embeddings)))

In [32]:
# Score the transaction embeddings with the XGBoost model
embeddings_dmatrix = xgb.DMatrix(embeddings_cudf)
preds = loaded_bst.predict(embeddings_dmatrix)

# Scores above 0.5 are labelled 1 (fraud), all others 0
pred_labels = (preds > 0.5).astype(int)

#### If a transaction has an unknown (user, card) pair or an unknown merchant, mark it as fraud

In [33]:
# Name of the target column
target_col_name = 'Is Fraud?'

# A transaction is flagged as fraud when the model predicted 1, or when its
# (user, card) id or its merchant is not among the known ones.
# Column-wise boolean operations replace the former row-by-row `DataFrame.apply`,
# which is slow and does not return a Series when the frame has no rows.
predicted_fraud = pd.Series(pred_labels, index=data.index).eq(1)
unknown_card = ~data['Is_card_known'].astype(bool)
unknown_merchant = ~data['Is_merchant_known'].astype(bool)

data[target_col_name] = predicted_fraud | unknown_card | unknown_merchant

#### Label the raw data as Fraud or Non-Fraud, based on prediction

In [34]:
# Translate the boolean flag into text: False -> 'No' (non-fraud), True -> 'Yes' (fraud)
binary_to_fraud = {False: 'No', True: 'Yes'}

fraud_text = data[target_col_name].map(binary_to_fraud)
data[target_col_name] = fraud_text.astype('str')

# Attach the same labels to the untouched copy of the raw input
original_data[target_col_name] = data[target_col_name]

#### Raw data with predicted labels (Fraud or Non-Fraud)

In [35]:
# Display the raw transactions together with their predicted 'Is Fraud?' label
original_data

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,1999,1,2020,2,27,09:31,$120.00,Chip Transaction,-4282466774399734331,Berlin,NH,3570.0,4829,,No
1,1999,1,2020,2,27,11:36,$12.91,Chip Transaction,3414527459579106770,Nashua,NH,3064.0,5651,,No
2,1999,1,2020,2,27,20:18,$15.52,Chip Transaction,97032797689821735,Merrimack,NH,3054.0,5411,,No
3,1999,1,2020,2,27,20:29,$56.67,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,4121,,No
4,1999,1,2020,2,27,22:18,$63.43,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,5541,,No
5,1999,1,2020,2,27,22:23,$-54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,5541,,No
6,1999,1,2020,2,27,22:24,$54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,5541,,No
7,1999,1,2020,2,28,07:43,$59.15,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,4121,,No
8,1999,1,2020,2,28,20:10,$43.12,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,4121,,No
9,1999,1,2020,2,28,23:10,$45.13,Chip Transaction,4751695835751691036,Merrimack,NH,3054.0,5814,,No
