In [1]:
# Fraud classification on the PaySim dataset using learned embeddings of account IDs.
import pandas as pd
import torch
device = torch.device('cpu')
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [2]:
# Load the PaySim CSV.
df = pd.read_csv("../data/PaySim_kaggle.csv")

# Work on a random subsample.  The draw is seeded so that a fresh
# "Restart & Run All" selects the same rows, and the size is capped at the
# file length because DataFrame.sample raises when n exceeds the number of
# rows (sampling without replacement).
sample_size = 5_000_000  # e.g. 500_000 for a quicker experiment
df = df.sample(n=min(sample_size, len(df)), random_state=42)

print(df.head())

         step      type      amount     nameOrig  oldbalanceOrg  \
1294621   136   PAYMENT    21458.54   C715465815        9315.97   
2440721   203   PAYMENT    12971.43   C514521992       11426.00   
5944907   405  TRANSFER  2652547.31   C190920374           0.00   
837236     41  TRANSFER   129747.94   C225903766           0.00   
2636347   209  CASH_OUT   330022.08  C1951388404         236.00   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
1294621             0.0  M1092586090            0.00            0.00        0   
2440721             0.0  M1306897095            0.00            0.00        0   
5944907             0.0   C804681399      5969960.42      8622507.73        0   
837236              0.0   C425256616       737092.87       866840.81        0   
2636347             0.0  C1977002708            0.00       330022.08        0   

         isFlaggedFraud  
1294621               0  
2440721               0  
5944907               0  
837236

In [3]:
# Undersample the majority class: fraction of the non-fraud rows to discard
# (0.999 keeps roughly one non-fraud row in a thousand).
remove_fraction = 0.999

# Separate the fraud and non-fraud instances
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Randomly pick the non-fraud rows to discard (seeded, so the pick is repeatable)
non_fraud_to_remove = non_fraud_df.sample(frac=remove_fraction, random_state=42)

# Drop the sampled non-fraud instances from the DataFrame.
# NOTE: this rebinds `df`, so re-running this cell undersamples a second time.
df = df.drop(non_fraud_to_remove.index)

# Verify the new balance.  `.get(label, 0)` avoids a KeyError when one of the
# classes is absent from the remaining rows.
label_counts = df['isFraud'].value_counts()
fraud_count = label_counts.get(1, 0)
non_fraud_count = label_counts.get(0, 0)
total_rows = max(len(df), 1)  # guard the ratios against an empty frame
fraud_ratio = fraud_count / total_rows
non_fraud_ratio = non_fraud_count / total_rows

print(f"Fraud count: {fraud_count}")
print(f"Non-fraud count: {non_fraud_count}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")
print(df.shape)


Fraud count: 6453
Non-fraud count: 4994
Fraud ratio: 0.5637
Non-fraud ratio: 0.4363
(11447, 11)


In [4]:
# Encode the origin and destination account names as integer IDs.
# Each column gets its own encoder, so the two ID spaces are independent.
from sklearn.preprocessing import LabelEncoder

orig_encoder = LabelEncoder().fit(df['nameOrig'])
dest_encoder = LabelEncoder().fit(df['nameDest'])

df['nameOrig_encoded'] = orig_encoder.transform(df['nameOrig'])
df['nameDest_encoded'] = dest_encoder.transform(df['nameDest'])

In [5]:
# Preview the first rows, now including the two *_encoded ID columns.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,nameOrig_encoded,nameDest_encoded
425813,18,CASH_OUT,355357.81,C234632695,100.0,0.0,C401486812,189314.27,544672.08,0,0,7004,6615
4151754,303,CASH_OUT,29392.99,C1610490555,0.0,0.0,C1044130499,151664.72,197529.66,0,0,3712,218
6019708,456,TRANSFER,79717.14,C1726453809,79717.14,0.0,C1824298603,0.0,0.0,1,0,4399,4098
6020281,461,TRANSFER,873356.47,C1856023832,873356.47,0.0,C625389009,0.0,0.0,1,0,5166,7765
4388731,316,CASH_OUT,1825908.12,C1679134628,1825908.12,0.0,C1851310686,0.0,1825908.12,1,0,4118,4245


In [6]:
from sklearn.model_selection import train_test_split

# Number of distinct encoded IDs per column; used later as the sizes of the
# two embedding tables.
num_orig = df["nameOrig_encoded"].nunique()
num_dest = df["nameDest_encoded"].nunique()

# Integer ID tensors for the embedding lookups, plus float labels with a
# trailing singleton dimension so they line up with the model's (N, 1) output.
X_orig = torch.tensor(df["nameOrig_encoded"].to_numpy(), dtype=torch.long)
X_dest = torch.tensor(df["nameDest_encoded"].to_numpy(), dtype=torch.long)
y = torch.tensor(df["isFraud"].to_numpy(), dtype=torch.float32)[:, None]

# 80/20 train/test split, applied identically to all three tensors.
(
    X_orig_train, X_orig_test,
    X_dest_train, X_dest_test,
    y_train, y_test,
) = train_test_split(X_orig, X_dest, y, test_size=0.2, random_state=42)

# Wrap each split in a dataset and batch it; only the training batches are shuffled.
train_dataset = TensorDataset(X_orig_train, X_dest_train, y_train)
test_dataset = TensorDataset(X_orig_test, X_dest_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



In [16]:
class FraudModel(nn.Module):
    """Binary classifier over (origin, destination) account-ID pairs.

    Each ID is looked up in its own learnable embedding table. The two
    embeddings are concatenated and fed through a small fully connected
    network that ends in a sigmoid, so every row yields one value between
    0 and 1.

    Args:
        num_orig: Number of rows in the origin embedding table; origin IDs
            passed to ``forward`` must lie in ``[0, num_orig)``.
        num_dest: Number of rows in the destination embedding table;
            destination IDs must lie in ``[0, num_dest)``.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, num_orig, num_dest, embedding_dim=8, hidden_dim=16):
        super(FraudModel, self).__init__()

        # Learnable embeddings (no autoencoder needed)
        self.orig_embedding = nn.Embedding(num_orig, embedding_dim)
        self.dest_embedding = nn.Embedding(num_dest, embedding_dim)

        # Fully connected network.  The second layer consumes the first
        # layer's hidden_dim-wide output; sizing it from embedding_dim only
        # happens to work when embedding_dim * 2 == hidden_dim.
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, orig, dest):
        """Return a ``(batch, 1)`` tensor of sigmoid outputs.

        Args:
            orig: 1-D integer tensor of origin IDs.
            dest: 1-D integer tensor of destination IDs, same length as ``orig``.
        """
        orig_emb = self.orig_embedding(orig)
        dest_emb = self.dest_embedding(dest)
        # Concatenate along the feature axis -> (batch, 2 * embedding_dim)
        x = torch.cat((orig_emb, dest_emb), dim=1)
        return self.fc(x)





In [30]:
def train_model(model, train_loader, optimizer, epochs=10, criterion=None):
    """Train ``model`` in place and print the loss after every epoch.

    Args:
        model: Module called as ``model(orig, dest)``.
        train_loader: Iterable yielding ``(orig, dest, labels)`` batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
        criterion: Loss called as ``criterion(outputs, labels)``.  Defaults
            to ``nn.BCELoss()`` so the function no longer depends on a
            module-level ``criterion`` variable being defined beforehand.

    Returns:
        None.  The printed "Loss" is the sum of the per-batch losses for the
        epoch, not their mean, so it scales with the number of batches.
    """
    if criterion is None:
        criterion = nn.BCELoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for orig, dest, labels in train_loader:
            optimizer.zero_grad()  # clear gradients left over from the previous batch
            outputs = model(orig, dest)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Initialize and train model
# Seed PyTorch's global RNG first: it drives the random weight initialisation
# below and the batch order of a DataLoader created with shuffle=True and no
# explicit generator, so without it every run trains a different model.
torch.manual_seed(42)
model = FraudModel(num_orig, num_dest)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.BCELoss()  # Binary Cross Entropy
train_model(model, train_loader, optimizer)




Epoch 1, Loss: 99.2837
Epoch 2, Loss: 99.1317
Epoch 3, Loss: 98.9952
Epoch 4, Loss: 98.8762
Epoch 5, Loss: 98.7539
Epoch 6, Loss: 98.6346
Epoch 7, Loss: 98.6417
Epoch 8, Loss: 98.5868
Epoch 9, Loss: 98.4344
Epoch 10, Loss: 98.4432


In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader):
    """Score ``model`` on ``test_loader`` and print accuracy, precision, recall and F1.

    The model is switched to eval mode and run without gradient tracking.
    Outputs strictly above 0.5 count as class 1, everything else as class 0.
    The four metrics are printed; nothing is returned.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for orig, dest, labels in test_loader:
            hard_preds = model(orig, dest).gt(0.5).float()  # probabilities -> 0/1
            all_preds += list(hard_preds.cpu().numpy())
            all_labels += list(labels.cpu().numpy())

    # Evaluate all four metrics (in this order) before printing any of them.
    acc, precision, recall, f1 = (
        metric(all_labels, all_preds)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f"Test Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

# Run evaluation
# NOTE(review): in the recorded output recall is 1.0 and precision equals
# accuracy, which is consistent with the model predicting class 1 for every
# test row -- confirm before reading these numbers as real performance.
evaluate_model(model, test_loader)



Test Accuracy: 0.5555
Precision: 0.5555
Recall: 1.0000
F1-score: 0.7142
