# Load necessary libraries

In [56]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pandas as pd
from math import radians, cos, sin, asin, sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import RandomOverSampler

# Preprocessing

In [39]:
def preprocess_fraud_data(file_path, train=True, scaler=None, label_encoders=None, test_size=0.2, random_state=42):
    """
    Preprocess the fraud dataset for training or testing.

    Parameters:
        file_path (str): Path to the input CSV file.
        train (bool): If True, fit scaler and encoders; if False, use provided ones.
        scaler (StandardScaler): Fitted scaler from training data.
        label_encoders (dict): Dictionary of fitted LabelEncoders from training.
        test_size (float): Fraction of data for validation (if training).
        random_state (int): Random seed for reproducibility.

    Returns:
        If train=True:
            tuple: (X_train, X_val, y_train, y_val, fitted_scaler, fitted_label_encoders)
        If train=False:
            tuple: (X_test, y_test)
    """
    # Load dataset
    df = pd.read_csv(file_path)

    # Convert 'trans_date_trans_time' to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calculate age
    df['transaction_year'] = df['trans_date_trans_time'].dt.year
    df['year_of_birth'] = pd.to_datetime(df['dob']).dt.year
    df['age'] = df['transaction_year'] - df['year_of_birth']
    df.drop(columns=['dob', 'transaction_year', 'year_of_birth'], inplace=True)

    # Drop irrelevant columns
    irrelevant_columns = ['Unnamed: 0', 'cc_num', 'trans_num', 'street']
    df_cleaned = df.drop(columns=irrelevant_columns)

    # Haversine function to calculate distance
    def haversine(lat1, lon1, lat2, lon2):
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        r = 6371  # Earth radius in km
        return c * r

    # Compute distance feature
    df_cleaned['distance'] = df_cleaned.apply(
        lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

    # Create latitude and longitude bins
    n_bins = 10
    df_cleaned['lat_bucket'] = pd.cut(df_cleaned['lat'], bins=n_bins, labels=False)
    df_cleaned['long_bucket'] = pd.cut(df_cleaned['long'], bins=n_bins, labels=False)
    df_cleaned['merch_lat_bucket'] = pd.cut(df_cleaned['merch_lat'], bins=n_bins, labels=False)
    df_cleaned['merch_long_bucket'] = pd.cut(df_cleaned['merch_long'], bins=n_bins, labels=False)

    # Encode categorical columns
    categorical_columns = ['merchant', 'category', 'gender', 'job']
    
    if train:
        label_encoders = {}
        for col in categorical_columns:
            le = LabelEncoder()
            df_cleaned[col] = le.fit_transform(df_cleaned[col])
            label_encoders[col] = le  # Save encoder for future use
    else:
        # Apply pre-fitted encoders
        for col in categorical_columns:
            df_cleaned[col] = label_encoders[col].transform(df_cleaned[col])

    # Drop unnecessary columns
    columns_to_drop = ['trans_date_trans_time', 'first', 'last', 'city', 'state', 'zip', 'lat', 'long', 'merch_lat', 'merch_long']
    df_cleaned = df_cleaned.drop(columns=columns_to_drop)

    # Separate features and target
    X = df_cleaned.drop(columns=['is_fraud'])
    y = df_cleaned['is_fraud']

    # Normalize numerical columns
    numerical_columns = ['amt', 'age', 'distance', 'lat_bucket', 'long_bucket', 'merch_lat_bucket', 'merch_long_bucket']
    
    if train:
        scaler = StandardScaler()
        X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

        # Split into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_val, y_train, y_val, scaler, label_encoders  # Return fitted encoders & scaler
    else:
        X[numerical_columns] = scaler.transform(X[numerical_columns])
        return X, y  # Return processed test data


# simple feed-forward neural network

In [41]:
# Preprocess training data
X_train, X_val, y_train, y_val, scaler, label_encoders = preprocess_fraud_data("fraudTrain.csv", train=True)

In [44]:
# Build a simple feed-forward neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  
    Dense(32, activation='relu'),
    Dropout(0.2),  
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']  
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=512
)

# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Report
print("\nDetailed classification report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9890 - loss: 414693.1250 - val_accuracy: 0.9941 - val_loss: 0.1236
Epoch 2/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9937 - loss: 557.9031 - val_accuracy: 0.9941 - val_loss: 0.0551
Epoch 3/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9941 - loss: 199.4371 - val_accuracy: 0.9941 - val_loss: 0.0398
Epoch 4/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9942 - loss: 47.4863 - val_accuracy: 0.9941 - val_loss: 0.0364
Epoch 5/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9943 - loss: 12.0252 - val_accuracy: 0.9941 - val_loss: 0.0360
Epoch 6/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9942 - loss: 18.0478 - val_accuracy: 0.9941 - val_loss: 0.0360
Epoch 7/10
[1m

A 0.00 recall in class 1 means all actual frauds were missed.

Oversampling / Undersampling;
Adjust the Threshold;
Gather More Fraud Examples

# simple feed-forward neural network with random oversampling

In [31]:
# define oversampling strategy
ros = RandomOverSampler(random_state=42)
# fit and apply the transform
X_over, y_over = ros.fit_resample(X_train, y_train)

In [49]:
# Build a simple feed-forward neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_over.shape[1],)),
    Dropout(0.2),  
    Dense(32, activation='relu'),
    Dropout(0.2),  
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']  
)

# Train the model
history = model.fit(
    X_over,
    y_over,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=512
)

# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Report
print("\nDetailed classification report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.4994 - loss: 1752860.1250 - val_accuracy: 0.0059 - val_loss: 0.6936
Epoch 2/10
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.5001 - loss: 227.9364 - val_accuracy: 0.0059 - val_loss: 0.6933
Epoch 3/10
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.5007 - loss: 23.1056 - val_accuracy: 0.9941 - val_loss: 0.6867
Epoch 4/10
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.5004 - loss: 6.5733 - val_accuracy: 0.0059 - val_loss: 0.6997
Epoch 5/10
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.5002 - loss: 4.8481 - val_accuracy: 0.9941 - val_loss: 0.6901
Epoch 6/10
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.4999 - loss: 1.3183 - val_accuracy: 0.0059 - val_loss: 0.6959
Epoc

# LSTM with oversampling

In [58]:
from tensorflow.keras.layers import LSTM, Embedding, Flatten
from tensorflow.keras.callbacks import EarlyStopping

# Reshape data for LSTM
X_train_lstm = X_over.values.reshape((X_over.shape[0], X_over.shape[1], 1))
X_val_lstm = X_val.values.reshape((X_val.shape[0], X_val.shape[1], 1))

# Build LSTM Model
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_over.shape[1], 1)),
    Dropout(0.3),

    LSTM(64, return_sequences=False),
    Dropout(0.3),

    Dense(32, activation='relu'),
    Dropout(0.3),

    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Define early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=2,          # Stop training after 3 epochs of no improvement
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train_lstm, y_over,
    validation_data=(X_val_lstm, y_val),
    epochs=10,
    batch_size=256,
    callbacks=[early_stopping]
)


  super().__init__(**kwargs)


Epoch 1/10
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m460s[0m 56ms/step - accuracy: 0.8986 - loss: 0.2369 - val_accuracy: 0.9380 - val_loss: 0.1166
Epoch 2/10
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m499s[0m 56ms/step - accuracy: 0.9608 - loss: 0.0911 - val_accuracy: 0.9242 - val_loss: 0.1386
Epoch 3/10
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m485s[0m 54ms/step - accuracy: 0.9703 - loss: 0.0746 - val_accuracy: 0.9609 - val_loss: 0.0785
Epoch 4/10
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 53ms/step - accuracy: 0.9776 - loss: 0.0609 - val_accuracy: 0.9702 - val_loss: 0.0703
Epoch 5/10
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 53ms/step - accuracy: 0.9830 - loss: 0.0499 - val_accuracy: 0.9779 - val_loss: 0.0556
Epoch 6/10
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 56ms/step - accuracy: 0.9867 - loss: 0.0425 - val_accuracy: 0.9828 - val_loss: 0.048

In [63]:
# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val_lstm)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

# Compute evaluation metrics
accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)


# Print results
print("🔹 LSTM Model Results:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


print("\nDetailed Classification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


[1m8105/8105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step
🔹 LSTM Model Results:
Accuracy:  0.9893
Precision: 0.3420
Recall:    0.8855
F1 Score:  0.4934

Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    257815
           1       0.34      0.89      0.49      1520

    accuracy                           0.99    259335
   macro avg       0.67      0.94      0.74    259335
weighted avg       1.00      0.99      0.99    259335



In [65]:
model.save("fraud_detection_lstm.keras")

# Test

In [67]:
from tensorflow.keras.models import load_model

# Load the model in the new Keras format
loaded_model = load_model("fraud_detection_lstm.keras")


In [69]:
# Ensure validation data is reshaped correctly
X_val_lstm = X_val.values.reshape((X_val.shape[0], X_val.shape[1], 1))

# Predict fraud probabilities
y_val_pred_proba = loaded_model.predict(X_val_lstm)

# Convert probabilities to binary predictions (0 or 1)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

# Evaluate performance
print(classification_report(y_val, y_val_pred, zero_division=0))

[1m8105/8105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    257815
           1       0.34      0.89      0.49      1520

    accuracy                           0.99    259335
   macro avg       0.67      0.94      0.74    259335
weighted avg       1.00      0.99      0.99    259335

