In [1]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pandas as pd
from math import radians, cos, sin, asin, sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
def preprocess_fraud_data(file_path):
    """
    Preprocess the fraud dataset and split into training and validation sets.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        tuple: Processed training and validation sets (X_train, X_val, y_train, y_val).
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Convert 'trans_date_trans_time' to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calculate age using 'dob' and transaction year
    df['transaction_year'] = df['trans_date_trans_time'].dt.year
    df['year_of_birth'] = pd.to_datetime(df['dob']).dt.year
    df['age'] = df['transaction_year'] - df['year_of_birth']
    df.drop(columns=['dob', 'transaction_year', 'year_of_birth'], inplace=True)

    # Drop irrelevant columns
    irrelevant_columns = ['Unnamed: 0', 'cc_num', 'trans_num', 'street']
    df_cleaned = df.drop(columns=irrelevant_columns)

    # Haversine function to calculate distance
    def haversine(lat1, lon1, lat2, lon2):
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        r = 6371  # Radius of Earth in kilometers.
        return c * r

    # Calculate distance and add to the dataset
    df_cleaned['distance'] = df_cleaned.apply(
        lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

    # Create bins for latitude and longitude
    n_bins = 10
    df_cleaned['lat_bucket'] = pd.cut(df_cleaned['lat'], bins=n_bins, labels=False)
    df_cleaned['long_bucket'] = pd.cut(df_cleaned['long'], bins=n_bins, labels=False)
    df_cleaned['merch_lat_bucket'] = pd.cut(df_cleaned['merch_lat'], bins=n_bins, labels=False)
    df_cleaned['merch_long_bucket'] = pd.cut(df_cleaned['merch_long'], bins=n_bins, labels=False)

    # Encode categorical columns
    categorical_columns = ['merchant', 'category', 'gender', 'job']
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df_cleaned[col] = le.fit_transform(df_cleaned[col])
        label_encoders[col] = le

    # Drop columns that are no longer needed
    columns_to_drop = ['trans_date_trans_time', 'first', 'last', 'city', 'state', 'zip', 'lat', 'long', 'merch_lat',
                       'merch_long']
    df_cleaned = df_cleaned.drop(columns=columns_to_drop)

    # Separate features and target variable
    X = df_cleaned.drop(columns=['is_fraud'])
    y = df_cleaned['is_fraud']

    # Normalize numerical columns
    numerical_columns = ['amt', 'age', 'distance', 'lat_bucket', 'long_bucket', 'merch_lat_bucket', 'merch_long_bucket']
    scaler = StandardScaler()
    X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

    # Split into training and validation sets (fixed parameters)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_val, y_train, y_val, scaler

In [3]:
X_train, X_val, y_train, y_val, scaler = preprocess_fraud_data('fraudTrain.csv')

In [8]:
# Build a simple feed-forward neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  
    Dense(32, activation='relu'),
    Dropout(0.2),  
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']  
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=512
)

# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Report
print("\nDetailed classification report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.9790 - loss: 1127733.7500 - val_accuracy: 0.9941 - val_loss: 0.1461
Epoch 2/5
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9937 - loss: 4598.8525 - val_accuracy: 0.9941 - val_loss: 0.0581
Epoch 3/5
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9940 - loss: 792.0883 - val_accuracy: 0.9941 - val_loss: 0.0404
Epoch 4/5
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9943 - loss: 265.6897 - val_accuracy: 0.9941 - val_loss: 0.0365
Epoch 5/5
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9942 - loss: 192.6324 - val_accuracy: 0.9941 - val_loss: 0.0360
[1m8105/8105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 404us/step
Accuracy:  0.9941
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000

Detailed classification repo

A 0.00 recall in class 1 means all actual frauds were missed.

Oversampling / Undersampling;
Adjust the Threshold;
Gather More Fraud Examples

# random oversampling

In [11]:
from imblearn.over_sampling import RandomOverSampler
# define oversampling strategy
ros = RandomOverSampler(random_state=42)
# fit and apply the transform
X_over, y_over = ros.fit_resample(X_train, y_train)

In [40]:
# Build a simple feed-forward neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_over.shape[1],)),
    Dropout(0.2),  
    Dense(32, activation='relu'),
    Dropout(0.2),  
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']  
)

# Train the model
history = model.fit(
    X_over,
    y_over,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=512
)

# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Report
print("\nDetailed classification report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.4993 - loss: 3170104.0000 - val_accuracy: 0.9941 - val_loss: 0.6914
Epoch 2/5
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.4996 - loss: 399.2000 - val_accuracy: 0.9941 - val_loss: 0.6916
Epoch 3/5
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.5004 - loss: 42.1779 - val_accuracy: 0.0059 - val_loss: 0.6936
Epoch 4/5
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.4996 - loss: 16.9885 - val_accuracy: 0.0059 - val_loss: 0.6964
Epoch 5/5
[1m4029/4029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.5000 - loss: 12.1071 - val_accuracy: 0.0059 - val_loss: 0.6941
[1m8105/8105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 433us/step
Accuracy:  0.0059
Precision: 0.0059
Recall:    1.0000
F1 Score:  0.0117

Detailed classification report:


In [43]:
from tensorflow.keras.layers import BatchNormalization

# Build a deeper feed-forward neural network
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_over.shape[1],)),
    BatchNormalization(),  # Normalization to stabilize training
    Dropout(0.3),  

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),  

    Dense(32, activation='relu'),
    Dropout(0.3),

    Dense(1, activation='sigmoid')  # Output layer
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=5e-4),  # Reduce learning rate
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model with Early Stopping
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_over,
    y_over,
    validation_data=(X_val, y_val),
    epochs=50,  # Increased from 5
    batch_size=256,  # Reduced batch size for better updates
    callbacks=[early_stopping]
)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.5226 - loss: 0.7146 - val_accuracy: 0.0061 - val_loss: 1.2915
Epoch 2/50
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.5380 - loss: 0.6871 - val_accuracy: 0.0059 - val_loss: 12.5662
Epoch 3/50
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.5391 - loss: 0.6867 - val_accuracy: 0.0059 - val_loss: 1.3595
Epoch 4/50
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.5409 - loss: 0.6864 - val_accuracy: 0.3168 - val_loss: 1.5131
Epoch 5/50
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.5411 - loss: 0.6861 - val_accuracy: 0.0059 - val_loss: 0.8728
Epoch 6/50
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.5414 - loss: 0.6858 - val_accuracy: 0.3678 - val_loss: 0.6872
Epoch 7/50
[1m8058/

In [45]:
# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

# Compute evaluation metrics
accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)
auc_roc   = roc_auc_score(y_val, y_val_pred_proba)
pr_auc    = average_precision_score(y_val, y_val_pred_proba)

# Print results
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC-ROC Score: {auc_roc:.4f}")
print(f"Precision-Recall AUC: {pr_auc:.4f}")

# Print classification report
print("\nDetailed Classification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


[1m8105/8105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 491us/step
Accuracy:  0.9588
Precision: 0.0085
Recall:    0.0520
F1 Score:  0.0146
AUC-ROC Score: 0.5318
Precision-Recall AUC: 0.0069

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98    257815
           1       0.01      0.05      0.01      1520

    accuracy                           0.96    259335
   macro avg       0.50      0.51      0.50    259335
weighted avg       0.99      0.96      0.97    259335



In [47]:
from tensorflow.keras.layers import LSTM, Embedding, Flatten

# Reshape data for LSTM (if needed)
X_train_lstm = X_over.values.reshape((X_over.shape[0], X_over.shape[1], 1))
X_val_lstm = X_val.values.reshape((X_val.shape[0], X_val.shape[1], 1))

# Build LSTM Model
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_over.shape[1], 1)),
    Dropout(0.3),

    LSTM(64, return_sequences=False),
    Dropout(0.3),

    Dense(32, activation='relu'),
    Dropout(0.3),

    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train_lstm, y_over,
    validation_data=(X_val_lstm, y_val),
    epochs=20,
    batch_size=256,
    callbacks=[early_stopping]
)


Epoch 1/20


  super().__init__(**kwargs)


[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 32ms/step - accuracy: 0.8984 - loss: 0.2386 - val_accuracy: 0.9542 - val_loss: 0.0903
Epoch 2/20
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 32ms/step - accuracy: 0.9627 - loss: 0.0896 - val_accuracy: 0.9552 - val_loss: 0.0839
Epoch 3/20
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 32ms/step - accuracy: 0.9727 - loss: 0.0714 - val_accuracy: 0.9572 - val_loss: 0.0942
Epoch 4/20
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 32ms/step - accuracy: 0.9804 - loss: 0.0562 - val_accuracy: 0.9750 - val_loss: 0.0683
Epoch 5/20
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 33ms/step - accuracy: 0.9848 - loss: 0.0470 - val_accuracy: 0.9801 - val_loss: 0.0545
Epoch 6/20
[1m8058/8058[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 33ms/step - accuracy: 0.9880 - loss: 0.0395 - val_accuracy: 0.9845 - val_loss: 0.0468
Epoch 7/2

In [51]:
# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val_lstm)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

# Compute evaluation metrics
accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)


# Print results
print("🔹 LSTM Model Results:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


print("\nDetailed Classification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


[1m8105/8105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step
🔹 LSTM Model Results:
Accuracy:  0.9930
Precision: 0.4463
Recall:    0.8395
F1 Score:  0.5828

Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    257815
           1       0.45      0.84      0.58      1520

    accuracy                           0.99    259335
   macro avg       0.72      0.92      0.79    259335
weighted avg       1.00      0.99      0.99    259335



In [57]:
model.save("fraud_detection_lstm.keras")

In [59]:
from tensorflow.keras.models import load_model

# Load the model in the new Keras format
loaded_model = load_model("fraud_detection_lstm.keras")


  saveable.load_own_variables(weights_store.get(inner_path))
