In [None]:
import pandas as pd
from math import radians, cos, sin, asin, sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_fraud_data(file_path):
    """
    Preprocess the fraud dataset and split into training and validation sets.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        tuple: Processed training and validation sets (X_train, X_val, y_train, y_val).
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Convert 'trans_date_trans_time' to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calculate age using 'dob' and transaction year
    df['transaction_year'] = df['trans_date_trans_time'].dt.year
    df['year_of_birth'] = pd.to_datetime(df['dob']).dt.year
    df['age'] = df['transaction_year'] - df['year_of_birth']
    df.drop(columns=['dob', 'transaction_year', 'year_of_birth'], inplace=True)

    # Drop irrelevant columns
    irrelevant_columns = ['Unnamed: 0', 'cc_num', 'trans_num', 'street']
    df_cleaned = df.drop(columns=irrelevant_columns)

    # Haversine function to calculate distance
    def haversine(lat1, lon1, lat2, lon2):
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        r = 6371  # Radius of Earth in kilometers.
        return c * r

    # Calculate distance and add to the dataset
    df_cleaned['distance'] = df_cleaned.apply(
        lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

    # Create bins for latitude and longitude
    n_bins = 10
    df_cleaned['lat_bucket'] = pd.cut(df_cleaned['lat'], bins=n_bins, labels=False)
    df_cleaned['long_bucket'] = pd.cut(df_cleaned['long'], bins=n_bins, labels=False)
    df_cleaned['merch_lat_bucket'] = pd.cut(df_cleaned['merch_lat'], bins=n_bins, labels=False)
    df_cleaned['merch_long_bucket'] = pd.cut(df_cleaned['merch_long'], bins=n_bins, labels=False)

    # Encode categorical columns
    categorical_columns = ['merchant', 'category', 'gender', 'job']
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df_cleaned[col] = le.fit_transform(df_cleaned[col])
        label_encoders[col] = le

    # Drop columns that are no longer needed
    columns_to_drop = ['trans_date_trans_time', 'first', 'last', 'city', 'state', 'zip', 'lat', 'long', 'merch_lat',
                       'merch_long']
    df_cleaned = df_cleaned.drop(columns=columns_to_drop)

    # Separate features and target variable
    X = df_cleaned.drop(columns=['is_fraud'])
    y = df_cleaned['is_fraud']

    # Normalize numerical columns
    numerical_columns = ['amt', 'age', 'distance', 'lat_bucket', 'long_bucket', 'merch_lat_bucket', 'merch_long_bucket']
    scaler = StandardScaler()
    X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

    # Split into training and validation sets (fixed parameters)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_val, y_train, y_val, scaler

In [None]:
X_train, X_val, y_train, y_val, scaler = preprocess_fraud_data('fraudTrain.csv')

In [None]:
X_train

Unnamed: 0,merchant,category,amt,gender,city_pop,job,unix_time,age,distance,lat_bucket,long_bucket,merch_lat_bucket,merch_long_bucket
330201,340,13,-0.398220,0,1178,99,1338993811,1.263965,-0.481499,-1.315578,-0.157062,-1.405042,-0.119944
798518,476,0,-0.313013,0,85,390,1354562808,-0.634511,1.211505,2.240190,-2.252357,1.337741,-2.235703
1260375,571,5,-0.215206,0,24536,300,1370692963,-0.519452,-0.392995,-1.315578,-0.157062,-1.405042,-0.119944
412511,357,3,-0.175535,1,3096,423,1341539214,-0.116745,0.001850,1.351248,0.541370,1.337741,0.585309
344644,197,7,-0.081782,0,128354,452,1339371240,-1.094747,1.553005,0.462306,-0.157062,0.423480,-0.119944
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,590,2,0.012656,0,1478,310,1330829368,0.458551,1.056812,1.351248,-0.157062,1.337741,-0.119944
259178,191,9,-0.424294,0,5438,270,1336546444,0.516081,1.275304,0.462306,1.239802,0.423480,1.290563
131932,680,6,0.298903,0,310,439,1331675187,-0.289333,-1.389976,-0.426636,-0.855493,-0.490781,-0.825197
671155,409,12,-0.403896,1,140,149,1350120506,1.781731,1.388977,2.240190,-0.157062,1.337741,-0.825197


In [None]:
y_train

330201     0
798518     0
1260375    0
412511     0
344644     0
          ..
110268     0
259178     0
131932     0
671155     0
121958     0
Name: is_fraud, Length: 1037340, dtype: int64

In [None]:
y_train.sum()

5986

In [None]:
print(f'{(y_train.sum() / y_train.count()) * 100:.2f}%')

0.58%


In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Build a simple feed-forward neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=512
)

# Predict and evaluate on the validation set
y_val_pred_proba = model.predict(X_val)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

accuracy  = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, zero_division=0)
recall    = recall_score(y_val, y_val_pred, zero_division=0)
f1        = f1_score(y_val, y_val_pred, zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Report
print("\nDetailed classification report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9884 - loss: 768235.5625 - val_accuracy: 0.9941 - val_loss: 0.1179
Epoch 2/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9939 - loss: 823.1498 - val_accuracy: 0.9941 - val_loss: 0.0539
Epoch 3/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9941 - loss: 134.2233 - val_accuracy: 0.9941 - val_loss: 0.0395
Epoch 4/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9942 - loss: 53.7900 - val_accuracy: 0.9941 - val_loss: 0.0363
Epoch 5/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9943 - loss: 12.4795 - val_accuracy: 0.9941 - val_loss: 0.0360
Epoch 6/10
[1m2027/2027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.9943 - loss: 1.5541 - val_accuracy: 0.9941 - val_loss: 0.0360
Epo

A 0.00 recall in class 1 means all actual frauds were missed.

Oversampling / Undersampling;
Adjust the Threshold;
Gather More Fraud Examples