In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gc

print("Starting fraud detection model training with creditcard.csv...")

# Step 1: Load the Data
# Read the raw transactions into `df`. Every later step depends on it, so a
# missing file is treated as fatal.
try:
    df = pd.read_csv(r'/content/creditcard.csv')
except FileNotFoundError as e:
    print(f"Error loading data: {e}. Please ensure 'creditcard.csv' is in the specified '/content/' directory.")
    # `exit()` is an interactive-shell helper injected by the `site` module and,
    # called without an argument, reports success (status 0). Raise SystemExit
    # with a non-zero status so the failure is visible to whatever ran this.
    raise SystemExit(1) from e
else:
    print("creditcard.csv loaded successfully.")
# Step 1.1: Optimize memory by downcasting numerical columns
# This function reduces memory usage by converting numerical columns to smaller data types.
def downcast_df(df):
    """Shrink 64-bit numeric columns of ``df`` in place and return ``df``.

    ``int64`` columns go through ``pd.to_numeric(..., downcast='integer')`` and
    ``float64`` columns through ``pd.to_numeric(..., downcast='float')``.
    Columns of any other dtype are not touched.
    """
    # (dtype to look for, pandas downcast target), processed in this order
    downcast_plan = (('int64', 'integer'), ('float64', 'float'))
    for wide_dtype, target in downcast_plan:
        wide_columns = df.select_dtypes(include=[wide_dtype]).columns
        for name in wide_columns:
            df[name] = pd.to_numeric(df[name], downcast=target)
    return df

df = downcast_df(df)
print("Numerical columns downcasted for memory optimization.")

# The split below stratifies on 'Class', which needs a label on every row,
# so rows whose 'Class' is NaN are discarded first.
original_rows = len(df)
df.dropna(subset=['Class'], inplace=True)
rows_after_na_drop = len(df)
if rows_after_na_drop < original_rows:
    print(f"Removed {original_rows - rows_after_na_drop} rows with NaN values in 'Class' column.")


# Step 1.2: Initial split into training and test sets
# The single CSV is divided 80/20, stratified on the label; the 20% part is
# kept aside for final prediction / submission generation.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Class'],
)

# Remember which original rows ended up in the test part (used as submission IDs)
test_ids = test_df.index.copy()

# The full frame is no longer needed once both parts exist
del df
gc.collect()

print(f"Initial split complete. Train set shape: {train_df.shape}, Test set shape: {test_df.shape}")

# Step 2: Preprocessing
# 2.1: Remove features with high missing values (>60%) - Less relevant for creditcard.csv
# The missing-value ratio is measured on the training frame only; the same
# columns are then dropped from both frames so they stay aligned.
missing_percent = train_df.isnull().mean()
high_missing_cols = missing_percent[missing_percent > 0.6].index.tolist()
# 'Class' is the target, so ensure it's not removed
if 'Class' in high_missing_cols:
    high_missing_cols.remove('Class')
train_df.drop(columns=high_missing_cols, inplace=True)
test_df.drop(columns=high_missing_cols, inplace=True)
print(f"Removed {len(high_missing_cols)} columns with >60% missing values (if any).")
print(f"Train shape after dropping high missing cols: {train_df.shape}, Test shape: {test_df.shape}")

# 2.2: Define categorical and numerical columns for processing
# No categorical columns are declared for this dataset, so the encoder
# registry stays empty as well.
categorical_cols = [] # No categorical columns in creditcard.csv
label_encoders = {} # No label encoders needed

# Numerical columns: all columns of the listed dtypes except the target 'Class'
numerical_cols = train_df.select_dtypes(include=['float32', 'float64', 'int8', 'int16', 'int32']).columns.tolist()
if 'Class' in numerical_cols:
    numerical_cols.remove('Class')

print(f"Identified {len(categorical_cols)} categorical columns and {len(numerical_cols)} numerical columns.")

# 2.3: Handle missing values (mostly for numerical)
# The fill value is the TRAINING median for both frames: imputing the test
# frame with its own median would derive a preprocessing statistic from
# held-out data and make train/test preprocessing inconsistent.
for col in numerical_cols:
    train_median = train_df[col].median()
    train_df[col] = train_df[col].fillna(train_median)
    test_df[col] = test_df[col].fillna(train_median)
    # Still possible if the training column is entirely NaN (median is NaN)
    if train_df[col].isnull().any() or test_df[col].isnull().any():
        print(f"Warning: Missing values still exist in numerical column {col} after median fill.")
print("Missing values handled for numerical columns.")

# Step 3: Feature Engineering
# 3.1: Log transform Amount
# log1p of 'Amount' is stored as 'LogAmount' on both frames and takes the
# place of 'Amount' in the list of model features.
if 'Amount' not in train_df.columns:
    print("Warning: 'Amount' column not found for log transformation.")
else:
    for frame in (train_df, test_df):
        frame['LogAmount'] = np.log1p(frame['Amount'])
    if 'Amount' in numerical_cols:
        numerical_cols.remove('Amount')
    numerical_cols.append('LogAmount')
    print("Log transformed 'Amount' to 'LogAmount'.")

# 'Time' is kept in the frames but excluded from the model features; using it
# would call for dedicated handling (e.g. cyclical encoding) that this script
# does not do.
if 'Time' in numerical_cols:
    numerical_cols.remove('Time')
    print("Removed 'Time' column from numerical features for model input.")


# Keep only feature names that are actual columns of train_df and are not the target
usable_columns = set(train_df.columns) - {'Class'}
numerical_cols = [name for name in numerical_cols if name in usable_columns]
categorical_cols = [name for name in categorical_cols if name in usable_columns]


# Step 4: Feature Preparation for Transformer
# 4.1: Define feature set for the model
# The feature set is the numerical columns followed by the (empty) categorical ones.
features = numerical_cols + categorical_cols # categorical_cols will be empty
# Take explicit copies: X and X_test_full are assigned into below, and writing
# into a column slice of train_df/test_df triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
X = train_df[features].copy()
y = train_df['Class'] # Target variable for creditcard.csv is 'Class'
X_test_full = test_df[features].copy() # The full test set for final predictions

print(f"Final features selected for model: {features}")
print(f"Number of numerical features: {len(numerical_cols)}")
print(f"Number of categorical features: {len(categorical_cols)}")

# 4.2: Scale numerical features
# The scaler is fitted on the training features only and the same fitted
# transform is applied to the held-out features.
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
X_test_full[numerical_cols] = scaler.transform(X_test_full[numerical_cols])
print("Numerical features scaled using StandardScaler.")

# Step 4.5: Feature Importance with L1-Regularized Logistic Regression
# An L1 penalty can shrink coefficients exactly to zero, so the surviving
# coefficients give a rough ranking of the features.
print("\n--- Calculating Feature Importance using L1-Regularized Logistic Regression ---")
# The regression is fitted on its own copy of the scaled training features
X_l1_reg = X.copy()
log_reg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=0.1,  # inverse regularization strength
    random_state=42,
)
log_reg_l1.fit(X_l1_reg, y)

# One row per feature, ranked by coefficient magnitude (largest first)
l1_coefficients = log_reg_l1.coef_[0]
feature_importance = (
    pd.DataFrame({'Feature': features, 'Coefficient': l1_coefficients})
    .assign(Absolute_Coefficient=lambda table: table['Coefficient'].abs())
    .sort_values(by='Absolute_Coefficient', ascending=False)
)
print("Top 20 Feature Importances (by absolute coefficient):")
print(feature_importance.head(20).to_string(index=False))
print("--- End Feature Importance ---")


# 4.3 / 4.4: Train/val/test split (70/15/15), then SMOTE on the training part only
# SMOTE (Synthetic Minority Over-sampling Technique) synthesises minority-class
# rows by interpolating between existing ones. Oversampling BEFORE splitting
# would scatter synthetic rows (and the real rows they were interpolated from)
# across train, validation and test, so the validation/test scores would be
# measured on data the model has effectively seen. The split therefore comes
# first and only the training partition is resampled.
print("Applying SMOTE to address class imbalance...")

X_train_split, X_temp_resampled, y_train_split, y_temp_resampled = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# NOTE: the "_resampled" suffix on the validation/test names is kept only so
# that later code keeps working; these two partitions contain real rows only.
X_val_resampled, X_test_resampled, y_val_resampled, y_test_resampled = train_test_split(
    X_temp_resampled, y_temp_resampled, test_size=0.5, random_state=42, stratify=y_temp_resampled
)

# sampling_strategy=0.05 asks for a minority:majority ratio of 0.05 after resampling
smote = SMOTE(random_state=42, sampling_strategy=0.05)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_split, y_train_split)
# Aliases kept for any later code that refers to the resampled pool by these names
X_resampled, y_resampled = X_train_resampled, y_train_resampled
print(f"Original class distribution: {y_train_split.value_counts()}")
print(f"Resampled class distribution: {y_train_resampled.value_counts()}")

print(f"Resampled Train set shape: {X_train_resampled.shape}, Validation set shape: {X_val_resampled.shape}, Test set shape: {X_test_resampled.shape}")

# Convert each feature frame to a float32 numpy matrix holding the numerical
# columns, in the order: train, validation, test, full held-out test.
X_train_num, X_val_num, X_test_num, X_test_full_num = (
    frame[numerical_cols].values.astype(np.float32)
    for frame in (X_train_resampled, X_val_resampled, X_test_resampled, X_test_full)
)

# There are no categorical features, so each split gets its own empty list
X_train_cat, X_val_cat, X_test_cat, X_test_full_cat = [], [], [], []


# Step 5: Build Transformer Model
# This section defines the Transformer architecture for tabular data.
# It uses embedding layers for categorical features (if any) and combines them with numerical features.
# A custom Transformer block is defined for multi-head self-attention.

def create_transformer_model(
    num_numerical_features,
    categorical_features_info, # List of (col_name, num_unique_values) for embeddings
    embedding_dim=8,
    num_heads=1,
    ff_dim=32,
    num_transformer_blocks=1,
    mlp_units=(16,), # Immutable default: a list default would be shared across calls
    dropout_rate=0.7
):
    """Build an (uncompiled) Transformer-style binary classifier for tabular data.

    All numerical features are projected together into ONE ``embedding_dim``-wide
    token by a single Dense layer; every categorical feature adds one further
    token through its own Embedding layer. The tokens are stacked into a
    sequence, a learned positional embedding is added, the sequence passes
    through ``num_transformer_blocks`` attention + feed-forward blocks, is
    average-pooled over the token axis and fed to an MLP head ending in a
    single sigmoid unit.

    Args:
        num_numerical_features: Width of the numerical input vector.
        categorical_features_info: Sequence of ``(col_name, num_unique_values)``
            pairs, one per categorical feature; may be empty. ``len()`` is
            taken on it, so it must be a sized sequence.
        embedding_dim: Width of every token and of the attention ``key_dim``.
        num_heads: Number of attention heads per block.
        ff_dim: Hidden width of the feed-forward sub-layer in each block.
        num_transformer_blocks: Number of stacked attention/feed-forward blocks.
        mlp_units: Iterable of hidden-layer widths for the classification head;
            it is only iterated, never modified.
        dropout_rate: Rate used by every Dropout layer in the model.

    Returns:
        A ``keras.Model`` whose inputs are the numerical input followed by one
        ``int32`` input of shape ``(1,)`` per categorical feature, and whose
        output is a single sigmoid activation.
    """
    # Input for numerical features
    numerical_input = keras.Input(shape=(num_numerical_features,), name="numerical_input")

    # Inputs for categorical features and their embeddings
    categorical_inputs = [] # Keras Input layers that become extra model inputs
    all_feature_embeddings_for_stack = [] # One (batch, embedding_dim) tensor per token

    # The whole numerical vector becomes a single token
    numerical_feature_projected = layers.Dense(embedding_dim, activation='relu', name="numerical_projection")(numerical_input)
    all_feature_embeddings_for_stack.append(numerical_feature_projected)

    # One input + embedding per categorical feature (loop body is skipped when the list is empty)
    for col_name, num_unique_values in categorical_features_info:
        cat_input = keras.Input(shape=(1,), name=f"cat_input_{col_name}", dtype=tf.int32)
        categorical_inputs.append(cat_input) # Add to the list of model inputs

        embedding = layers.Embedding(
            input_dim=num_unique_values, # Valid category ids are 0 .. num_unique_values - 1
            output_dim=embedding_dim,
            name=f"embedding_{col_name}"
        )(cat_input)
        # Embedding output is (batch, 1, embedding_dim); flatten to (batch, embedding_dim)
        all_feature_embeddings_for_stack.append(layers.Flatten()(embedding))

    # Stack the tokens along a new axis 1 to form the transformer's input sequence.
    # Shape: (batch_size, num_tokens, embedding_dim)
    transformer_input = layers.Lambda(lambda x: tf.stack(x, axis=1))(all_feature_embeddings_for_stack)

    # Add positional embeddings (learned)
    # NOTE(review): with no categorical features num_tokens is 1, so every
    # attention block below attends over a single position — confirm this is intended.
    num_tokens = 1 + len(categorical_features_info)
    positional_embedding_layer = layers.Embedding(num_tokens, embedding_dim)
    positions = tf.range(start=0, limit=num_tokens, delta=1)
    positional_embeddings = positional_embedding_layer(positions)
    x = transformer_input + positional_embeddings # Broadcast over the batch axis

    # Transformer Blocks
    for _ in range(num_transformer_blocks):
        # Multi-head self-attention (query and value are both x)
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(x, x)
        attn_output = layers.Dropout(dropout_rate)(attn_output)
        attn_output = layers.LayerNormalization(epsilon=1e-6)(x + attn_output) # Add & Norm

        # Feed-forward network applied to each token
        ffn_output = layers.Dense(ff_dim, activation="relu")(attn_output)
        ffn_output = layers.Dense(embedding_dim)(ffn_output) # Project back to embedding_dim
        ffn_output = layers.Dropout(dropout_rate)(ffn_output)
        x = layers.LayerNormalization(epsilon=1e-6)(attn_output + ffn_output) # Add & Norm

    # Average over the token axis. Shape: (batch_size, embedding_dim)
    x = layers.GlobalAveragePooling1D()(x)

    # MLP for classification
    for units in mlp_units:
        x = layers.Dense(units, activation="relu")(x)
        x = layers.Dropout(dropout_rate)(x)

    # Output layer: probability of the positive class
    output = layers.Dense(1, activation="sigmoid", name="output")(x)

    # Define the model with all inputs (numerical first, then categorical in the given order)
    model = keras.Model(inputs=[numerical_input] + categorical_inputs, outputs=output)
    return model

# No categorical features are declared, so the embedding spec handed to the
# model builder is empty
categorical_features_info = []


# Build the network, sized by the number of numerical model features
num_numerical_features = len(numerical_cols)
transformer_model = create_transformer_model(
    num_numerical_features=num_numerical_features,
    categorical_features_info=categorical_features_info,
)

# Adam at 1e-3 with binary cross-entropy; the listed metrics are reported per epoch
adam_optimizer = keras.optimizers.Adam(learning_rate=1e-3)
tracked_metrics = [
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    'accuracy',
]
transformer_model.compile(
    optimizer=adam_optimizer,
    loss="binary_crossentropy",
    metrics=tracked_metrics,
)

transformer_model.summary()
print("Transformer model built and compiled.")

# Model inputs per split: the numerical matrix first, then any categorical arrays
train_inputs = [X_train_num, *X_train_cat]
val_inputs = [X_val_num, *X_val_cat]
test_inputs = [X_test_num, *X_test_cat]
full_test_inputs = [X_test_full_num, *X_test_full_cat]


# Step 6: Train Transformer Model
print("\n--- Training Transformer Model ---")
# Watch validation AUC (higher is better) with a patience of 10 epochs and
# restore the best weights seen when training stops early
early_stopping = keras.callbacks.EarlyStopping(
    mode='max',
    monitor='val_auc',
    patience=10,
    restore_best_weights=True,
)

validation_pair = (val_inputs, y_val_resampled)
history = transformer_model.fit(
    x=train_inputs,
    y=y_train_resampled,
    batch_size=512,
    epochs=40,
    validation_data=validation_pair,
    callbacks=[early_stopping],
    verbose=1,
)
print("Transformer model training complete.")

# Step 7: Evaluate on Test Set and Train Set
# Function to calculate all required metrics including G-mean
def calculate_metrics(y_true, y_pred_probs, threshold=0.5, name=""):
    """Print and return binary-classification metrics for predicted probabilities.

    Args:
        y_true: Array-like of true labels encoded as 0 (negative) / 1 (positive);
            float encodings such as 0.0 / 1.0 are accepted and cast to int.
        y_pred_probs: Array-like of predicted positive-class probabilities,
            same length as ``y_true``.
        threshold: Probabilities ``>= threshold`` are predicted as class 1.
        name: Label of the evaluated split, used only in the printed header.

    Returns:
        Tuple ``(precision, recall, f1, auc, accuracy, specificity, g_mean)``.
        ``auc`` is NaN when it cannot be computed (e.g. ``y_true`` contains a
        single class).
    """
    # Normalise to flat numpy arrays so lists, Series and (n, 1) arrays all work
    y_true = np.asarray(y_true).ravel().astype(int)
    y_pred_probs = np.asarray(y_pred_probs, dtype=float).ravel()
    y_pred = (y_pred_probs >= threshold).astype(int)

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    try:
        auc = roc_auc_score(y_true, y_pred_probs)
    except ValueError:
        # ROC AUC is undefined when only one class is present in y_true
        auc = float('nan')
    accuracy = accuracy_score(y_true, y_pred)

    # Calculate Sensitivity (Recall) and Specificity.
    # labels=[0, 1] forces a 2x2 matrix even if a class is absent from both
    # y_true and y_pred; without it the 4-way unpacking below would fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Geometric mean of the two class-wise recalls
    g_mean = np.sqrt(sensitivity * specificity)

    print(f"\n{name} Set Performance (Threshold={threshold}):")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"G-Mean (Sensitivity-Specificity): {g_mean:.4f}")
    return precision, recall, f1, auc, accuracy, specificity, g_mean


# Evaluate on Train Set
print("\n--- Evaluating on Train Set ---")
train_probs = transformer_model.predict(train_inputs).flatten()
train_metrics = calculate_metrics(y_train_resampled, train_probs, name="Train")

# Evaluate on Test Set
print("\n--- Evaluating on Test Set ---")
test_probs = transformer_model.predict(test_inputs).flatten()
test_metrics = calculate_metrics(y_test_resampled, test_probs, name="Test")

# Evaluate on the held-out split
# test_df was separated from the data before the scaler was fitted and before
# any resampling, and it still carries its true 'Class' labels, so scoring it
# gives an estimate on rows the training pipeline never touched.
print("\n--- Evaluating on Held-out Set ---")
test_full_probs = transformer_model.predict(full_test_inputs).flatten()
heldout_metrics = calculate_metrics(test_df['Class'], test_full_probs, name="Held-out")

# Step 8: Generate Predictions for Submission
print("\n--- Generating Submission File ---")
# One row per held-out transaction: its original DataFrame index and the
# predicted positive-class probability (same predictions as scored above)
submission_df = pd.DataFrame({
    'ID': test_ids, # Using original DataFrame index as ID
    'Class': test_full_probs
})

# Save submission file
submission_filename = 'submission_creditcard_transformer_model.csv'
submission_df.to_csv(submission_filename, index=False)
print(f"\nSubmission file '{submission_filename}' created successfully!")
print("Model training and prediction complete.")


Starting fraud detection model training with creditcard.csv...
creditcard.csv loaded successfully.
Numerical columns downcasted for memory optimization.
Removed 1 rows with NaN values in 'Class' column.
Initial split complete. Train set shape: (12748, 31), Test set shape: (3187, 31)
Removed 0 columns with >60% missing values (if any).
Train shape after dropping high missing cols: (12748, 31), Test shape: (3187, 31)
Identified 0 categorical columns and 30 numerical columns.
Missing values handled for numerical columns.
Log transformed 'Amount' to 'LogAmount'.
Removed 'Time' column from numerical features for model input.
Final features selected for model: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'LogAmount']
Number of numerical features: 29
Number of categorical features: 0
Numerical features scaled using StandardScaler.

--- Calculating Feat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_full[numerical_cols] = scaler.transform(X_test_full[numerical_cols])


Top 20 Feature Importances (by absolute coefficient):
  Feature  Coefficient  Absolute_Coefficient
      V14    -0.585524              0.585524
       V4     0.565065              0.565065
       V3    -0.202550              0.202550
      V27    -0.163179              0.163179
LogAmount    -0.067253              0.067253
      V17     0.052442              0.052442
      V28    -0.041084              0.041084
      V25    -0.023924              0.023924
      V16     0.023041              0.023041
      V20    -0.012212              0.012212
       V1     0.000000              0.000000
       V2     0.000000              0.000000
       V5     0.000000              0.000000
      V13     0.000000              0.000000
      V12     0.000000              0.000000
      V11     0.000000              0.000000
      V10     0.000000              0.000000
       V9     0.000000              0.000000
       V8     0.000000              0.000000
       V7     0.000000              0.000000
-

Transformer model built and compiled.

--- Training Transformer Model ---
Epoch 1/40
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step - accuracy: 0.4326 - auc: 0.4013 - loss: 1.0317 - precision: 0.0351 - recall: 0.4171 - val_accuracy: 0.6723 - val_auc: 0.3142 - val_loss: 0.6009 - val_precision: 0.0070 - val_recall: 0.0421
Epoch 2/40
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6146 - auc: 0.4285 - loss: 0.7194 - precision: 0.0355 - recall: 0.2665 - val_accuracy: 0.9005 - val_auc: 0.4341 - val_loss: 0.3753 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/40
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7138 - auc: 0.4897 - loss: 0.5676 - precision: 0.0413 - recall: 0.2115 - val_accuracy: 0.9450 - val_auc: 0.5351 - val_loss: 0.2666 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/40
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step 