<a href="https://colab.research.google.com/github/igAvinashSingh/ML-ASSIGNMENT2/blob/main/Credit%20Card%20Fraud%20Detection%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies with correct versions
!pip install imbalanced-learn xgboost scikit-learn matplotlib seaborn tensorflow --quiet

# Import necessary libraries
import time
import os
import warnings
warnings.filterwarnings('ignore')

print("Starting GPU-optimized credit card fraud detection pipeline...")

# Configure matplotlib to display properly in the current environment
import matplotlib.pyplot as plt
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except:
    plt.switch_backend('agg')

# Use a modern and appealing style for plots
plt.style.use('fivethirtyeight')

# Standard imports
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score
from sklearn.metrics import recall_score, accuracy_score, f1_score, roc_curve, auc, precision_recall_curve
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
import matplotlib.gridspec as gridspec
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.ticker as ticker

# Create custom colormaps for better visualizations
fraud_cmap = LinearSegmentedColormap.from_list('fraud_cmap', ['#f5f7fa', '#c3101c'])
blue_cmap = LinearSegmentedColormap.from_list('blue_cmap', ['#f5f7fa', '#1a53a2'])

# Check for GPU availability and optimize
print("Checking for GPU availability...")
if tf.config.list_physical_devices('GPU'):
    print("GPU is available!")
    # Set memory growth to avoid OOM errors
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
    # Set XGBoost to use GPU
    gpu_params = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor', 'gpu_id': 0}
else:
    print("No GPU found, using CPU.")
    gpu_params = {}

# Performance tracking setup
perf_metrics = {}
start_time_total = time.time()

# Function to measure execution time
def time_operation(operation_name):
    def decorator(func):
        def wrapper(*args, **kwargs):
            print(f"Starting {operation_name}...")
            start_time = time.time()
            result = func(*args, **kwargs)
            end_time = time.time()
            duration = end_time - start_time
            perf_metrics[operation_name] = duration
            print(f"⏱ {operation_name}: {duration:.2f} seconds")
            return result
        return wrapper
    return decorator

# Create a directory for saving plots
os.makedirs('plots', exist_ok=True)

# Step 1: Load dataset with error handling
@time_operation("Data Loading")
def load_dataset():
    # Check if file exists locally first
    file_path = "creditcard.csv"
    if os.path.exists(file_path):
        print(f"Loading dataset from local file: {file_path}")
        return pd.read_csv(file_path)
    else:
        print("Local file not found, creating synthetic dataset for testing...")
        return create_synthetic_data()

def create_synthetic_data():
    """Create a small synthetic dataset for testing when download fails"""
    np.random.seed(42)
    n_samples = 10000

    # Generate features (V1-V28, Time, Amount)
    X = np.random.randn(n_samples, 28)  # Generate exactly 28 V columns
    time = np.random.randint(0, 172800, size=n_samples)
    amount = np.random.exponential(scale=100, size=n_samples)

    # Make some features more discriminative for fraud detection
    # Generate target (mostly 0s, few 1s for fraud)
    y = np.zeros(n_samples)
    fraud_idx = np.random.choice(n_samples, size=int(n_samples * 0.002), replace=False)
    y[fraud_idx] = 1

    # Make V1, V2, V3 more discriminative for fraud detection
    X[fraud_idx, 0] = np.random.normal(-3, 1, size=len(fraud_idx))  # V1 for fraud
    X[fraud_idx, 1] = np.random.normal(2, 1, size=len(fraud_idx))   # V2 for fraud
    X[fraud_idx, 2] = np.random.normal(-4, 1.5, size=len(fraud_idx))  # V3 for fraud

    # Create DataFrame - fixed column numbering to match expected format
    df = pd.DataFrame(X, columns=[f'V{i}' for i in range(1, 29)])
    df['Time'] = time
    df['Amount'] = amount
    df['Class'] = y

    print("Synthetic data created with shape:", df.shape)
    print("Fraud transactions:", df['Class'].sum())

    return df

# Load the data
df = load_dataset()
print(f"Dataset shape: {df.shape}")
print(f"Number of fraud cases: {df['Class'].sum()}")
print(f"Percentage of fraud: {100 * df['Class'].mean():.4f}%")

# Step 2: Enhanced Exploratory Data Analysis
@time_operation("Enhanced EDA")
def enhanced_eda(dataframe):
    print("Performing enhanced exploratory data analysis...")

    # Create a figure for class distribution with count and percentage
    plt.figure(figsize=(12, 6))
    gs = gridspec.GridSpec(1, 2, width_ratios=[2, 1])

    # Plot 1: Class distribution bar plot
    ax1 = plt.subplot(gs[0])
    class_counts = dataframe['Class'].value_counts()
    sns.barplot(x=class_counts.index, y=class_counts.values, ax=ax1, palette=['#3498db', '#e74c3c'])

    for i, count in enumerate(class_counts.values):
        ax1.text(i, count + 50, f"{count:,}", ha='center', fontweight='bold')
        percentage = 100 * count / len(dataframe)
        ax1.text(i, count//2, f"{percentage:.2f}%", ha='center', color='white', fontweight='bold')

    ax1.set_title('Transaction Class Distribution', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Class (0: Normal, 1: Fraud)', fontsize=12)
    ax1.set_ylabel('Count', fontsize=12)
    ax1.set_xticklabels(['Normal', 'Fraud'])
    ax1.grid(False)

    # Plot 2: Pie chart
    ax2 = plt.subplot(gs[1])
    labels = ['Normal', 'Fraud']
    ax2.pie(class_counts.values, labels=labels, autopct='%1.2f%%',
            colors=['#3498db', '#e74c3c'], startangle=90, explode=(0, 0.1),
            textprops={'fontsize': 12, 'fontweight': 'bold'})
    ax2.set_title('Class Percentage', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.savefig('plots/class_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Transaction amount analysis: Normal vs Fraud
    plt.figure(figsize=(14, 6))
    gs = gridspec.GridSpec(1, 2)

    # Plot 1: Amount distribution by class
    ax1 = plt.subplot(gs[0])
    normal = dataframe[dataframe['Class'] == 0]['Amount']
    fraud = dataframe[dataframe['Class'] == 1]['Amount']

    # Use log scale for better visualization
    sns.histplot(normal, color='#3498db', alpha=0.5, label='Normal',
                bins=50, kde=True, log_scale=(False, True), ax=ax1)
    sns.histplot(fraud, color='#e74c3c', alpha=0.7, label='Fraud',
                bins=50, kde=True, log_scale=(False, True), ax=ax1)

    ax1.set_title('Transaction Amount Distribution by Class', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Amount', fontsize=12)
    ax1.set_ylabel('Frequency (log scale)', fontsize=12)
    ax1.legend()

    # Plot 2: Box plot comparison
    ax2 = plt.subplot(gs[1])
    sns.boxplot(x='Class', y='Amount', data=dataframe, ax=ax2,
               palette=['#3498db', '#e74c3c'])

    # Add statistics
    for i, cls in enumerate([0, 1]):
        subset = dataframe[dataframe['Class'] == cls]['Amount']
        ax2.text(i, subset.median() + 10, f"Median: ${subset.median():.2f}",
                ha='center', fontweight='bold')
        ax2.text(i, subset.mean(), f"Mean: ${subset.mean():.2f}",
                ha='center', fontweight='bold', color='black',
                bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))

    ax2.set_title('Amount Boxplot by Class', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Class (0: Normal, 1: Fraud)', fontsize=12)
    ax2.set_ylabel('Amount ($)', fontsize=12)
    ax2.set_xticklabels(['Normal', 'Fraud'])

    plt.tight_layout()
    plt.savefig('plots/amount_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Time analysis
    plt.figure(figsize=(14, 5))
    # Convert time to hours for better interpretation
    dataframe['Time_hr'] = dataframe['Time'] / 3600

    ax = sns.histplot(data=dataframe, x='Time_hr', hue='Class',
                     palette=['#3498db', '#e74c3c'], bins=48,
                     multiple='stack', alpha=0.7)

    ax.set_title('Transaction Count by Hour of Day', fontsize=14, fontweight='bold')
    ax.set_xlabel('Time (hours)', fontsize=12)
    ax.set_ylabel('Transaction Count', fontsize=12)
    ax.legend(['Normal', 'Fraud'])

    # Add grid lines for every 6 hours
    ax.xaxis.set_major_locator(ticker.MultipleLocator(6))
    ax.grid(which='major', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.savefig('plots/time_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Feature importance analysis using correlation
    plt.figure(figsize=(18, 10))
    corr_with_class = dataframe.corr()['Class'].sort_values(ascending=False)

    # Get top correlated features (positive and negative)
    top_corr = pd.concat([corr_with_class.head(15), corr_with_class.tail(15)])
    top_corr = top_corr[top_corr.index != 'Class']  # Remove the class itself

    colors = ['#e74c3c' if x > 0 else '#3498db' for x in top_corr.values]
    sns.barplot(x=top_corr.values, y=top_corr.index, palette=colors)

    plt.title('Top Features Correlated with Fraud', fontsize=16, fontweight='bold')
    plt.xlabel('Correlation Coefficient', fontsize=14)
    plt.axvline(x=0, color='black', linestyle='--')
    plt.grid(False)

    plt.tight_layout()
    plt.savefig('plots/feature_correlation.png', dpi=300, bbox_inches='tight')
    plt.close()

    # PCA visualization for 2D representation
    # Apply PCA
    X = dataframe.drop('Class', axis=1)
    X_scaled = StandardScaler().fit_transform(X)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Create DataFrame with PCA results
    pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
    pca_df['Class'] = dataframe['Class'].values

    # Plot PCA with improved styling
    plt.figure(figsize=(12, 8))

    # First plot all normal transactions with low alpha
    normal = pca_df[pca_df['Class'] == 0]
    fraud = pca_df[pca_df['Class'] == 1]

    # Plot normal transactions (sample to reduce plotting time)
    normal_sample = normal.sample(min(5000, len(normal)), random_state=42)
    plt.scatter(normal_sample['PC1'], normal_sample['PC2'],
               c='#3498db', label='Normal', alpha=0.5, s=10)

    # Plot all fraud transactions
    plt.scatter(fraud['PC1'], fraud['PC2'],
               c='#e74c3c', label='Fraud', alpha=0.9, s=30, edgecolor='black')

    plt.title('PCA: Normal vs Fraud Transactions', fontsize=16, fontweight='bold')
    plt.xlabel(f'Principal Component 1 (Variance: {pca.explained_variance_ratio_[0]:.2%})', fontsize=14)
    plt.ylabel(f'Principal Component 2 (Variance: {pca.explained_variance_ratio_[1]:.2%})', fontsize=14)
    plt.legend(title='Transaction Type', fontsize=12)
    plt.grid(True, alpha=0.3)

    # Add annotation showing total variance explained
    total_var = sum(pca.explained_variance_ratio_)
    plt.annotate(f'Total Variance Explained: {total_var:.2%}',
                xy=(0.05, 0.95), xycoords='axes fraction',
                bbox=dict(boxstyle="round,pad=0.5", fc="#f8f9fa", ec="gray", alpha=0.8),
                fontsize=12, fontweight='bold')

    plt.tight_layout()
    plt.savefig('plots/pca_visualization.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Return a few statistics for later use
    stats = {
        'fraud_rate': 100 * dataframe['Class'].mean(),
        'normal_mean_amount': dataframe[dataframe['Class'] == 0]['Amount'].mean(),
        'fraud_mean_amount': dataframe[dataframe['Class'] == 1]['Amount'].mean(),
        'top_correlated_features': corr_with_class.head(5).index.tolist()
    }

    return stats

# Run enhanced EDA
eda_stats = enhanced_eda(df)

# Step 3: Data Preprocessing
@time_operation("Data Preprocessing")
def preprocess_data(dataframe):
    # Handle missing values if any
    if dataframe.isnull().sum().sum() > 0:
        print(f"Found {dataframe.isnull().sum().sum()} missing values, handling them...")
        dataframe.fillna(dataframe.mean(), inplace=True)

    # Create a copy of the original dataframe for visualization
    df_processed = dataframe.copy()

    # Extract features and target
    X = dataframe.drop(columns=['Class'])
    y = dataframe['Class']

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Visualize the scaling effect on a few important features
    if 'V1' in dataframe.columns and 'V2' in dataframe.columns and 'V3' in dataframe.columns:
        # Pick a few important features for visualization
        features_to_plot = ['Amount', 'V1', 'V2', 'V3', 'V4']

        plt.figure(figsize=(16, 12))
        for i, feature in enumerate(features_to_plot):
            if feature in dataframe.columns:
                # Original distribution
                plt.subplot(len(features_to_plot), 2, 2*i+1)
                sns.histplot(dataframe[feature][dataframe['Class']==0], color='blue',
                           label='Normal', alpha=0.5, kde=True)
                sns.histplot(dataframe[feature][dataframe['Class']==1], color='red',
                           label='Fraud', alpha=0.5, kde=True)
                plt.title(f'Original {feature} Distribution')
                plt.legend()

                # Scaled distribution
                plt.subplot(len(features_to_plot), 2, 2*i+2)
                feature_idx = list(X.columns).index(feature)
                sns.histplot(X_scaled[:, feature_idx][y==0], color='blue',
                           label='Normal', alpha=0.5, kde=True)
                sns.histplot(X_scaled[:, feature_idx][y==1], color='red',
                           label='Fraud', alpha=0.5, kde=True)
                plt.title(f'Scaled {feature} Distribution')
                plt.legend()

        plt.tight_layout()
        plt.savefig('plots/feature_scaling_effect.png', dpi=300, bbox_inches='tight')
        plt.close()

    return X_scaled, y.values, df_processed

X_scaled_np, y_np, df_processed = preprocess_data(df)

# Train-Test Split with stratification
@time_operation("Train-Test Split")
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
    print(f"Fraud cases in training: {np.sum(y_train)}, test: {np.sum(y_test)}")

    # Visualize the train-test split
    plt.figure(figsize=(12, 6))

    # Plot 1: Class distribution in train set
    plt.subplot(1, 2, 1)
    train_counts = pd.Series(y_train).value_counts()
    train_labels = ['Normal', 'Fraud'] if len(train_counts) > 1 else ['Normal']
    plt.pie(train_counts, labels=train_labels, autopct='%1.2f%%',
           colors=['#3498db', '#e74c3c'], explode=[0, 0.1] if len(train_counts) > 1 else [0])
    plt.title('Training Set Class Distribution')

    # Plot 2: Class distribution in test set
    plt.subplot(1, 2, 2)
    test_counts = pd.Series(y_test).value_counts()
    test_labels = ['Normal', 'Fraud'] if len(test_counts) > 1 else ['Normal']
    plt.pie(test_counts, labels=test_labels, autopct='%1.2f%%',
           colors=['#3498db', '#e74c3c'], explode=[0, 0.1] if len(test_counts) > 1 else [0])
    plt.title('Test Set Class Distribution')

    plt.tight_layout()
    plt.savefig('plots/train_test_split.png', dpi=300, bbox_inches='tight')
    plt.close()

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(X_scaled_np, y_np)

# Apply SMOTE with GPU acceleration if available
@time_operation("SMOTE Resampling")
def apply_smote(X_train, y_train):
    try:
        print("Applying SMOTE with 10% sampling ratio...")
        smote = SMOTE(random_state=42, sampling_strategy=0.1)
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
        print(f"After SMOTE - X shape: {X_resampled.shape}, Fraud cases: {np.sum(y_resampled)}")

        # Visualize before and after SMOTE
        plt.figure(figsize=(14, 6))

        # Plot 1: Before SMOTE
        plt.subplot(1, 2, 1)
        before_counts = pd.Series(y_train).value_counts()
        plt.bar([0, 1], before_counts, color=['#3498db', '#e74c3c'])
        for i, count in enumerate(before_counts):
            plt.text(i, count//2, f"{count}\n({100*count/len(y_train):.2f}%)",
                   ha='center', color='white', fontweight='bold')
        plt.title('Class Distribution Before SMOTE')
        plt.xticks([0, 1], ['Normal', 'Fraud'])
        plt.xlabel('Class')
        plt.ylabel('Count')

        # Plot 2: After SMOTE
        plt.subplot(1, 2, 2)
        after_counts = pd.Series(y_resampled).value_counts()
        plt.bar([0, 1], after_counts, color=['#3498db', '#e74c3c'])
        for i, count in enumerate(after_counts):
            plt.text(i, count//2, f"{count}\n({100*count/len(y_resampled):.2f}%)",
                   ha='center', color='white', fontweight='bold')
        plt.title('Class Distribution After SMOTE')
        plt.xticks([0, 1], ['Normal', 'Fraud'])
        plt.xlabel('Class')
        plt.ylabel('Count')

        plt.tight_layout()
        plt.savefig('plots/smote_effect.png', dpi=300, bbox_inches='tight')
        plt.close()

        # Visualize SMOTE effect with PCA
        all_data = np.vstack((X_train, X_resampled[np.sum(y_train):]))
        all_labels = np.hstack((y_train, y_resampled[np.sum(y_train):]))

        # Flag original vs synthetic samples
        origin = np.hstack((np.zeros(len(X_train)), np.ones(len(X_resampled) - len(X_train))))

        # Apply PCA for visualization
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(all_data)

        # Create a dataframe for plotting
        pca_df = pd.DataFrame({
            'PC1': pca_result[:, 0],
            'PC2': pca_result[:, 1],
            'Class': all_labels,
            'Origin': ['Original' if o == 0 else 'Synthetic' for o in origin]
        })

        # Plot
        plt.figure(figsize=(12, 8))

        # Plot normal samples
        plt.scatter(
            pca_df[pca_df['Class'] == 0]['PC1'],
            pca_df[pca_df['Class'] == 0]['PC2'],
            c='#3498db', alpha=0.5, s=10, label='Normal'
        )

        # Plot original fraud samples
        plt.scatter(
            pca_df[(pca_df['Class'] == 1) & (pca_df['Origin'] == 'Original')]['PC1'],
            pca_df[(pca_df['Class'] == 1) & (pca_df['Origin'] == 'Original')]['PC2'],
            c='#e74c3c', s=30, label='Original Fraud', alpha=0.9
        )

        # Plot synthetic fraud samples
        plt.scatter(
            pca_df[(pca_df['Class'] == 1) & (pca_df['Origin'] == 'Synthetic')]['PC1'],
            pca_df[(pca_df['Class'] == 1) & (pca_df['Origin'] == 'Synthetic')]['PC2'],
            c='#2ecc71', s=20, label='Synthetic Fraud', alpha=0.7
        )

        plt.title('PCA: Original vs. SMOTE-generated Samples', fontsize=16, fontweight='bold')
        plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2%} variance)', fontsize=12)
        plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2%} variance)', fontsize=12)
        plt.legend(title='Sample Type', fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig('plots/smote_pca_visualization.png', dpi=300, bbox_inches='tight')
        plt.close()

        return X_resampled, y_resampled

    except Exception as e:
        print(f"SMOTE failed: {e}")
        print("Attempting basic random oversampling as fallback...")

        # Simple random oversampling as fallback
        fraud_indices = np.where(y_train == 1)[0]
        non_fraud_indices = np.where(y_train == 0)[0]

        # Oversample fraud cases to 10% of non-fraud
        target_samples = int(len(non_fraud_indices) * 0.1)
        oversample_indices = np.random.choice(
            fraud_indices,
            size=max(target_samples - len(fraud_indices), 0),
            replace=True
        )

        # Combine with original fraud indices
        all_fraud_indices = np.concatenate([fraud_indices, oversample_indices])
        combined_indices = np.concatenate([non_fraud_indices, all_fraud_indices])

        # Shuffle the indices
        np.random.shuffle(combined_indices)

        X_resampled = X_train[combined_indices]
        y_resampled = y_train[combined_indices]

        print(f"After manual oversampling - X shape: {X_resampled.shape}, Fraud cases: {np.sum(y_resampled)}")
        return X_resampled, y_resampled

X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)

# Create TensorFlow model optimized for GPU
def create_tensorflow_model(input_dim):
    # Reduce batch size for T4 GPU memory constraints
    # Use mixed precision for faster training
    try:
        tf.keras.mixed_precision.set_global_policy('mixed_float16')
    except:
        print("Mixed precision not supported, using default precision")

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(input_dim,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile with appropriate optimizer for GPU
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC()]
    )

    return model

# Define the models dictionary scope at the global level
models = {}

# Step 4: Train GPU-optimized models
@time_operation("Model Training and Evaluation")
def train_and_evaluate_models(X_train, y_train, X_test, y_test):
    # Configure models optimized for GPU/performance
    global models
    models = {
        "Logistic Regression": LogisticRegression(
            max_iter=300,
            solver='liblinear',
            C=0.1,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ),
        "Random Forest": RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=10,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ),
        "XGBoost": XGBClassifier(
            n_estimators=100,
            max_depth=8,
            learning_rate=0.1,
            eval_metric="logloss",
            **gpu_params
        ),
        "Neural Network": create_tensorflow_model(X_train.shape[1]),
        "Support Vector Machine": SVC(
            kernel='rbf',
            C=1.0,
            gamma='scale',
            probability=True,
            class_weight='balanced',
            random_state=42
        ),
        "Gradient Boosting": GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        ),
        "Decision Tree": DecisionTreeClassifier(
            max_depth=8,
            min_samples_split=10,
            class_weight='balanced',
            random_state=42
        )
    }

    performance = []
    best_model = None
    best_f1 = -1

    # Store training history for Neural Network
    nn_history = None

    # Store predictions for later visualization
    all_predictions = {}

    for name, model in models.items():
        model_start = time.time()

        print(f"Training {name}...")
        try:
            if name == "Neural Network":
                # TensorFlow model requires different training approach
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=10,
                    batch_size=2048,
                    verbose=0
                )

                # Store history for later visualization
                nn_history = history.history

                # Get predictions
                y_prob = model.predict(X_test, batch_size=4096)
                y_pred = (y_prob > 0.5).astype(int)

                # Calculate AUC
                auc_score = roc_auc_score(y_test, y_prob)

            else:
                # Scikit-learn based models
                # Scikit-learn based models
                model.fit(X_train, y_train)

                # Get predictions
                y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_test)
                y_pred = (y_prob > 0.5).astype(int) if hasattr(y_prob, "astype") else y_prob

                # Calculate AUC
                auc_score = roc_auc_score(y_test, y_prob)

            # Calculate metrics
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            model_duration = time.time() - model_start

            # Print results
            print(f"{name} Results:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1 Score: {f1:.4f}")
            print(f"AUC: {auc_score:.4f}")
            print(f"Training Time: {model_duration:.2f} seconds\n")

            # Store performance
            performance.append({
                'Model': name,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'AUC': auc_score,
                'Training Time': model_duration
            })

            # Store predictions for later visualization
            all_predictions[name] = {
                'y_prob': y_prob,
                'y_pred': y_pred
            }

            # Update best model based on F1 score
            if f1 > best_f1:
                best_f1 = f1
                best_model = name

        except Exception as e:
            print(f"Error training {name}: {e}")
            continue

    # Convert to DataFrame for easier analysis
    performance_df = pd.DataFrame(performance)

    # Visualize NN training history if available
    if nn_history:
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(nn_history['loss'], label='Training Loss')
        plt.plot(nn_history['val_loss'], label='Validation Loss')
        plt.title('Neural Network Training Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(nn_history['accuracy'], label='Training Accuracy')
        plt.plot(nn_history['val_accuracy'], label='Validation Accuracy')
        plt.title('Neural Network Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.tight_layout()
        plt.savefig('plots/neural_network_training.png', dpi=300, bbox_inches='tight')
        plt.close()

    # Create interactive performance visualization
    plt.figure(figsize=(16, 10))

    # Metrics comparison
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']

    performance_sorted = performance_df.sort_values('F1 Score', ascending=False)
    models_sorted = performance_sorted['Model'].tolist()

    colors = [
        '#3498db', '#e74c3c', '#2ecc71', '#f39c12',
        '#9b59b6', '#1abc9c', '#34495e'
    ]

    # Plot 1: Model performance metrics
    plt.subplot(2, 2, 1)
    bar_width = 0.15
    x = np.arange(len(models_sorted))

    for i, metric in enumerate(metrics):
        plt.bar(
            x + i * bar_width,
            performance_sorted[metric],
            width=bar_width,
            label=metric,
            color=colors[i % len(colors)]
        )

    plt.xlabel('Models', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')
    plt.xticks(x + bar_width * 2, models_sorted, rotation=45, ha='right')
    plt.ylim(0, 1.05)
    plt.legend(loc='upper right')
    plt.grid(axis='y', alpha=0.3)

    # Plot 2: Training time comparison
    plt.subplot(2, 2, 2)
    bars = plt.bar(
        models_sorted,
        performance_sorted['Training Time'],
        color='#3498db'
    )

    # Add time labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2.,
            height + 0.1,
            f'{height:.1f}s',
            ha='center',
            va='bottom',
            fontweight='bold'
        )

    plt.xlabel('Models', fontsize=12)
    plt.ylabel('Time (seconds)', fontsize=12)
    plt.title('Training Time Comparison', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)

    # Plot 3: ROC curves
    plt.subplot(2, 1, 2)
    for i, model_name in enumerate(models_sorted):
        if model_name in all_predictions:
            y_prob = all_predictions[model_name]['y_prob']
            fpr, tpr, _ = roc_curve(y_test, y_prob)
            auc_val = auc(fpr, tpr)

            plt.plot(
                fpr,
                tpr,
                label=f'{model_name} (AUC = {auc_val:.4f})',
                color=colors[i % len(colors)],
                linewidth=2
            )

    plt.plot([0, 1], [0, 1], 'k--', label='Random (AUC = 0.500)')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('Receiver Operating Characteristic (ROC) Curves', fontsize=14, fontweight='bold')
    plt.legend(loc='lower right')
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig('plots/model_performance_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Create confusion matrices for each model
    rows = int(np.ceil(len(models_sorted) / 3))
    plt.figure(figsize=(18, rows * 5))

    for i, model_name in enumerate(models_sorted):
        if model_name in all_predictions:
            y_pred = all_predictions[model_name]['y_pred']
            cm = confusion_matrix(y_test, y_pred)

            plt.subplot(rows, 3, i+1)
            sns.heatmap(
                cm,
                annot=True,
                fmt="d",
                cmap=fraud_cmap,
                cbar=False
            )

            plt.title(f'{model_name} Confusion Matrix', fontsize=14)
            plt.xlabel('Predicted Label', fontsize=10)
            plt.ylabel('True Label', fontsize=10)
            plt.xticks([0.5, 1.5], ['Normal', 'Fraud'], rotation=0)
            plt.yticks([0.5, 1.5], ['Normal', 'Fraud'], rotation=0)

    plt.tight_layout()
    plt.savefig('plots/confusion_matrices.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Visualize the precision-recall curves
    plt.figure(figsize=(12, 8))

    for i, model_name in enumerate(models_sorted):
        if model_name in all_predictions:
            y_prob = all_predictions[model_name]['y_prob']
            precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_prob)

            plt.plot(
                recall_curve,
                precision_curve,
                label=f'{model_name} (F1 = {performance_sorted[performance_sorted["Model"] == model_name]["F1 Score"].values[0]:.4f})',
                color=colors[i % len(colors)],
                linewidth=2
            )

    # Add a horizontal line representing the baseline
    plt.axhline(y=sum(y_test)/len(y_test), color='k', linestyle='--',
               label=f'No Skill (Baseline = {sum(y_test)/len(y_test):.4f})')

    plt.xlabel('Recall', fontsize=12)
    plt.ylabel('Precision', fontsize=12)
    plt.title('Precision-Recall Curves', fontsize=14, fontweight='bold')
    plt.legend(loc='upper right')
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig('plots/precision_recall_curves.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Return best model and performance data
    return best_model, performance_df, all_predictions

# Train and evaluate models
best_model, performance_df, all_predictions = train_and_evaluate_models(
    X_train_resampled, y_train_resampled, X_test, y_test
)

print(f"\n🏆 Best performing model: {best_model}")
print(performance_df.sort_values('F1 Score', ascending=False).to_string(index=False))

# Step 5: Feature Importance Analysis for the best model
@time_operation("Feature Importance Analysis")
def feature_importance_analysis():
    # Select the best model from the trained models
    if best_model not in models:
        print("Best model not found in the trained models.")
        return

    model = models[best_model]
    feature_names = df.drop(columns=['Class']).columns.tolist()

    # Different feature importance extraction depending on model type
    if best_model == "Neural Network":
        # For Neural Network, we use permutation importance
        from sklearn.inspection import permutation_importance

        result = permutation_importance(
            model, X_test, y_test,
            n_repeats=5, random_state=42, n_jobs=-1
        )

        importances = result.importances_mean
        indices = np.argsort(importances)[::-1]

    elif best_model in ["XGBoost", "Random Forest", "Gradient Boosting", "Decision Tree"]:
        # Tree-based models
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            indices = np.argsort(importances)[::-1]
        else:
            print(f"Feature importance not available for {best_model}")
            return

    elif best_model == "Logistic Regression":
        # Logistic Regression
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_[0])
            indices = np.argsort(importances)[::-1]
        else:
            print(f"Feature importance not available for {best_model}")
            return

    else:
        # For other models, try to use permutation importance
        try:
            from sklearn.inspection import permutation_importance

            result = permutation_importance(
                model, X_test, y_test,
                n_repeats=5, random_state=42, n_jobs=-1
            )

            importances = result.importances_mean
            indices = np.argsort(importances)[::-1]
        except:
            print(f"Feature importance not available for {best_model}")
            return

    # Display top 20 features
    top_n = min(20, len(feature_names))
    plt.figure(figsize=(14, 10))

    # Get the most important features
    top_indices = indices[:top_n]
    top_importances = importances[top_indices]
    top_features = [feature_names[i] for i in top_indices]

    # Create a DataFrame for easier sorting
    importance_df = pd.DataFrame({
        'Feature': top_features,
        'Importance': top_importances
    })
    importance_df = importance_df.sort_values('Importance', ascending=True)

    # Plot horizontal bar chart
    bars = plt.barh(importance_df['Feature'], importance_df['Importance'], color='#3498db')

    for i, bar in enumerate(bars):
        plt.text(
            bar.get_width() * 1.01,
            bar.get_y() + bar.get_height()/2,
            f"{importance_df['Importance'].iloc[i]:.4f}",
            va='center',
            fontweight='bold'
        )

    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.title(f'Top {top_n} Features for {best_model}', fontsize=16, fontweight='bold')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()

    plt.savefig('plots/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close()

    return importance_df

# Analyze feature importance
importance_df = feature_importance_analysis()

if importance_df is not None:
    print("\nTop 10 most important features:")
    print(importance_df.sort_values('Importance', ascending=False).head(10))

# Step 6: Real-time Fraud Detection Simulation
@time_operation("Fraud Detection Simulation")
def simulate_fraud_detection():
    print("Simulating real-time fraud detection pipeline...")

    if best_model not in models:
        print("Best model not found in the trained models.")
        return

    model = models[best_model]
    feature_names = df.drop(columns=['Class']).columns.tolist()

    # Create test cases - a mix of normal and fraud transactions
    # Sample a few random examples from the test set
    normal_indices = np.where(y_test == 0)[0]
    fraud_indices = np.where(y_test == 1)[0]

    # Select random examples for the simulation
    n_normal = min(5, len(normal_indices))
    n_fraud = min(5, len(fraud_indices))

    selected_normal = np.random.choice(normal_indices, n_normal, replace=False)
    selected_fraud = np.random.choice(fraud_indices, n_fraud, replace=False)

    # Combine the indices
    selected_indices = np.concatenate([selected_normal, selected_fraud])
    np.random.shuffle(selected_indices)

    # Get the selected examples
    X_sample = X_test[selected_indices]
    y_sample = y_test[selected_indices]

    # Create a figure to save all simulations
    plt.figure(figsize=(14, len(selected_indices) * 3))

    # Perform real-time detection
    detection_results = []

    for i, (x, y_true) in enumerate(zip(X_sample, y_sample)):
        # Reshape for single prediction
        x_reshape = x.reshape(1, -1)

        # Measure prediction time
        start_time = time.time()

        # Get prediction
        if best_model == "Neural Network":
            y_prob = model.predict(x_reshape)[0][0]
        else:
            y_prob = model.predict_proba(x_reshape)[0][1] if hasattr(model, "predict_proba") else model.predict(x_reshape)[0]

        y_pred = 1 if y_prob > 0.5 else 0
        end_time = time.time()

        detection_time = (end_time - start_time) * 1000  # in milliseconds

        # Create a subplot for each transaction
        plt.subplot(len(selected_indices), 1, i + 1)
        bar_colors = ['#3498db', '#e74c3c']

        # Create a horizontal bar for confidence
        bars = plt.barh(
            ['Normal', 'Fraud'],
            [1 - y_prob, y_prob],
            color=[bar_colors[0], bar_colors[1]],
            height=0.5
        )

        # Add percentage text
        for bar, val in zip(bars, [1 - y_prob, y_prob]):
            plt.text(
                min(val + 0.02, 0.98) if val > 0.92 else val + 0.02,
                bar.get_y() + bar.get_height()/2,
                f"{val:.2%}",
                va='center',
                fontweight='bold',
                color='white' if val > 0.3 else 'black'
            )

        # Highlight the correct class
        correct = y_pred == y_true
        result_color = '#2ecc71' if correct else '#e74c3c'
        result_text = "Correct" if correct else "Incorrect"

        plt.title(
            f"Transaction {i+1}: {'Fraud' if y_true == 1 else 'Normal'} | "
            f"Prediction: {'Fraud' if y_pred == 1 else 'Normal'} | "
            f"Result: {result_text} | "
            f"Detection time: {detection_time:.2f}ms",
            fontweight='bold',
            color=result_color
        )

        plt.xlim(0, 1)
        plt.grid(axis='x', alpha=0.3)

        # Store result for the final report
        detection_results.append({
            'Transaction': i + 1,
            'True Class': 'Fraud' if y_true == 1 else 'Normal',
            'Predicted Class': 'Fraud' if y_pred == 1 else 'Normal',
            'Fraud Probability': y_prob,
            'Correct': correct,
            'Detection Time (ms)': detection_time
        })

    plt.tight_layout()
    plt.savefig('plots/fraud_detection_simulation.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Create a summary of the simulation
    detection_df = pd.DataFrame(detection_results)

    # Calculate accuracy and average detection time
    accuracy = detection_df['Correct'].mean()
    avg_detection_time = detection_df['Detection Time (ms)'].mean()

    print(f"Simulation Results:")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Average Detection Time: {avg_detection_time:.2f} ms")

    return detection_df

# Run fraud detection simulation
detection_results = simulate_fraud_detection()

# Final summary and report
@time_operation("Performance Summary")
def create_performance_summary():
    # End time for total script run
    end_time_total = time.time()
    total_duration = end_time_total - start_time_total

    print("\n" + "="*80)
    print("CREDIT CARD FRAUD DETECTION - PERFORMANCE SUMMARY")
    print("="*80)

    print(f"\nTotal Processing Time: {total_duration:.2f} seconds")

    # Dataset summary
    print("\nDataset Summary:")
    print(f"- Total transactions: {len(df)}")
    print(f"- Fraud transactions: {int(df['Class'].sum())} ({eda_stats['fraud_rate']:.4f}%)")
    print(f"- Normal transactions: {len(df) - int(df['Class'].sum())}")

    # Display performance timing
    print("\nPipeline Performance:")
    for operation, duration in perf_metrics.items():
        print(f"- {operation}: {duration:.2f} seconds")

    # Best model details
    print(f"\nBest Model: {best_model}")
    best_performance = performance_df[performance_df['Model'] == best_model].iloc[0]
    print(f"- Accuracy: {best_performance['Accuracy']:.4f}")
    print(f"- Precision: {best_performance['Precision']:.4f}")
    print(f"- Recall: {best_performance['Recall']:.4f}")
    print(f"- F1 Score: {best_performance['F1 Score']:.4f}")
    print(f"- AUC: {best_performance['AUC']:.4f}")

    # Top features
    if importance_df is not None:
        print("\nTop 5 Most Important Features:")
        for i, row in importance_df.sort_values('Importance', ascending=False).head(5).iterrows():
            print(f"- {row['Feature']}: {row['Importance']:.4f}")

    # Save summary to file
    with open('performance_summary.txt', 'w') as f:
        f.write("CREDIT CARD FRAUD DETECTION - PERFORMANCE SUMMARY\n")
        f.write("="*80 + "\n\n")

        f.write(f"Total Processing Time: {total_duration:.2f} seconds\n\n")

        f.write("Dataset Summary:\n")
        f.write(f"- Total transactions: {len(df)}\n")
        f.write(f"- Fraud transactions: {int(df['Class'].sum())} ({eda_stats['fraud_rate']:.4f}%)\n")
        f.write(f"- Normal transactions: {len(df) - int(df['Class'].sum())}\n\n")

        f.write("Pipeline Performance:\n")
        for operation, duration in perf_metrics.items():
            f.write(f"- {operation}: {duration:.2f} seconds\n")

        f.write(f"\nBest Model: {best_model}\n")
        f.write(f"- Accuracy: {best_performance['Accuracy']:.4f}\n")
        f.write(f"- Precision: {best_performance['Precision']:.4f}\n")
        f.write(f"- Recall: {best_performance['Recall']:.4f}\n")
        f.write(f"- F1 Score: {best_performance['F1 Score']:.4f}\n")
        f.write(f"- AUC: {best_performance['AUC']:.4f}\n")

        if importance_df is not None:
            f.write("\nTop Features:\n")
            for i, row in importance_df.sort_values('Importance', ascending=False).head(10).iterrows():
                f.write(f"- {row['Feature']}: {row['Importance']:.4f}\n")

    print("\nPerformance summary saved to performance_summary.txt")
    print("\nAll visualizations saved to the 'plots' directory.")

# Generate final performance summary
create_performance_summary()

print("\n✅ Credit card fraud detection pipeline complete!")

Starting GPU-optimized credit card fraud detection pipeline...
Checking for GPU availability...
GPU is available!
Starting Data Loading...
Local file not found, creating synthetic dataset for testing...
Synthetic data created with shape: (10000, 31)
Fraud transactions: 20.0
⏱ Data Loading: 0.02 seconds
Dataset shape: (10000, 31)
Number of fraud cases: 20.0
Percentage of fraud: 0.2000%
Starting Enhanced EDA...
Performing enhanced exploratory data analysis...
⏱ Enhanced EDA: 5.91 seconds
Starting Data Preprocessing...
⏱ Data Preprocessing: 4.26 seconds
Starting Train-Test Split...
Training set: (8000, 31), Test set: (2000, 31)
Fraud cases in training: 16.0, test: 4.0
⏱ Train-Test Split: 0.44 seconds
Starting SMOTE Resampling...
Applying SMOTE with 10% sampling ratio...
After SMOTE - X shape: (8782, 31), Fraud cases: 798.0
SMOTE failed: slice indices must be integers or None or have an __index__ method
Attempting basic random oversampling as fallback...
After manual oversampling - X shape

In [None]:
from google.colab import drive
drive.mount('/content/drive')