# In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, learning_curve, cross_val_score
from sklearn.metrics import (confusion_matrix, classification_report, precision_recall_curve,
                           roc_curve, auc, precision_score, recall_score, f1_score, accuracy_score)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# XGBoost is an optional dependency: record whether it could be imported so
# the rest of the file can decide whether to register its classifier.
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost is not installed. Skipping XGBoost classifier.")
    print("To install XGBoost, run: pip install xgboost")
else:
    XGBOOST_AVAILABLE = True

class FraudDetectionAnalysis:
    """Train and compare several classifiers on a class-balanced fraud dataset.

    The CSV loaded in ``__init__`` is expected to contain ``Time``, ``Amount``
    and a binary ``Class`` column (1 = fraud, 0 = no fraud); every other
    column is passed to the models unchanged as a feature.

    Intended call order::

        preprocess_data() -> plot_data_distribution() -> evaluate_models()
        -> plot_learning_curves() -> plot_roc_curves()
        -> generate_summary_report()
    """

    # Style names to try, newest first. Matplotlib 3.6 renamed its bundled
    # seaborn styles to "seaborn-v0_8*" and later releases removed the old
    # names, so a hard-coded 'seaborn' raises OSError there.
    _PLOT_STYLES = ('seaborn-v0_8', 'seaborn')

    def __init__(self, data_path):
        """Load the dataset and build the classifiers to compare.

        Args:
            data_path: Path of the CSV file, forwarded to ``pd.read_csv``.
        """
        self.dataset = pd.read_csv(data_path)
        # Maps classifier name -> dict of metrics; filled by evaluate_models().
        self.results = {}

        # Initialize classifiers
        self.classifiers = {
            "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
            "Logistic Regression": LogisticRegression(random_state=42),
            "KNN": KNeighborsClassifier(),
            # probability=True is needed so SVC exposes predict_proba for ROC.
            "SVC": SVC(random_state=42, probability=True),
            "Decision Tree": DecisionTreeClassifier(random_state=42)
        }

        # Add XGBoost if available
        if XGBOOST_AVAILABLE:
            self.classifiers["XGBoost"] = XGBClassifier(random_state=42)

    @classmethod
    def _apply_plot_style(cls):
        """Activate the first seaborn-like Matplotlib style that is installed."""
        for style_name in cls._PLOT_STYLES:
            if style_name in plt.style.available:
                plt.style.use(style_name)
                return
        # Neither name exists: keep Matplotlib's current style rather than fail.

    def preprocess_data(self):
        """Scale ``Amount``/``Time``, undersample to balance classes, and split.

        Side effects:
            * ``self.dataset`` gains ``scaled_amount`` and ``scaled_time`` and
              loses ``Time`` and ``Amount``.
            * Sets ``self.balanced_df`` (all fraud rows plus an equally sized
              random sample of non-fraud rows, shuffled).
            * Sets ``self.X_train``, ``self.X_test``, ``self.y_train`` and
              ``self.y_test`` from an 80/20 stratified split of the balanced
              data.

        The method may be called more than once; already-scaled columns are
        left as they are.

        Raises:
            KeyError: If a raw column and its scaled counterpart are both
                missing from the dataset.
        """
        print("Starting data preprocessing...")

        # Scale Amount and Time features.
        # NOTE(review): the scalers are fitted on the full dataset before the
        # train/test split, so test rows influence the scaling statistics.
        for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
            if raw_col in self.dataset.columns:
                column_values = self.dataset[raw_col].values.reshape(-1, 1)
                # A fresh scaler per column; ravel() gives pandas a 1-D array.
                self.dataset[scaled_col] = RobustScaler().fit_transform(column_values).ravel()
            elif scaled_col not in self.dataset.columns:
                raise KeyError(f"Dataset has neither '{raw_col}' nor '{scaled_col}' column")
        # errors='ignore' keeps a second call from failing on the dropped columns.
        self.dataset.drop(columns=['Time', 'Amount'], errors='ignore', inplace=True)

        # Create balanced dataset
        fraud_df = self.dataset[self.dataset['Class'] == 1]
        non_fraud_df = self.dataset[self.dataset['Class'] == 0].sample(n=len(fraud_df), random_state=42)
        self.balanced_df = pd.concat([fraud_df, non_fraud_df]).sample(frac=1, random_state=42)

        # Prepare features and target
        X = self.balanced_df.drop('Class', axis=1)
        y = self.balanced_df['Class']
        # Stratify so train and test keep the same 50/50 class ratio.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        print("Data preprocessing completed.")

    def plot_data_distribution(self):
        """Plot and print the class balance before and after undersampling.

        Requires ``preprocess_data()`` to have been called, because it reads
        ``self.balanced_df``.
        """
        self._apply_plot_style()
        plt.figure(figsize=(15, 5))

        # Original distribution
        plt.subplot(121)
        sns.countplot(data=self.dataset, x='Class', palette=['#2ecc71', '#e74c3c'])
        plt.title('Original Class Distribution', fontsize=12)

        # Balanced distribution
        plt.subplot(122)
        sns.countplot(data=self.balanced_df, x='Class', palette=['#2ecc71', '#e74c3c'])
        plt.title('Balanced Class Distribution', fontsize=12)

        plt.tight_layout()
        plt.show()

        # Print distribution percentages
        print("\nClass Distribution Summary:")
        print("-" * 50)
        print("Original Dataset:")
        print(f"No Fraud: {len(self.dataset[self.dataset['Class']==0])/len(self.dataset)*100:.2f}%")
        print(f"Fraud: {len(self.dataset[self.dataset['Class']==1])/len(self.dataset)*100:.2f}%")
        print("\nBalanced Dataset:")
        print(f"No Fraud: {len(self.balanced_df[self.balanced_df['Class']==0])/len(self.balanced_df)*100:.2f}%")
        print(f"Fraud: {len(self.balanced_df[self.balanced_df['Class']==1])/len(self.balanced_df)*100:.2f}%")

    def evaluate_models(self):
        """Fit every classifier on the training split and score it on the test split.

        For each classifier, ``self.results[name]`` receives ``accuracy``,
        ``precision``, ``recall``, ``f1`` and ``confusion_matrix``. A
        classifier that raises is reported on stdout and skipped, so one
        failure does not stop the others.
        """
        for name, classifier in self.classifiers.items():
            print(f"\nEvaluating {name}...")
            try:
                # Train and predict
                classifier.fit(self.X_train, self.y_train)
                y_pred = classifier.predict(self.X_test)

                # Calculate metrics
                self.results[name] = {
                    'accuracy': accuracy_score(self.y_test, y_pred),
                    'precision': precision_score(self.y_test, y_pred),
                    'recall': recall_score(self.y_test, y_pred),
                    'f1': f1_score(self.y_test, y_pred),
                    'confusion_matrix': confusion_matrix(self.y_test, y_pred)
                }
                print(f"{name} evaluation completed successfully.")
            except Exception as e:
                # Drop metrics from an earlier run so the report never shows
                # numbers that do not belong to the current evaluation.
                self.results.pop(name, None)
                print(f"Error evaluating {name}: {str(e)}")

    def plot_learning_curves(self):
        """Draw a 5-fold cross-validated learning curve for each classifier.

        Curves are laid out two per row. If computing a curve fails, the error
        text is written into that classifier's subplot instead.
        """
        self._apply_plot_style()
        n_models = len(self.classifiers)
        n_rows = (n_models + 1) // 2  # two plots per row, rounded up
        plt.figure(figsize=(15, 5 * n_rows))

        for idx, (name, classifier) in enumerate(self.classifiers.items(), 1):
            plt.subplot(n_rows, 2, idx)
            try:
                train_sizes, train_scores, test_scores = learning_curve(
                    classifier, self.X_train, self.y_train,
                    cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10)
                )

                # Aggregate over the CV folds (axis 1) for each training size.
                train_mean = np.mean(train_scores, axis=1)
                train_std = np.std(train_scores, axis=1)
                test_mean = np.mean(test_scores, axis=1)
                test_std = np.std(test_scores, axis=1)

                plt.plot(train_sizes, train_mean, label='Training score', color='#2ecc71')
                plt.plot(train_sizes, test_mean, label='Cross-validation score', color='#e74c3c')
                plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='#2ecc71')
                plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='#e74c3c')

                plt.title(f'Learning Curve - {name}', fontsize=12)
                plt.xlabel('Training Examples', fontsize=10)
                plt.ylabel('Score', fontsize=10)
                plt.grid(True, linestyle='--', alpha=0.7)
                plt.legend(loc='lower right', fontsize=10)
            except Exception as e:
                plt.text(0.5, 0.5, f"Error plotting learning curve for {name}:\n{str(e)}",
                        ha='center', va='center', wrap=True)

        plt.tight_layout()
        plt.show()

    def plot_roc_curves(self):
        """Overlay the test-set ROC curve of every classifier on one figure.

        Only classifiers exposing ``predict_proba`` are drawn. Classifiers
        must already be fitted (see ``evaluate_models``); failures are
        reported on stdout and that curve is omitted.
        """
        self._apply_plot_style()
        plt.figure(figsize=(10, 8))

        for name, classifier in self.classifiers.items():
            try:
                if hasattr(classifier, "predict_proba"):
                    # Column 1 is the predicted probability of the positive class.
                    y_score = classifier.predict_proba(self.X_test)[:, 1]
                    fpr, tpr, _ = roc_curve(self.y_test, y_score)
                    roc_auc = auc(fpr, tpr)
                    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
            except Exception as e:
                print(f"Error plotting ROC curve for {name}: {str(e)}")

        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves Comparison', fontsize=12)
        plt.legend(loc="lower right", fontsize=10)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.show()

    def generate_summary_report(self):
        """Print a metrics table and a per-classifier classification report.

        The table has one row per evaluated classifier with its scalar metrics
        rounded to four decimals. If ``evaluate_models()`` has not produced
        any results, a short notice is printed and nothing else happens.
        """
        if not self.results:
            print("\nNo model results available. Run evaluate_models() first.")
            return

        # Build the table from the scalar metrics only. Keeping the confusion
        # matrices in the frame would make every column object-dtype, and
        # DataFrame.round() silently skips non-numeric columns.
        scalar_metrics = {
            name: {key: value for key, value in metrics.items() if key != 'confusion_matrix'}
            for name, metrics in self.results.items()
        }
        summary_df = pd.DataFrame(scalar_metrics).T.astype(float)

        print("\nModel Performance Summary:")
        print("-" * 80)
        print(summary_df.round(4))

        # Print detailed classification reports
        print("\nDetailed Classification Reports:")
        print("-" * 80)
        for name, classifier in self.classifiers.items():
            try:
                print(f"\n{name}:")
                y_pred = classifier.predict(self.X_test)
                print(classification_report(self.y_test, y_pred))
            except Exception as e:
                print(f"Error generating report for {name}: {str(e)}")

# Script entry point: build the analysis from the local CSV and run each
# pipeline stage in order. Any failure is reported as a single line.
if __name__ == "__main__":
    try:
        analysis = FraudDetectionAnalysis('creditcard.csv')
        for step_name in (
            'preprocess_data',
            'plot_data_distribution',
            'evaluate_models',
            'plot_learning_curves',
            'plot_roc_curves',
            'generate_summary_report',
        ):
            # Look each stage up just before calling it, as a direct call would.
            getattr(analysis, step_name)()
    except Exception as exc:
        print(f"An error occurred: {exc}")
        

# Output of the cell above:
# An error occurred: [Errno 2] No such file or directory: 'creditcard.csv'
