<a href="https://colab.research.google.com/github/hphung188/UW_ML_Fraud/blob/main/fraud_pipeline_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Key Features of the Pipeline:

Data Loading & Preprocessing: Reads the CSV file, treats negative sentinel values in selected columns as missing, min-max scales numeric features, and one-hot encodes categorical features.

Feature Selection: Uses variance threshold, chi-square, mutual information, and ExtraTreesClassifier.

Resampling Techniques: Supports NearMiss (undersampling) and SMOTE (oversampling).

Model Training: Includes Decision Trees, Random Forest, XGBoost, and Neural Networks.

Hyperparameter Tuning: Implements GridSearchCV and RandomizedSearchCV.

Model Evaluation: Prints a classification report (precision, recall, F1) for each model and plots ROC curves annotated with AUC.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

warnings.filterwarnings('ignore')

class FraudDetectionPipeline:
    """End-to-end fraud-classification workflow driven from a single CSV file.

    The methods are meant to be called in this order, each one consuming the
    state left behind by the previous step:

    ``load_data`` -> ``preprocess_data`` -> ``feature_selection`` ->
    ``balance_data`` -> ``train_and_evaluate_models``

    Attributes set along the way:
        df: the loaded (and, after preprocessing, cleaned and one-hot encoded) frame.
        preprocessed_data: ``(X, y)`` tuple of model-ready features and target.
        selected_features: column names kept by ``feature_selection``.
        X_train, X_test, y_train, y_test: partitions produced by ``balance_data``.
    """

    # Columns in which this pipeline treats a negative value as "missing".
    MISSING_VALUE_FEATURES = (
        'prev_address_months_count',
        'current_address_months_count',
        'intended_balcon_amount',
        'bank_months_count',
        'session_length_in_minutes',
    )

    def __init__(self, filepath):
        """Remember the CSV location; no I/O happens until ``load_data``."""
        self.filepath = filepath
        self.df = None
        self.preprocessed_data = None
        self.selected_features = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None

    def load_data(self):
        """Loads data from CSV file."""
        self.df = pd.read_csv(self.filepath)
        print("Data Loaded Successfully.")

    @staticmethod
    def _clean_missing_values(df, features):
        """Return a copy of ``df`` with negative sentinels in ``features`` imputed.

        Negative entries are treated as missing and replaced by the median of
        the remaining values in that column. A column with no valid value at
        all is filled with 0.0 so it ends up constant (and is later dropped by
        the variance filter) instead of carrying NaN downstream, where chi2 and
        the resamplers would reject it.
        """
        cleaned = df.copy()
        for feature in features:
            column = cleaned[feature]
            # mask() returns a new (float) column rather than writing NaN into
            # an integer column in place, which pandas has deprecated.
            column = column.mask(column < 0)
            median = column.median()
            cleaned[feature] = column.fillna(0.0 if pd.isna(median) else median)
        return cleaned

    def preprocess_data(self):
        """Handles missing values, scales numerical data, and encodes categorical features.

        Stores the cleaned, one-hot encoded frame back on ``self.df`` and the
        scaled ``(X, y)`` pair on ``self.preprocessed_data``. The target column
        is ``fraud_bool``.
        """
        # Handling missing values (negative sentinel -> median imputation)
        df = self._clean_missing_values(self.df, self.MISSING_VALUE_FEATURES)

        # One-hot encoding categorical features
        categorical_features = [col for col in df.columns if df[col].dtype == 'O']
        df = pd.get_dummies(df, columns=categorical_features)
        self.df = df

        # Splitting features and target variable
        X = df.drop(columns=['fraud_bool'])
        y = df['fraud_bool']

        # Scaling numeric features
        numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
        numeric_lookup = set(numeric_features)
        # ColumnTransformer emits the scaled columns first and then appends the
        # passthrough columns in their original order, so the labels must be
        # rebuilt in exactly that order (a set difference would shuffle them).
        passthrough_features = [col for col in X.columns if col not in numeric_lookup]
        preprocessor = ColumnTransformer([('scaled', MinMaxScaler(), numeric_features)], remainder='passthrough')
        X_scaled = preprocessor.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=numeric_features + passthrough_features, index=X.index)

        self.preprocessed_data = (X_scaled, y)
        print("Data Preprocessing Completed.")

    def feature_selection(self):
        """Applies multiple feature selection techniques and selects the most relevant features.

        Keeps the union of: the top-15 chi-square features, the top-15 mutual
        information features, and every feature whose ExtraTrees importance
        exceeds 0.02. Zero-variance columns are removed first.

        NOTE(review): selection is fitted on the full dataset, before the
        train/test split performed in ``balance_data``.
        """
        from functools import partial

        X_scaled, y = self.preprocessed_data

        # Variance Threshold
        selector = VarianceThreshold()
        X_scaled = X_scaled.loc[:, selector.fit(X_scaled).get_support()]

        # Never ask for more features than survived the variance filter.
        top_k = min(15, X_scaled.shape[1])

        # Chi-Square Test
        chi2_selector = SelectKBest(chi2, k=top_k).fit(X_scaled, y)
        chi2_features = X_scaled.columns[chi2_selector.get_support()]

        # Mutual Information (seeded: the estimator is randomized by default)
        mi_score = partial(mutual_info_classif, random_state=0)
        mi_selector = SelectKBest(mi_score, k=top_k).fit(X_scaled, y)
        mi_features = X_scaled.columns[mi_selector.get_support()]

        # ExtraTreesClassifier Feature Importance
        extra = ExtraTreesClassifier(n_estimators=50, random_state=0).fit(X_scaled, y)
        extra_features = X_scaled.columns[extra.feature_importances_ > 0.02]

        # Final Feature Selection
        chosen = set(chi2_features) | set(mi_features) | set(extra_features)
        # Keep the frame's column order so results are reproducible across runs
        # (iterating a set of strings yields a different order per process).
        self.selected_features = [col for col in X_scaled.columns if col in chosen]
        self.preprocessed_data = (X_scaled[self.selected_features], y)
        print("Feature Selection Completed. Selected Features:", self.selected_features)

    def balance_data(self, method='undersampling'):
        """Balances dataset using NearMiss (undersampling) or SMOTE (oversampling).

        The data is first split 75/25 (stratified) and only the training
        partition is resampled, so the held-out test set keeps the original
        class distribution and contains no synthetic or cherry-picked rows.

        Args:
            method: ``'undersampling'`` selects NearMiss with a 0.1
                minority/majority ratio; any other value selects SMOTE.
        """
        X, y = self.preprocessed_data

        if method == 'undersampling':
            sampler = NearMiss(sampling_strategy=0.1, n_jobs=-1)
        else:
            sampler = SMOTE(random_state=42)

        X_train, self.X_test, y_train, self.y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y)
        self.X_train, self.y_train = sampler.fit_resample(X_train, y_train)
        print(f"Data Balanced using {method}. Training size: {len(self.X_train)}, Test size: {len(self.X_test)}")

    def train_classifier(self, classifier, param_dist, search_type='random'):
        """Trains a classifier using GridSearchCV or RandomizedSearchCV with SMOTE.

        Oversampling happens inside an imblearn pipeline, so it is applied to
        the training folds of each CV split only.

        Args:
            classifier: unfitted estimator placed after the resampler.
            param_dist: search space keyed by ``<step>__<param>`` names.
            search_type: ``'random'`` for RandomizedSearchCV (10 iterations);
                any other value uses GridSearchCV.

        Returns:
            The fitted search object, scored by ROC-AUC over 5 stratified folds.
        """
        # Heuristic: a column with fewer than 10 distinct values is treated as categorical.
        categorical_idx = [i for i, col in enumerate(self.X_train.columns)
                           if self.X_train[col].nunique() < 10]
        if 0 < len(categorical_idx) < self.X_train.shape[1]:
            resampler = SMOTENC(categorical_features=categorical_idx,
                                sampling_strategy='minority', random_state=42)
        else:
            # SMOTENC needs a mix of categorical and numerical columns; fall
            # back to plain SMOTE when the heuristic flags none or all of them.
            resampler = SMOTE(sampling_strategy='minority', random_state=42)

        pipeline = make_pipeline(resampler, classifier)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        if search_type == 'random':
            search = RandomizedSearchCV(pipeline, param_dist, n_iter=10, scoring="roc_auc",
                                        n_jobs=-1, cv=cv, random_state=42)
        else:
            search = GridSearchCV(pipeline, param_dist, scoring="roc_auc", n_jobs=-1, cv=cv)

        search.fit(self.X_train, self.y_train)
        return search

    def evaluate_model(self, model, model_name):
        """Evaluates the trained model using classification metrics.

        Prints a classification report for the test partition and adds the
        model's ROC curve to the current matplotlib figure.

        Returns:
            The area under the ROC curve on the test partition.
        """
        y_pred = model.predict(self.X_test)
        y_prob = model.predict_proba(self.X_test)[:, 1]
        fpr, tpr, _ = roc_curve(self.y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        print(f"\nModel: {model_name}")
        print(classification_report(self.y_test, y_pred))
        plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')
        return roc_auc

    def train_and_evaluate_models(self):
        """Trains and evaluates multiple classifiers.

        Each model is tuned with ``train_classifier`` and scored with
        ``evaluate_model``; all ROC curves are drawn on one figure.

        Returns:
            Dict mapping model name to its fitted search object.
        """
        # Every estimator is seeded so repeated runs are comparable.
        models = {
            'Decision Tree': (DecisionTreeClassifier(random_state=42),
                              {'decisiontreeclassifier__max_depth': [4, 6, 8]}),
            'Random Forest': (RandomForestClassifier(random_state=42),
                              {'randomforestclassifier__n_estimators': [50, 100],
                               'randomforestclassifier__max_depth': [6, 8]}),
            'XGBoost': (XGBClassifier(tree_method='hist', random_state=42),
                        {'xgbclassifier__max_depth': [6, 8],
                         'xgbclassifier__learning_rate': [0.05, 0.1]}),
            'Neural Network': (MLPClassifier(random_state=42),
                               {'mlpclassifier__hidden_layer_sizes': [(50,), (100,)]}),
        }

        fitted = {}
        plt.figure(figsize=(8, 8))
        for model_name, (clf, params) in models.items():
            trained_model = self.train_classifier(clf, params)
            self.evaluate_model(trained_model, model_name)
            fitted[model_name] = trained_model
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves')
        plt.legend()
        plt.show()
        return fitted

# Run the pipeline end to end. The guard keeps a plain import of this module
# free of side effects; in a notebook cell or a script run __name__ is
# "__main__", so `pipeline` is still created as a module-level name.
if __name__ == "__main__":
    pipeline = FraudDetectionPipeline("fraud_data.csv")
    pipeline.load_data()
    pipeline.preprocess_data()
    pipeline.feature_selection()
    pipeline.balance_data(method='undersampling')
    pipeline.train_and_evaluate_models()
