In [24]:
# %% [markdown]
# # 02 - Healthcare Fraud Detection: Modeling
# 
# ## Project Overview
# This notebook covers:
# 1. Data preparation and splitting
# 2. Handling class imbalance
# 3. Model training (multiple algorithms as per project requirements)
# 4. Hyperparameter tuning
# 5. Model comparison
# 
# **Team:** [Your Team Name]
# **Date:** [Current Date]

# %%
# Cell 1: Import Libraries with debugger fix
import os
# Suppress debugger warnings
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           roc_auc_score, confusion_matrix, classification_report,
                           precision_recall_curve, roc_curve, average_precision_score)

# Imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# Models (All required models from project description)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Set random seed
np.random.seed(42)

In [5]:
# Cell 2: Load Processed Data
provider_features = pd.read_csv('../data/processed/provider_features_final.csv')

print("Dataset loaded successfully!")
print(f"Shape: {provider_features.shape}")
print(f"Fraud rate: {provider_features['PotentialFraud'].mean():.2%}")
print(f"\nFirst few rows:")
display(provider_features.head())


Dataset loaded successfully!
Shape: (5410, 40)
Fraud rate: 9.35%

First few rows:


Unnamed: 0,Provider,Claims_InscClaimAmtReimbursed_count,Claims_InscClaimAmtReimbursed_sum,Claims_InscClaimAmtReimbursed_mean,Claims_InscClaimAmtReimbursed_std,Claims_InscClaimAmtReimbursed_max,Claims_InscClaimAmtReimbursed_min,Claims_DeductibleAmtPaid_sum,Claims_DeductibleAmtPaid_mean,Claims_DeductibleAmtPaid_std,...,Beneficiary_Gender_<lambda>,Beneficiary_Race_nunique,UniquePhysicians_Count,UniqueBeneficiaries_Count,TotalClaims_Count,ClaimTimeSpan_Days,PotentialFraud,Avg_Reimbursement_per_Beneficiary,Claims_per_Beneficiary,Reimbursement_per_Claim
0,PRV51001,25,104640,4185.6,10796.091144,42000,10,5340.0,213.6,436.009174,...,0.375,2,14,24,25,358,0,4360.0,1.041667,4185.6
1,PRV51003,132,605670,4588.409091,7309.794729,57000,0,66286.0,502.166667,534.582439,...,0.418803,3,44,117,132,356,1,5176.666667,1.128205,4588.409091
2,PRV51004,149,52170,350.134228,689.963754,3300,0,310.0,2.080537,11.166744,...,0.326087,3,38,138,149,358,0,378.043478,1.07971,350.134228
3,PRV51005,1165,280910,241.124464,491.556392,4080,0,3700.0,3.175966,17.026584,...,0.420202,3,6,495,1165,376,1,567.494949,2.353535,241.124464
4,PRV51007,72,33710,468.194444,1433.769116,10000,0,3264.0,45.333333,214.820724,...,0.465517,2,10,58,72,356,0,581.206897,1.241379,468.194444


In [6]:
# Cell 3: Data Preparation Function
def prepare_data(df, test_size=0.2, val_size=0.1):
    """
    Prepare data for modeling with train/validation/test splits
    """
    # Separate features and target
    X = df.drop(['Provider', 'PotentialFraud'], axis=1)
    y = df['PotentialFraud']
    
    # Handle any remaining non-numeric columns
    for col in X.select_dtypes(include=['object']).columns:
        X = X.drop(columns=[col])
    
    # First split: train+val vs test
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    
    # Second split: train vs validation
    val_ratio = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_ratio, random_state=42, stratify=y_temp
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Convert back to DataFrames
    X_train = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
    X_val = pd.DataFrame(X_val_scaled, columns=X.columns, index=X_val.index)
    X_test = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)
    
    print("Data splits created:")
    print(f"  Training set: {X_train.shape}, Fraud rate: {y_train.mean():.2%}")
    print(f"  Validation set: {X_val.shape}, Fraud rate: {y_val.mean():.2%}")
    print(f"  Test set: {X_test.shape}, Fraud rate: {y_test.mean():.2%}")
    print(f"  Number of features: {X_train.shape[1]}")
    
    return X_train, X_val, X_test, y_train, y_val, y_test, scaler


In [7]:
# Cell 4: Create Data Splits
X_train, X_val, X_test, y_train, y_val, y_test, scaler = prepare_data(provider_features)


Data splits created:
  Training set: (3787, 38), Fraud rate: 9.35%
  Validation set: (541, 38), Fraud rate: 9.43%
  Test set: (1082, 38), Fraud rate: 9.33%
  Number of features: 38


In [8]:
# Cell 5: Handle Class Imbalance
print("=== Class Imbalance Handling ===")
print(f"Original training fraud rate: {y_train.mean():.2%}")

# Strategy 1: SMOTE
smote = SMOTE(random_state=42, sampling_strategy=0.3)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(f"After SMOTE (30% fraud): {X_train_smote.shape}, Fraud rate: {y_train_smote.mean():.2%}")

# Strategy 2: Class weighting (for comparison)
print(f"\nClass distribution for weighting:")
print(f"  Class 0 (Non-Fraud): {sum(y_train == 0)}")
print(f"  Class 1 (Fraud): {sum(y_train == 1)}")

# Strategy 3: Random Under Sampling (for comparison)
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.3)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print(f"After Random Under Sampling: {X_train_rus.shape}, Fraud rate: {y_train_rus.mean():.2%}")

=== Class Imbalance Handling ===
Original training fraud rate: 9.35%
After SMOTE (30% fraud): (4462, 38), Fraud rate: 23.06%

Class distribution for weighting:
  Class 0 (Non-Fraud): 3433
  Class 1 (Fraud): 354
After Random Under Sampling: (1534, 38), Fraud rate: 23.08%


In [18]:
# Cell 6: Train Multiple Models (All required models)
def train_models(X_train, y_train, X_val, y_val, use_class_weight=True):
    """
    Train multiple classification models as per project requirements
    """
    # Calculate class weights if needed
    class_weight = None
    if use_class_weight:
        class_weight = 'balanced'
    
    # Define all models from project requirements plus some extras
    models = {
        # Required by project: Logistic Regression
        'Logistic Regression': LogisticRegression(
            class_weight=class_weight,
            random_state=42,
            max_iter=1000
        ),
        
        # Required by project: Decision Tree
        'Decision Tree': DecisionTreeClassifier(
            class_weight=class_weight,
            random_state=42,
            max_depth=10
        ),
        
        # Required by project: Random Forest
        'Random Forest': RandomForestClassifier(
            class_weight=class_weight,
            random_state=42,
            n_estimators=100,
            max_depth=10,
            n_jobs=1
        ),
        
        # Required by project: Gradient Boosting
        'Gradient Boosting': GradientBoostingClassifier(
            random_state=42,
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1
        ),
        
        # Required by project: SVM
        'SVM (Linear)': SVC(
            class_weight=class_weight,
            random_state=42,
            probability=True,
            kernel='linear'
        ),
        
        # SVM with RBF kernel (for comparison)
        'SVM (RBF)': SVC(
            class_weight=class_weight,
            random_state=42,
            probability=True,
            kernel='rbf'
        ),
        
        # Additional models for comparison
        'Naive Bayes': GaussianNB(),
        
        'K-Nearest Neighbors': KNeighborsClassifier(
            n_neighbors=5,
            n_jobs=-1
        ),
        
        'AdaBoost': GradientBoostingClassifier(  # Using Gradient Boosting as AdaBoost alternative
            random_state=42,
            n_estimators=50,
            max_depth=3,
            learning_rate=0.1
        )
    }
    
    results = {}
    
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        
        try:
            # Train model
            model.fit(X_train, y_train)
            
            # Predict on validation set
            y_pred = model.predict(X_val)
            y_pred_proba = model.predict_proba(X_val)[:, 1]
            
            # Calculate metrics
            metrics = {
                'Accuracy': accuracy_score(y_val, y_pred),
                'Precision': precision_score(y_val, y_pred, zero_division=0),
                'Recall': recall_score(y_val, y_pred),
                'F1-Score': f1_score(y_val, y_pred),
                'ROC-AUC': roc_auc_score(y_val, y_pred_proba),
                'PR-AUC': average_precision_score(y_val, y_pred_proba)
            }
            
            results[model_name] = {
                'model': model,
                'metrics': metrics,
                'predictions': y_pred,
                'probabilities': y_pred_proba
            }
            
            # Print metrics
            print(f"  F1-Score: {metrics['F1-Score']:.4f}")
            print(f"  Recall: {metrics['Recall']:.4f}")
            print(f"  Precision: {metrics['Precision']:.4f}")
            print(f"  ROC-AUC: {metrics['ROC-AUC']:.4f}")
            
        except Exception as e:
            print(f"  Error training {model_name}: {str(e)}")
            results[model_name] = None
    
    return results


In [19]:
# Cell 7: Train Models with Different Imbalance Strategies
print("=== Training Models with SMOTE ===")
results_smote = train_models(X_train_smote, y_train_smote, X_val, y_val, use_class_weight=False)

print("\n=== Training Models with Class Weighting ===")
results_weighted = train_models(X_train, y_train, X_val, y_val, use_class_weight=True)

print("\n=== Training Models with Random Under Sampling ===")
results_rus = train_models(X_train_rus, y_train_rus, X_val, y_val, use_class_weight=False)


=== Training Models with SMOTE ===

Training Logistic Regression...
  F1-Score: 0.6481
  Recall: 0.6863
  Precision: 0.6140
  ROC-AUC: 0.9405

Training Decision Tree...
  F1-Score: 0.6545
  Recall: 0.7059
  Precision: 0.6102
  ROC-AUC: 0.7893

Training Random Forest...
  F1-Score: 0.7156
  Recall: 0.7647
  Precision: 0.6724
  ROC-AUC: 0.9524

Training Gradient Boosting...
  F1-Score: 0.6465
  Recall: 0.6275
  Precision: 0.6667
  ROC-AUC: 0.9409

Training SVM (Linear)...
  F1-Score: 0.6316
  Recall: 0.7059
  Precision: 0.5714
  ROC-AUC: 0.9368

Training SVM (RBF)...
  F1-Score: 0.6286
  Recall: 0.6471
  Precision: 0.6111
  ROC-AUC: 0.9238

Training Naive Bayes...
  F1-Score: 0.5857
  Recall: 0.8039
  Precision: 0.4607
  ROC-AUC: 0.9093

Training K-Nearest Neighbors...
  F1-Score: 0.5873
  Recall: 0.7255
  Precision: 0.4933
  ROC-AUC: 0.9069

Training AdaBoost...
  F1-Score: 0.6909
  Recall: 0.7451
  Precision: 0.6441
  ROC-AUC: 0.9508

=== Training Models with Class Weighting ===

Train