In [23]:
import json
import os
import sys
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)

import joblib

In [2]:
# Treat the parent of the current working directory as the project root.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so that re-running this cell does not keep adding the
# same entry to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
# Provider-level training features live under <project root>/data/processed.
data_path = os.path.join(PROJECT_ROOT, "data", "processed", "provider_features_train.csv")

# Read the CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(data_path)
df.head(5)

Unnamed: 0,total_claims,total_reimbursed,avg_reimbursed,avg_duration_gap,pct_claimed_gt_admitted,avg_cost_per_day,age_avg,pct_chronic,PotentialFraud
0,2968.0,1386100,467.014825,2452.0,1,2686.24031,43.464791,0.113881,Yes
1,988.0,508210,514.382591,837.0,1,3365.629139,52.763158,0.135628,No
2,2287.0,502010,219.505903,2287.0,1,,58.911544,0.150415,Yes
3,193.0,74320,385.07772,158.0,1,2123.428571,29.807772,0.062176,No
4,335.0,60910,181.820896,335.0,1,,54.431045,0.146269,No


In [4]:
# (number of rows, number of columns) of the loaded feature table.
df.shape

(5410, 9)

In [5]:
# Count missing values in every column.
df.isna().sum()

total_claims                  0
total_reimbursed              0
avg_reimbursed                0
avg_duration_gap              0
pct_claimed_gt_admitted       0
avg_cost_per_day           3323
age_avg                       0
pct_chronic                   0
PotentialFraud                0
dtype: int64

### avg_cost_per_day = total_reimbursed / PRV_Admit_Duration
So avg_cost_per_day becomes NaN when:

1. PRV_Admit_Duration == 0

2. OR provider has no inpatient admissions

👉 In other words:

Providers with only outpatient claims have no admitted days, so “cost per day” is undefined.

### Adding an indicator feature
This tells the model:

0 → OP-only provider

1 → has IP claims

In [7]:
# Flag rows by whether avg_cost_per_day is present: 1 if it has a value, 0 if it is missing.
df["has_inpatient"] = df["avg_cost_per_day"].notna().astype(int)

In [8]:
# Preview the first rows again to check the newly added has_inpatient column.
df.head()

Unnamed: 0,total_claims,total_reimbursed,avg_reimbursed,avg_duration_gap,pct_claimed_gt_admitted,avg_cost_per_day,age_avg,pct_chronic,PotentialFraud,has_inpatient
0,2968.0,1386100,467.014825,2452.0,1,2686.24031,43.464791,0.113881,Yes,1
1,988.0,508210,514.382591,837.0,1,3365.629139,52.763158,0.135628,No,1
2,2287.0,502010,219.505903,2287.0,1,,58.911544,0.150415,Yes,0
3,193.0,74320,385.07772,158.0,1,2123.428571,29.807772,0.062176,No,1
4,335.0,60910,181.820896,335.0,1,,54.431045,0.146269,No,0


In [9]:
TARGET_COL = "PotentialFraud"
# Explicit encoding of the string labels into the binary target.
LABEL_MAP = {"Yes": 1, "No": 0}

# Features are every column except the target.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].map(LABEL_MAP)

# Series.map turns any label missing from LABEL_MAP into NaN without warning;
# fail loudly here instead of letting NaN targets reach the split/fit steps.
if y.isna().any():
    unexpected_labels = sorted(
        df.loc[y.isna(), TARGET_COL].astype(str).unique().tolist()
    )
    raise ValueError(
        f"Unexpected values in {TARGET_COL!r}: {unexpected_labels}; "
        f"expected only {sorted(LABEL_MAP)}"
    )

In [10]:
# Feature columns that remain once the target column has been dropped.
X.columns

Index(['total_claims', 'total_reimbursed', 'avg_reimbursed',
       'avg_duration_gap', 'pct_claimed_gt_admitted', 'avg_cost_per_day',
       'age_avg', 'pct_chronic', 'has_inpatient'],
      dtype='object')

In [11]:
# Share of each class in the encoded target (proportions, not raw counts).
y.value_counts(normalize=True)

PotentialFraud
0    0.90647
1    0.09353
Name: proportion, dtype: float64

In [12]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [13]:
# Every feature column goes through the same numeric treatment.
numeric_features = list(X.columns)

# Fill missing values with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]

preprocessor = ColumnTransformer(
    transformers=[("num", Pipeline(steps=numeric_steps), numeric_features)]
)

In [14]:
# Logistic regression with class weights balanced to offset the skewed target.
logreg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)

# Chain preprocessing and the classifier so both are fitted and applied together.
model_pipeline = Pipeline(
    steps=[("preprocessing", preprocessor), ("model", logreg)]
)

In [15]:
# Fit the preprocessing + logistic-regression pipeline on the training split.
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [16]:
# Probability of the positive (fraud = 1) class for each validation row.
y_proba = model_pipeline.predict_proba(X_val)[:, 1]
# Hard class labels as produced by the pipeline's own predict().
y_pred = model_pipeline.predict(X_val)

In [17]:
# Ranking quality from the probabilities, then per-class metrics from the labels.
val_auc = roc_auc_score(y_val, y_proba)
val_report = classification_report(y_val, y_pred)

print("ROC-AUC:", val_auc)
print("\nClassification Report:\n")
print(val_report)

ROC-AUC: 0.9315206750032802

Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.87      0.92       981
           1       0.41      0.85      0.55       101

    accuracy                           0.87      1082
   macro avg       0.70      0.86      0.74      1082
weighted avg       0.93      0.87      0.89      1082



In [18]:
# Confusion matrix on the validation split (scikit-learn convention:
# rows are true labels, columns are predicted labels).
confusion_matrix(y_val, y_pred)

array([[857, 124],
       [ 15,  86]])

In [19]:
# Try a lower cut-off than 0.5 to see how the fraud-class metrics move.
custom_threshold = 0.4
meets_threshold = y_proba >= custom_threshold
y_pred_custom = meets_threshold.astype(int)

print(classification_report(y_val, y_pred_custom))

              precision    recall  f1-score   support

           0       0.99      0.81      0.89       981
           1       0.32      0.88      0.47       101

    accuracy                           0.82      1082
   macro avg       0.65      0.85      0.68      1082
weighted avg       0.92      0.82      0.85      1082



In [20]:
# Sweep a few probability cut-offs and print the full report for each one.
for t in (0.3, 0.4, 0.5, 0.6):
    labels_at_t = (y_proba >= t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_val, labels_at_t))



Threshold: 0.3
              precision    recall  f1-score   support

           0       0.99      0.70      0.82       981
           1       0.25      0.94      0.39       101

    accuracy                           0.73      1082
   macro avg       0.62      0.82      0.61      1082
weighted avg       0.92      0.73      0.78      1082


Threshold: 0.4
              precision    recall  f1-score   support

           0       0.99      0.81      0.89       981
           1       0.32      0.88      0.47       101

    accuracy                           0.82      1082
   macro avg       0.65      0.85      0.68      1082
weighted avg       0.92      0.82      0.85      1082


Threshold: 0.5
              precision    recall  f1-score   support

           0       0.98      0.87      0.92       981
           1       0.41      0.85      0.55       101

    accuracy                           0.87      1082
   macro avg       0.70      0.86      0.74      1082
weighted avg       0.93   

In [None]:
# NOTE(review): this cell has no execution count and repeats the save step
# performed in the later "Now saving the model" cell (same directory, same
# file name, same pipeline object). Consider deleting it so the model is only
# persisted after the final model decision.
models_dir = os.path.join(PROJECT_ROOT, "models")
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, "fraud_model.joblib")

joblib.dump(model_pipeline, model_path)

For a resume + realistic healthcare fraud system:

👉 Threshold = 0.5 is the best choice.

Why?

- Recall is still high (85%)

- Precision is acceptable

- System is defensible

- Looks mature and balanced

## Training a Random Forest classifier to see whether a more complex model improves performance

In [24]:
# Give the Random Forest its own unfitted copy of the preprocessing step.
# Passing the same `preprocessor` instance to both pipelines would make them
# share one transformer object, so fitting this pipeline would re-fit the
# transformer held by `model_pipeline` (the pipeline that is later persisted).
rf_pipeline = Pipeline(
    steps=[
        ("preprocessing", clone(preprocessor)),
        ("model", RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            min_samples_split=5,
            min_samples_leaf=2,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1
        ))
    ]
)

In [25]:
# Fit the preprocessing + random-forest pipeline on the same training split.
rf_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
# Positive-class probabilities and hard labels from the random forest.
rf_proba = rf_pipeline.predict_proba(X_val)[:, 1]
rf_pred = rf_pipeline.predict(X_val)

# Same validation metrics as for the logistic-regression pipeline.
rf_auc = roc_auc_score(y_val, rf_proba)
print("ROC-AUC:", rf_auc)
print("\nClassification Report:\n")
print(classification_report(y_val, rf_pred))

ROC-AUC: 0.934871468798256

Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       981
           1       0.61      0.57      0.59       101

    accuracy                           0.93      1082
   macro avg       0.78      0.77      0.78      1082
weighted avg       0.92      0.93      0.93      1082



In [27]:
# Repeat the cut-off sweep using the random-forest probabilities.
for t in (0.3, 0.4, 0.5, 0.6):
    rf_labels_at_t = (rf_proba >= t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_val, rf_labels_at_t))


Threshold: 0.3
              precision    recall  f1-score   support

           0       0.97      0.92      0.95       981
           1       0.49      0.74      0.59       101

    accuracy                           0.90      1082
   macro avg       0.73      0.83      0.77      1082
weighted avg       0.93      0.90      0.91      1082


Threshold: 0.4
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       981
           1       0.54      0.67      0.60       101

    accuracy                           0.92      1082
   macro avg       0.75      0.81      0.78      1082
weighted avg       0.93      0.92      0.92      1082


Threshold: 0.5
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       981
           1       0.61      0.57      0.59       101

    accuracy                           0.93      1082
   macro avg       0.78      0.77      0.78      1082
weighted avg       0.92   

## Final decision:

- Final model: Logistic Regression

- Final threshold: 0.5

- Reason: Best recall–precision trade-off for fraud detection

### Now saving the model

In [28]:
# Resolve <project root>/models/fraud_model.joblib.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
models_dir = os.path.join(PROJECT_ROOT, "models")
model_path = os.path.join(models_dir, "fraud_model.joblib")

# Create the target folder if needed, then persist the fitted
# logistic-regression pipeline (preprocessing + model) as one artifact.
os.makedirs(models_dir, exist_ok=True)
joblib.dump(model_pipeline, model_path)


['c:\\Users\\Koushik\\Desktop\\All Files\\Master Project\\healthcare-fraud-detection\\models\\fraud_model.joblib']

### Saving the threshold config

In [29]:
# Probability cut-off chosen for the final logistic-regression model.
DECISION_THRESHOLD = 0.5

# Derive the stored metrics from the validation predictions instead of typing
# them by hand, so the saved config cannot drift from the evaluated model.
final_pred = (y_proba >= DECISION_THRESHOLD).astype(int)
final_report = classification_report(y_val, final_pred, output_dict=True)
fraud_scores = final_report["1"]  # metrics for the positive (fraud) class

model_config = {
    "model_name": "logistic_regression",
    "decision_threshold": DECISION_THRESHOLD,
    "business_objective": "maximize fraud recall with acceptable precision",
    "metrics": {
        # Plain floats rounded to two decimals keep the JSON compact.
        "roc_auc": round(float(roc_auc_score(y_val, y_proba)), 2),
        "fraud_recall": round(float(fraud_scores["recall"]), 2),
        "fraud_precision": round(float(fraud_scores["precision"]), 2)
    }
}

In [30]:
# Store the config next to the saved model (json is imported in the top import cell).
config_path = os.path.join(models_dir, "model_config.json")

# Explicit encoding so the file is written the same way on every platform.
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(model_config, f, indent=4)

### ✅ Model Training Summary

- Trained a provider-level fraud detection model using Logistic Regression with imbalance-aware class weighting.

- Evaluated multiple decision thresholds and selected 0.5 to balance high fraud recall (~85%) with manageable false positives.

- Benchmarked Random Forest, but Logistic Regression was retained due to superior fraud recall and interpretability.

- Saved the full preprocessing + model pipeline and decision threshold for deployment via FastAPI.