Health care fraud detection

In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load training datasets
train = pd.read_csv("Train-1542865627584.csv")
train_beneficiary = pd.read_csv("Train_Beneficiarydata-1542865627584.csv")
train_inpatient = pd.read_csv("Train_Inpatientdata-1542865627584.csv")
train_outpatient = pd.read_csv("Train_Outpatientdata-1542865627584.csv")

# Load test datasets
test = pd.read_csv("Unseen-1542969243754.csv")
test_beneficiary = pd.read_csv("Unseen_Beneficiarydata-1542969243754.csv")
test_inpatient = pd.read_csv("Unseen_Inpatientdata-1542969243754.csv")
test_outpatient = pd.read_csv("Unseen_Outpatientdata-1542969243754.csv")


In [86]:
# Inpatient Aggregation
inpatient_agg = train_inpatient.groupby("Provider").agg({
    "ClaimID": "count",
    "InscClaimAmtReimbursed": "sum"
}).reset_index()
inpatient_agg.columns = ["Provider", "IP_Claim_Count", "IP_Reimbursed_Sum"]

# Outpatient Aggregation
outpatient_agg = train_outpatient.groupby("Provider").agg({
    "ClaimID": "count",
    "InscClaimAmtReimbursed": "sum"
}).reset_index()
outpatient_agg.columns = ["Provider", "OP_Claim_Count", "OP_Reimbursed_Sum"]


In [87]:
# Calculate Age from DOB
train_beneficiary["DOB"] = pd.to_datetime(train_beneficiary["DOB"])
train_beneficiary["Age"] = 2016 - train_beneficiary["DOB"].dt.year

# Total Chronic Conditions
chronic_cols = [col for col in train_beneficiary.columns if "ChronicCond" in col]
train_beneficiary["ChronicCond_Count"] = train_beneficiary[chronic_cols].replace(-1, 0).sum(axis=1)

# Beneficiary level aggregation (mean per Provider)
beneficiary_agg = train_beneficiary.groupby("BeneID").agg({
    "Age": "mean",
    "ChronicCond_Count": "mean"
}).reset_index()

# Merge to get Provider level data
bene_provider = pd.concat([train_inpatient[["BeneID", "Provider"]],
                           train_outpatient[["BeneID", "Provider"]]]).drop_duplicates()
bene_with_provider = pd.merge(bene_provider, beneficiary_agg, on="BeneID")
provider_bene_agg = bene_with_provider.groupby("Provider").agg({
    "Age": "mean",
    "ChronicCond_Count": "mean"
}).reset_index()


In [88]:
# Merge all features
features = train[["Provider", "PotentialFraud"]]
features = features.merge(inpatient_agg, on="Provider", how="left")
features = features.merge(outpatient_agg, on="Provider", how="left")
features = features.merge(provider_bene_agg, on="Provider", how="left")

# Fill missing values
features.fillna(0, inplace=True)


In [89]:
# Prepare inputs
X = features.drop(columns=["Provider", "PotentialFraud"])
y = features["PotentialFraud"].map({"Yes": 1, "No": 0})

# Train/Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))


[[958  19]
 [ 65  40]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       977
           1       0.68      0.38      0.49       105

    accuracy                           0.92      1082
   macro avg       0.81      0.68      0.72      1082
weighted avg       0.91      0.92      0.91      1082



In [90]:
# Age Calculation
test_beneficiary["DOB"] = pd.to_datetime(test_beneficiary["DOB"], errors='coerce')
test_beneficiary["Age"] = 2016 - test_beneficiary["DOB"].dt.year

# Count Chronic Conditions
chronic_cols_test = [col for col in test_beneficiary.columns if "ChronicCond" in col]
test_beneficiary["ChronicCond_Count"] = test_beneficiary[chronic_cols_test].replace(-1, 0).sum(axis=1)

# Beneficiary-level aggregation
beneficiary_agg_test = test_beneficiary.groupby("BeneID").agg({
    "Age": "mean",
    "ChronicCond_Count": "mean"
}).reset_index()

# Link BeneID to Provider
bene_provider_test = pd.concat([test_inpatient[["BeneID", "Provider"]],
                                test_outpatient[["BeneID", "Provider"]]]).drop_duplicates()
bene_with_provider_test = pd.merge(bene_provider_test, beneficiary_agg_test, on="BeneID")
provider_bene_agg_test = bene_with_provider_test.groupby("Provider").agg({
    "Age": "mean",
    "ChronicCond_Count": "mean"
}).reset_index()


In [91]:
# Inpatient Test Aggregation
inpatient_agg_test = test_inpatient.groupby("Provider").agg({
    "ClaimID": "count",
    "InscClaimAmtReimbursed": "sum"
}).reset_index()
inpatient_agg_test.columns = ["Provider", "IP_Claim_Count", "IP_Reimbursed_Sum"]

# Outpatient Test Aggregation
outpatient_agg_test = test_outpatient.groupby("Provider").agg({
    "ClaimID": "count",
    "InscClaimAmtReimbursed": "sum"
}).reset_index()
outpatient_agg_test.columns = ["Provider", "OP_Claim_Count", "OP_Reimbursed_Sum"]


In [92]:
# Merge all features
test_features = test[["Provider"]].drop_duplicates()
test_features = test_features.merge(inpatient_agg_test, on="Provider", how="left")
test_features = test_features.merge(outpatient_agg_test, on="Provider", how="left")
test_features = test_features.merge(provider_bene_agg_test, on="Provider", how="left")

# Fill missing values
test_features.fillna(0, inplace=True)

# Final test input (exclude Provider column for model input)
X_test = test_features.drop(columns=["Provider"])


In [93]:
# Predict on unseen data
test_predictions = model.predict(X_test)
test_features["PotentialFraud_Prediction"] = np.where(test_predictions == 1, "Yes", "No")

# View predictions
print(test_features[["Provider", "PotentialFraud_Prediction"]].head())


   Provider PotentialFraud_Prediction
0  PRV51002                        No
1  PRV51006                        No
2  PRV51009                        No
3  PRV51010                        No
4  PRV51018                        No
