In [1]:
pip install faker pandas

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [2]:
import pandas as pd
import faker
import random
from datetime import datetime, timedelta

# Initialize Faker
fake = faker.Faker()

# Define the number of rows
num_rows = 500

# Generate synthetic data
data = {
    "Claim_ID": [fake.uuid4() for _ in range(num_rows)],
    "Claim_Date": [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_rows)],
    "Customer_ID": [fake.uuid4() for _ in range(num_rows)],
    "Claim_Amount": [round(random.uniform(100, 10000), 2) for _ in range(num_rows)],
    "Claim_Type": [random.choice(["Medical", "Auto", "Home", "Life"]) for _ in range(num_rows)],
    "Suspicious_Flags": [random.choice([0, 1]) for _ in range(num_rows)],
    "Fraud_Label": [random.choice([0, 1]) for _ in range(num_rows)],
    # Adding 'annual_income' column
    "annual_income": [round(random.uniform(30000, 150000), 2) for _ in range(num_rows)]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("fraudulent_claims_dataset.csv", index=False)

print("Dataset generated and saved as 'fraudulent_claims_dataset.csv'.")

Dataset generated and saved as 'fraudulent_claims_dataset.csv'.


In [3]:
df = pd.read_csv("fraudulent_claims_dataset.csv")
df.head()

Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Claim_Type,Suspicious_Flags,Fraud_Label,annual_income
0,3af0a9e2-a032-4712-a4ef-740951029616,2024-06-18,ce73a713-8bd5-407f-a561-cf6e5b15082f,8991.44,Medical,0,0,81953.57
1,da13a665-a42a-429f-9cc4-24aa071a3f42,2024-09-01,32ba2092-c2dc-4ec6-8505-97317800b4e7,6927.37,Medical,0,1,33662.43
2,43bfb5dc-af19-4106-bb84-bfda15c4ec66,2024-10-02,61f5b3ba-22c6-42e0-900c-e9ae8632b511,8577.67,Life,0,0,93154.72
3,774fadf0-4ed1-41e7-980f-1de2a92a6765,2024-08-08,a4b71b15-5294-4d3f-a9e6-cd1f3c6a7ec1,7013.92,Auto,0,0,39094.46
4,cf620b4c-6580-4c9b-8c76-beadd4bc44e3,2024-12-19,2a61a7a2-4799-4ad6-b54b-3bdfeba96422,6087.93,Life,0,1,101284.26


In [4]:
# Feature: Claim-to-Income Ratio (Assume Annual_Income column exists)
df["Claim_to_Income_Ratio"] = df["Claim_Amount"] / df["annual_income"]
df.head()

Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Claim_Type,Suspicious_Flags,Fraud_Label,annual_income,Claim_to_Income_Ratio
0,3af0a9e2-a032-4712-a4ef-740951029616,2024-06-18,ce73a713-8bd5-407f-a561-cf6e5b15082f,8991.44,Medical,0,0,81953.57,0.109714
1,da13a665-a42a-429f-9cc4-24aa071a3f42,2024-09-01,32ba2092-c2dc-4ec6-8505-97317800b4e7,6927.37,Medical,0,1,33662.43,0.205789
2,43bfb5dc-af19-4106-bb84-bfda15c4ec66,2024-10-02,61f5b3ba-22c6-42e0-900c-e9ae8632b511,8577.67,Life,0,0,93154.72,0.09208
3,774fadf0-4ed1-41e7-980f-1de2a92a6765,2024-08-08,a4b71b15-5294-4d3f-a9e6-cd1f3c6a7ec1,7013.92,Auto,0,0,39094.46,0.17941
4,cf620b4c-6580-4c9b-8c76-beadd4bc44e3,2024-12-19,2a61a7a2-4799-4ad6-b54b-3bdfeba96422,6087.93,Life,0,1,101284.26,0.060107


In [5]:
today = pd.to_datetime(datetime.now().date())

# Convert 'Claim_Date' to datetime objects
df['Claim_Date'] = pd.to_datetime(df['Claim_Date'])

# Calculate the difference between today and 'Claim_Date' in days
df['Days_Since_Issuance'] = (today - df['Claim_Date']).dt.days

# Select numerical features for anomaly detection
features = ["Claim_Amount", "Claim_to_Income_Ratio", "Days_Since_Issuance"]
df_selected = df[features]

In [6]:
# 1️⃣ **Elliptic Envelope** (Assumes Gaussian distribution)
from sklearn.covariance import EllipticEnvelope
elliptic = EllipticEnvelope(contamination=0.05)  # 5% contamination rate
df["Elliptic_Outlier"] = elliptic.fit_predict(df_selected)


In [7]:

# 2️⃣ **Isolation Forest** (Randomly isolates anomalies)
from sklearn.ensemble import IsolationForest
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df["IsoForest_Outlier"] = iso_forest.fit_predict(df_selected)

In [8]:
# 3️⃣ **Local Outlier Factor (LOF)** (Detects local anomalies)
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
df["LOF_Outlier"] = lof.fit_predict(df_selected)

In [9]:
# Convert predictions (-1 = outlier, 1 = normal) to binary (1 = suspicious)
df["Elliptic_Outlier"] = df["Elliptic_Outlier"].apply(lambda x: 1 if x == -1 else 0)
df["IsoForest_Outlier"] = df["IsoForest_Outlier"].apply(lambda x: 1 if x == -1 else 0)
df["LOF_Outlier"] = df["LOF_Outlier"].apply(lambda x: 1 if x == -1 else 0)

In [10]:
# 4️⃣ **Final Suspicious Tag**
df["Anomaly_Flag"] = df[["Elliptic_Outlier", "IsoForest_Outlier", "LOF_Outlier"]].max(axis=1)

In [11]:

# Filter suspicious claims
suspicious_claims = df[df["Anomaly_Flag"] == 1]

# Save suspicious claims
suspicious_claims.to_csv("suspicious_claims_detected.csv", index=False)

print(f"{len(suspicious_claims)} suspicious claims identified and saved.")


52 suspicious claims identified and saved.


In [12]:
# Set Fraud_Label to 1 for suspicious claims and 0 for others
df["Fraud_Label"] = df["Anomaly_Flag"].apply(lambda x: 1 if x == 1 else 0)

# Save updated dataset
df.to_csv("updated_insurance_claims.csv", index=False)

print("Fraud_Label column updated successfully.")

Fraud_Label column updated successfully.


In [13]:
df.head()

Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Claim_Type,Suspicious_Flags,Fraud_Label,annual_income,Claim_to_Income_Ratio,Days_Since_Issuance,Elliptic_Outlier,IsoForest_Outlier,LOF_Outlier,Anomaly_Flag
0,3af0a9e2-a032-4712-a4ef-740951029616,2024-06-18,ce73a713-8bd5-407f-a561-cf6e5b15082f,8991.44,Medical,0,0,81953.57,0.109714,287,0,0,0,0
1,da13a665-a42a-429f-9cc4-24aa071a3f42,2024-09-01,32ba2092-c2dc-4ec6-8505-97317800b4e7,6927.37,Medical,0,1,33662.43,0.205789,212,1,1,0,1
2,43bfb5dc-af19-4106-bb84-bfda15c4ec66,2024-10-02,61f5b3ba-22c6-42e0-900c-e9ae8632b511,8577.67,Life,0,0,93154.72,0.09208,181,0,0,0,0
3,774fadf0-4ed1-41e7-980f-1de2a92a6765,2024-08-08,a4b71b15-5294-4d3f-a9e6-cd1f3c6a7ec1,7013.92,Auto,0,1,39094.46,0.17941,236,1,0,0,1
4,cf620b4c-6580-4c9b-8c76-beadd4bc44e3,2024-12-19,2a61a7a2-4799-4ad6-b54b-3bdfeba96422,6087.93,Life,0,0,101284.26,0.060107,103,0,0,0,0


In [14]:
df["Claim_Type"].unique()

array(['Medical', 'Life', 'Auto', 'Home'], dtype=object)

In [15]:
df = pd.get_dummies(df, columns=["Claim_Type"], prefix="Claim")
df.head()


Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Suspicious_Flags,Fraud_Label,annual_income,Claim_to_Income_Ratio,Days_Since_Issuance,Elliptic_Outlier,IsoForest_Outlier,LOF_Outlier,Anomaly_Flag,Claim_Auto,Claim_Home,Claim_Life,Claim_Medical
0,3af0a9e2-a032-4712-a4ef-740951029616,2024-06-18,ce73a713-8bd5-407f-a561-cf6e5b15082f,8991.44,0,0,81953.57,0.109714,287,0,0,0,0,False,False,False,True
1,da13a665-a42a-429f-9cc4-24aa071a3f42,2024-09-01,32ba2092-c2dc-4ec6-8505-97317800b4e7,6927.37,0,1,33662.43,0.205789,212,1,1,0,1,False,False,False,True
2,43bfb5dc-af19-4106-bb84-bfda15c4ec66,2024-10-02,61f5b3ba-22c6-42e0-900c-e9ae8632b511,8577.67,0,0,93154.72,0.09208,181,0,0,0,0,False,False,True,False
3,774fadf0-4ed1-41e7-980f-1de2a92a6765,2024-08-08,a4b71b15-5294-4d3f-a9e6-cd1f3c6a7ec1,7013.92,0,1,39094.46,0.17941,236,1,0,0,1,True,False,False,False
4,cf620b4c-6580-4c9b-8c76-beadd4bc44e3,2024-12-19,2a61a7a2-4799-4ad6-b54b-3bdfeba96422,6087.93,0,0,101284.26,0.060107,103,0,0,0,0,False,False,True,False


In [16]:
# Add 'Policy_Issuance_Date' column if it doesn't exist
if 'Policy_Issuance_Date' not in df.columns:
    df['Policy_Issuance_Date'] = [fake.date_between(start_date='-5y', end_date='-1y') for _ in range(len(df))] # Generate random issuance dates between 5 years and 1 year ago

# Convert 'Policy_Issuance_Date' to datetime objects
df["Policy_Issuance_Date"] = pd.to_datetime(df["Policy_Issuance_Date"], errors='coerce')
df["Policy_Issuance_Date"] = pd.to_datetime(df["Policy_Issuance_Date"], errors='coerce')


In [17]:
df["Claim_Date"] = pd.to_datetime(df["Claim_Date"], errors='coerce')
print(df.columns)


Index(['Claim_ID', 'Claim_Date', 'Customer_ID', 'Claim_Amount',
       'Suspicious_Flags', 'Fraud_Label', 'annual_income',
       'Claim_to_Income_Ratio', 'Days_Since_Issuance', 'Elliptic_Outlier',
       'IsoForest_Outlier', 'LOF_Outlier', 'Anomaly_Flag', 'Claim_Auto',
       'Claim_Home', 'Claim_Life', 'Claim_Medical', 'Policy_Issuance_Date'],
      dtype='object')


In [18]:
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces

In [19]:
print(df.dtypes)  # Identify problematic columns


Claim_ID                         object
Claim_Date               datetime64[ns]
Customer_ID                      object
Claim_Amount                    float64
Suspicious_Flags                  int64
Fraud_Label                       int64
annual_income                   float64
Claim_to_Income_Ratio           float64
Days_Since_Issuance               int64
Elliptic_Outlier                  int64
IsoForest_Outlier                 int64
LOF_Outlier                       int64
Anomaly_Flag                      int64
Claim_Auto                         bool
Claim_Home                         bool
Claim_Life                         bool
Claim_Medical                      bool
Policy_Issuance_Date     datetime64[ns]
dtype: object


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

# Convert Claim_Date to datetime
df["Claim_Date"] = pd.to_datetime(df["Claim_Date"], errors='coerce')

# Drop datetime columns and non-relevant IDs before training
columns_to_drop = ['Fraud_Label', 'Claim_ID', 'Policy_Issuance_Date', 'Claim_Date', 'Customer_ID']  # Added Customer_ID to the list of columns to drop

# Ensure Policyholder_ID exists before dropping
if 'Policyholder_ID' in df.columns:
    columns_to_drop.append('Policyholder_ID')

X = df.drop(columns=columns_to_drop)

# Target (Fraud Label)
y = df['Fraud_Label']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale only numeric features
# Select only numeric columns for scaling
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols]) # Scale only numeric features
X_test_scaled = scaler.transform(X_test[numeric_cols]) # Scale only numeric features

# 1️⃣ **Train Random Forest**
rf_model = RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42)
rf_model.fit(X_train, y_train)  # Use the unscaled data for Random Forest
rf_probs = rf_model.predict_proba(X_test)[:, 1]  # Get fraud probabilities

# 2️⃣ **Train Neural Network**
nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', max_iter=500, random_state=42)
nn_model.fit(X_train_scaled, y_train)  # Use scaled data for Neural Network
nn_probs = nn_model.predict_proba(X_test_scaled)[:, 1]  # Get fraud probabilities

# 3️⃣ **Ensemble: Compute Final Fraud Score**
df_test = X_test.copy()
df_test["Fraud_Score"] = (rf_probs + nn_probs) / 2  # Average both scores
df_test["Actual_Label"] = y_test

# Evaluate performance with error handling
try:
    auc_score = roc_auc_score(y_test, df_test["Fraud_Score"])
    print(f"AUC-ROC Score (Ensemble): {auc_score:.4f}")
except ValueError:
    print("Error computing ROC-AUC Score. Check data.")

# Display top suspicious claims
df_test.sort_values("Fraud_Score", ascending=False).head(5)

AUC-ROC Score (Ensemble): 1.0000


Unnamed: 0,Claim_Amount,Suspicious_Flags,annual_income,Claim_to_Income_Ratio,Days_Since_Issuance,Elliptic_Outlier,IsoForest_Outlier,LOF_Outlier,Anomaly_Flag,Claim_Auto,Claim_Home,Claim_Life,Claim_Medical,Fraud_Score,Actual_Label
334,9636.47,0,44416.56,0.216957,27,1,1,0,1,False,False,True,False,0.994977,1
9,9088.96,1,45193.81,0.201111,322,1,1,0,1,False,True,False,False,0.989977,1
415,4644.75,1,133232.06,0.034862,33,0,0,1,1,False,False,True,False,0.980672,1
483,6432.25,0,38880.92,0.165435,229,1,0,0,1,False,True,False,False,0.974953,1
154,9933.83,0,145464.44,0.06829,229,0,0,1,1,True,False,False,False,0.973284,1


In [21]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Convert fraud scores into binary predictions (Threshold = 0.5)
df_test["Predicted_Label"] = (df_test["Fraud_Score"] >= 0.5).astype(int)

# Compute Evaluation Metrics
accuracy = accuracy_score(df_test["Actual_Label"], df_test["Predicted_Label"])
f1 = f1_score(df_test["Actual_Label"], df_test["Predicted_Label"])
recall = recall_score(df_test["Actual_Label"], df_test["Predicted_Label"])

# Print Results
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 1.0000
F1 Score: 1.0000
Recall: 1.0000


In [22]:
import pickle

# Save the best model using pickle - Replace 'model' with the desired model
# Options: rf_model, nn_model, or create an ensemble
with open("fraudulent_claims_model.pkl", "wb") as file:
    # Example: Saving the Random Forest model
    pickle.dump(rf_model, file)
    # or
    # pickle.dump(nn_model, file) # To save the Neural Network