In [3]:
import pandas as pd
import numpy as np
import random

# Set seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 10000

# Generate Loan IDs
loan_ids = np.arange(1, n_samples + 1)

# Disbursement Date (Random Dates in the past 5 years)
disbursement_dates = pd.to_datetime(np.random.randint(0, 1461, n_samples), unit='D', origin='2021-01-01')

# Principal Amount (between 5,000 and 500,000)
principal_amounts = np.random.uniform(5000, 500000, n_samples)

# Interest Rate (Between 5% and 30%)
interest_rates = np.random.uniform(5, 30, n_samples)

# Loan Tenure (Randomly selecting between 30 and 1095 days)
loan_tenures = np.random.randint(30, 1095, n_samples)

# Borrower Age (Between 21 and 65)
borrower_ages = np.random.randint(21, 65, n_samples)

# Employment Status (Realistic employment categories)
employment_statuses = np.random.choice(["Employed", "Self-Employed", "Unemployed", "Retired"], n_samples, p=[0.6, 0.25, 0.1, 0.05])

# Income Range (Influenced by employment status)
income_ranges = np.where(
    employment_statuses == "Employed", np.random.uniform(50000, 300000, n_samples),
    np.where(employment_statuses == "Self-Employed", np.random.uniform(30000, 200000, n_samples),
             np.random.uniform(10000, 70000, n_samples))
)

# State/Location (Random selection of states)
states = np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples)

# Credit Score (Between 300 and 850, influenced by income and missed payments)
credit_scores = np.clip(
    np.random.normal(650, 100, n_samples) - (5 * np.random.randint(0, 6, n_samples)), 300, 850
)

# Days Past Due (DPD) (Higher for lower credit scores)
dpd = np.round(np.maximum(0, (900 - credit_scores) / 3 + np.random.normal(0, 10, n_samples)))

# Number of Loans per borrower (Between 1 and 5)
number_of_loans = np.random.randint(1, 6, n_samples)

# Number of Missed Payments (Based on DPD)
missed_payments = np.random.randint(0, 12, n_samples)

# Previous Collection Attempts (Higher if DPD is high)
previous_collection_attempts = np.random.randint(0, 5, n_samples) * (dpd > 60).astype(int)

# Current Collection Stage (Categorized)
collection_stages = np.where(
    dpd == 0, "No Collection",
    np.where(dpd < 30, "Early Collection",
             np.where(dpd < 90, "Mid-Stage Collection", "Legal Action"))
)

# Loan Type (Random categories)
loan_types = np.random.choice(['New', 'Returning'], n_samples)

# Bucket (60+ Days in Arrears) (Flagged if DPD > 60)
bucket_60_plus = (dpd > 60).astype(int)

# First Payment Default (FPD) (20% chance for defaulters)
fpd = np.random.choice([0, 1], n_samples, p=[0.8, 0.2]) * (dpd > 0).astype(int)

# --- NEW: More Realistic Loan Repayment Calculation ---

# Monthly interest rate
interest_rate_monthly = interest_rates / 100 / 12  

# Expected Total Repayment (Principal + Interest over tenure)
expected_total_repayment = principal_amounts * (1 + interest_rate_monthly * loan_tenures / 30)

# Total Repaid (Depends on missed payments and credit score)
total_repaid = expected_total_repayment * (
    0.7 + 0.3 * np.exp(-missed_payments / 4)
) * (credit_scores / 850)

# Ensure borrowers can't pay more than expected total repayment
total_repaid = np.minimum(total_repaid, expected_total_repayment)

# Outstanding Balance (What remains unpaid)
outstanding_balance = expected_total_repayment - total_repaid

# Apply penalty for overdue balances (10% increase if overdue for 30+ days)
overdue_mask = dpd > 30
outstanding_balance[overdue_mask] *= 1.1  

# Ensure no negative balances
outstanding_balance = np.maximum(outstanding_balance, 0)

# Collections Status (Based on Collection Stage)
collections_status = np.where(dpd == 0, "Paid", np.where(dpd < 90, "In Collections", "Written-Off"))

# Restructured Loan (Flag if repayment modified)
restructured_loan = np.random.choice([0, 1], n_samples, p=[0.85, 0.15])

# Reasons for Default (Only for those with DPD > 60)
default_reasons = np.where(
    dpd > 60,
    np.random.choice(["Loss of Job", "Medical Emergency", "Business Failure", "Overspending", "Family Issues"], n_samples),
    "Unknown"
)


# Create DataFrame
df = pd.DataFrame({
    "Loan ID": loan_ids,
    "Disbursement Date": disbursement_dates,
    "Principal Amount": principal_amounts,
    "Interest Rate": interest_rates,
    "Loan Tenure (days/weeks/months)": loan_tenures,
    "Total Repaid": total_repaid,
    "Outstanding Balance": outstanding_balance,
    "Number of Missed Payments": missed_payments,
    "Borrower Age": borrower_ages,
    "Employment Status": employment_statuses,
    "Income Range": income_ranges,
    "State/Location": states,
    "Days Past Due (DPD)": dpd,
    "Previous Collection Attempts": previous_collection_attempts,
    "Current Collection Stage": collection_stages,
    "Loan Type": loan_types,
    "Bucket (60+ Days in Arrears)": bucket_60_plus,
    "First Payment Default (FPD)": fpd,
    "Number of Loans": number_of_loans,
    "Credit Score": credit_scores,
    "Collections Status": collections_status,
    "Restructured Loan": restructured_loan,
    "Reasons for Default": default_reasons
})

# Display sample rows
print(df.head())

# Save to CSV
df.to_csv("synthetic_loan_data.csv", index=False)


   Loan ID Disbursement Date  Principal Amount  Interest Rate  \
0        1        2024-02-01     305297.457239      29.447789   
1        2        2024-12-30     356017.959786      17.798398   
2        3        2023-05-11      15423.602860      20.736963   
3        4        2024-07-18      23030.594434      21.769930   
4        5        2024-02-05     451758.346847      14.060306   

   Loan Tenure (days/weeks/months)   Total Repaid  Outstanding Balance  \
0                              538  270837.062405        185697.554475   
1                               45  282790.554368         89262.901419   
2                              554   14960.055376          5924.060324   
3                              229   14029.577468         13409.346902   
4                              338  385637.489962        125757.771544   

   Number of Missed Payments  Borrower Age Employment Status  ...  \
0                          6            47          Employed  ...   
1                         