In [10]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)

# Constants
NUM_LOANS = 30000
START_DATE = datetime(2015, 1, 1)
END_DATE = datetime(2020, 1, 1)

# Helper functions
def random_date(start, end):
    """Generate a random datetime between `start` and `end`."""
    return start + timedelta(days=random.randint(0, int((end - start).days)))

def employment_status():
    return random.choice(['Employed', 'Unemployed', 'Self-Employed', 'Student'])

def income_range():
    return random.choice(['<2000', '2000-4000', '4000-6000', '6000-8000', '>8000'])

def geographic_location():
    return random.choice(['Urban', 'Suburban', 'Rural'])

def collection_stage():
    return random.choice(['Internal Collections', 'External Collectors', 'Written Off'])

def loan_type():
    return random.choice(['New', 'Returning'])

def bucket_category(dpd):
    return '60+ Days in Arrears' if dpd >= 60 else 'Current'

def first_payment_default():
    return random.choice(['Yes', 'No'])

def num_loans():
    return random.choice(['1', '2-3', '4+'])

def credit_score():
    return round(np.random.uniform(300, 850), 2)

def collections_status():
    return random.choice(['PTP', 'Unreachable', 'Disputed', 'Paid', 'Escalated'])

def restructured_loan():
    return random.choice([True, False])

def reasons_for_default():
    return random.choice(['DTI/DSR', 'Job Loss', 'Emergency', 'Death', 'Business Failure'])

# Generate sample data
data = {
    'Loan ID': [f'LN{i:05d}' for i in range(1, NUM_LOANS + 1)],
    'Disbursement Date': [random_date(START_DATE, END_DATE) for _ in range(NUM_LOANS)],
    'Principal Amount': np.random.uniform(1000, 50000, NUM_LOANS).round(2),
    'Interest Rate': np.random.uniform(5, 25, NUM_LOANS).round(2),
    'Loan Tenure (days/weeks/months)': np.random.randint(30, 720, NUM_LOANS),
    'Total Repaid': np.random.uniform(0, 50000, NUM_LOANS).round(2),
    'Outstanding Balance': np.random.uniform(500, 50000, NUM_LOANS).round(2),
    'Number of Missed Payments': np.random.randint(0, 12, NUM_LOANS),
    'Borrower Age': np.random.randint(18, 70, NUM_LOANS),
    'Employment Status': [employment_status() for _ in range(NUM_LOANS)],
    'Income Range': [income_range() for _ in range(NUM_LOANS)],
    'State/Location': [geographic_location() for _ in range(NUM_LOANS)],
    'Days Past Due (DPD)': np.random.randint(0, 365, NUM_LOANS),
    'Previous Collection Attempts': np.random.randint(0, 5, NUM_LOANS),
    'Current Collection Stage': [collection_stage() for _ in range(NUM_LOANS)],
    'Loan Type': [loan_type() for _ in range(NUM_LOANS)],
    'Bucket (60+ Days in Arrears)': [bucket_category(dpd) for dpd in np.random.randint(0, 365, NUM_LOANS)],
    'First Payment Default (FPD)': [first_payment_default() for _ in range(NUM_LOANS)],
    'Number of Loans': [num_loans() for _ in range(NUM_LOANS)],
    'Credit Score': [credit_score() for _ in range(NUM_LOANS)],
    'Collections Status': [collections_status() for _ in range(NUM_LOANS)],
    'Restructured Loan': [restructured_loan() for _ in range(NUM_LOANS)],
    'Reasons for Default': [reasons_for_default() for _ in range(NUM_LOANS)]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('loan_data.csv', index=False)

# Display sample data
df.head()


Unnamed: 0,Loan ID,Disbursement Date,Principal Amount,Interest Rate,Loan Tenure (days/weeks/months),Total Repaid,Outstanding Balance,Number of Missed Payments,Borrower Age,Employment Status,...,Previous Collection Attempts,Current Collection Stage,Loan Type,Bucket (60+ Days in Arrears),First Payment Default (FPD),Number of Loans,Credit Score,Collections Status,Restructured Loan,Reasons for Default
0,LN00001,2016-06-19,19352.47,17.76,154,4089.31,822.4,0,20,Unemployed,...,2,External Collectors,New,60+ Days in Arrears,No,4+,684.98,Paid,False,DTI/DSR
1,LN00002,2017-10-11,47585.0,14.19,337,970.33,26051.83,1,42,Student,...,0,Internal Collections,Returning,Current,No,1,318.25,Escalated,True,Business Failure
2,LN00003,2017-03-04,36867.7,24.29,227,21756.52,10881.88,0,51,Unemployed,...,1,Written Off,Returning,60+ Days in Arrears,Yes,1,341.15,Unreachable,True,Death
3,LN00004,2016-12-25,30334.27,9.38,544,12784.35,38852.51,2,49,Student,...,1,Internal Collections,New,60+ Days in Arrears,Yes,2-3,465.84,Disputed,True,DTI/DSR
4,LN00005,2017-04-07,8644.91,16.76,507,31098.41,4519.74,5,24,Self-Employed,...,4,Internal Collections,New,60+ Days in Arrears,Yes,4+,368.26,Disputed,True,Business Failure


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   Loan ID                          30000 non-null  object        
 1   Disbursement Date                30000 non-null  datetime64[ns]
 2   Principal Amount                 30000 non-null  float64       
 3   Interest Rate                    30000 non-null  float64       
 4   Loan Tenure (days/weeks/months)  30000 non-null  int64         
 5   Total Repaid                     30000 non-null  float64       
 6   Outstanding Balance              30000 non-null  float64       
 7   Number of Missed Payments        30000 non-null  int64         
 8   Borrower Age                     30000 non-null  int64         
 9   Employment Status                30000 non-null  object        
 10  Income Range                     30000 non-null  object   

In [15]:
df.to_excel("test_debt_buy_data.xlsx", index=False)

In [16]:
df.to_csv("test_debt_buy_data.csv", index=False)