In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

In [None]:
# One seed value shared by every random source used in this notebook
# (Faker, NumPy's global generator and Python's `random` module).
SEED = 42

# Faker instance for realistic-looking fake values
fake = Faker()
Faker.seed(SEED)

np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Helper function to generate random dates
def random_dates(start_date, end_date, n):
    """Draw ``n`` random dates between two days, both ends inclusive.

    Args:
        start_date: Earliest possible date, as a ``"%Y-%m-%d"`` string.
        end_date: Latest possible date, as a ``"%Y-%m-%d"`` string.
        n: How many dates to draw.

    Returns:
        A list of ``n`` ``datetime`` objects, each equal to ``start_date``
        plus a whole number of days picked with ``random.randint``.
    """
    date_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_date, date_format)
    span_days = (datetime.strptime(end_date, date_format) - first_day).days

    picks = []
    for _ in range(n):
        # randint includes both endpoints, so end_date itself can be drawn
        offset = random.randint(0, span_days)
        picks.append(first_day + timedelta(days=offset))
    return picks


In [None]:
# 1. Loan Origination and Pipeline Dataset
def generate_loan_origination_data(n=1000):
    """Generate a synthetic loan origination / pipeline dataset.

    Application dates come from ``random_dates`` (Python's ``random`` module);
    every other random column is drawn from NumPy's global ``np.random``
    state, so both must be seeded for reproducible output.

    Args:
        n: Number of loan applications (rows) to generate.

    Returns:
        pandas.DataFrame sorted by ``application_date`` with a fresh
        ``0..n-1`` index and the columns ``application_id``,
        ``application_date``, ``stage``, ``loan_amount``, ``loan_type``,
        ``branch_id``, ``processing_time_days`` and ``credit_score``.
    """
    application_ids = [f"APP{str(i).zfill(6)}" for i in range(1, n+1)]
    application_dates = random_dates("2024-01-01", "2025-06-11", n)
    stages = ["Submitted", "In Review", "Approved", "Funded"]
    stage_weights = [0.4, 0.3, 0.2, 0.1]  # More applications in earlier stages
    loan_types = ["Fixed", "Adjustable", "FHA", "VA"]
    loan_type_weights = [0.5, 0.2, 0.2, 0.1]
    branch_ids = [f"BR{str(i).zfill(3)}" for i in range(1, 11)]  # 10 branches

    data = {
        "application_id": application_ids,
        "application_date": application_dates,
        "stage": np.random.choice(stages, n, p=stage_weights),
        "loan_amount": np.random.normal(300000, 100000, n).round(2),  # Mean $300k, SD $100k
        "loan_type": np.random.choice(loan_types, n, p=loan_type_weights),
        "branch_id": np.random.choice(branch_ids, n),
        # np.random.randint excludes the upper bound, so use 31 / 851 to make
        # 30 days and a credit score of 850 reachable.
        "processing_time_days": np.random.randint(1, 31, n),  # 1-30 days inclusive
        "credit_score": np.random.randint(600, 851, n)  # 600-850 inclusive
    }

    df = pd.DataFrame(data)
    # The normal draw can be very small or negative; floor amounts at $50k
    df["loan_amount"] = df["loan_amount"].clip(lower=50000)
    # Sort by application date
    df = df.sort_values("application_date").reset_index(drop=True)
    return df

In [None]:
# 2. Customer Acquisition and Retention Dataset
def generate_customer_acquisition_data(n=1000):
    """Generate a synthetic customer acquisition / retention dataset.

    Acquisition dates come from ``random_dates``; all other random columns
    are drawn from NumPy's global ``np.random`` state.

    Args:
        n: Number of customers (rows) to generate.

    Returns:
        pandas.DataFrame sorted by ``acquisition_date`` with a fresh
        ``0..n-1`` index and the columns ``customer_id``,
        ``acquisition_date``, ``acquisition_channel``, ``loan_amount``,
        ``customer_segment``, ``customer_lifetime_value``, ``refinanced``
        and ``nps_score``.
    """
    # Category -> sampling probability
    channel_mix = {"Online": 0.4, "Broker": 0.3, "Referral": 0.2, "Direct": 0.1}
    segment_mix = {"First-Time Buyer": 0.5, "Refinancer": 0.3, "Investor": 0.2}

    frame = pd.DataFrame(
        {
            "customer_id": [f"CUST{str(seq).zfill(6)}" for seq in range(1, n + 1)],
            "acquisition_date": random_dates("2023-01-01", "2025-06-11", n),
            # The NumPy draws are evaluated top to bottom; keep this order so
            # a seeded run consumes the random stream the same way.
            "acquisition_channel": np.random.choice(
                list(channel_mix), n, p=list(channel_mix.values())
            ),
            "loan_amount": np.random.normal(300000, 100000, n).round(2),
            "customer_segment": np.random.choice(
                list(segment_mix), n, p=list(segment_mix.values())
            ),
            # Mean $20k
            "customer_lifetime_value": np.random.normal(20000, 10000, n).round(2),
            # 30% refinance
            "refinanced": np.random.choice(["Yes", "No"], n, p=[0.3, 0.7]),
            # 0-10 scale (upper bound of randint is exclusive)
            "nps_score": np.random.randint(0, 11, n),
        }
    )

    # Apply minimum values: $50k for loans, $5k for lifetime value
    floored = frame.assign(
        loan_amount=frame["loan_amount"].clip(lower=50000),
        customer_lifetime_value=frame["customer_lifetime_value"].clip(lower=5000),
    )
    # Chronological order with a clean index
    return floored.sort_values("acquisition_date").reset_index(drop=True)

In [None]:
# Build both synthetic tables, 1000 rows each
loan_origination_df = generate_loan_origination_data(n=1000)
customer_acquisition_df = generate_customer_acquisition_data(n=1000)

# Write each table to a CSV file in the working directory, omitting the index
loan_origination_df.to_csv("loan_origination_data.csv", index=False)
customer_acquisition_df.to_csv("customer_acquisition_data.csv", index=False)

# Print a heading followed by the first five rows of each table
print("Loan Origination Dataset Preview:", loan_origination_df.head(), sep="\n")
print("\nCustomer Acquisition Dataset Preview:", customer_acquisition_df.head(), sep="\n")

Loan Origination Dataset Preview:
  application_id application_date      stage  loan_amount loan_type branch_id  \
0      APP000127       2024-01-01   Approved    164014.39     Fixed     BR006   
1      APP000402       2024-01-01     Funded    439200.23     Fixed     BR008   
2      APP000380       2024-01-02  Submitted    267516.86     Fixed     BR005   
3      APP000776       2024-01-02  Submitted    336736.55     Fixed     BR009   
4      APP000220       2024-01-03   Approved    357258.28     Fixed     BR004   

   processing_time_days  credit_score  
0                    22           696  
1                    17           673  
2                     7           681  
3                    20           728  
4                    22           682  

Customer Acquisition Dataset Preview:
  customer_id acquisition_date acquisition_channel  loan_amount  \
0  CUST000764       2023-01-02              Online    199377.42   
1  CUST000072       2023-01-03              Broker    332728.93   