In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

In [None]:
# Seed all three random sources (stdlib random, NumPy, Faker) with one fixed
# value so the generated data is repeatable from run to run.
SEED = 42

fake = Faker()  # Faker instance for realistic fake values
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

In [None]:
# Helper function to generate random dates
def random_dates(start_date, end_date, n):
    """Draw ``n`` random datetimes between two dates, both ends inclusive.

    Args:
        start_date: Earliest possible date, as a ``YYYY-MM-DD`` string.
        end_date: Latest possible date, as a ``YYYY-MM-DD`` string.
        n: How many dates to draw.

    Returns:
        A list of ``n`` ``datetime`` objects (time part is midnight), in draw
        order. Draws use the global ``random`` generator, so seed it first for
        repeatable results.
    """
    date_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    span_days = (last_day - first_day).days

    picks = []
    for _ in range(n):
        # randint includes both bounds, so end_date itself can be drawn
        offset = random.randint(0, span_days)
        picks.append(first_day + timedelta(days=offset))
    return picks

In [None]:
# Generate portfolio performance dataset
def generate_portfolio_performance_data(
    n=1000,
    loan_path='/content/drive/MyDrive/loan_origination_data.csv',
    customer_path='/content/drive/MyDrive/customer_acquisition_data.csv',
):
    """Load the two input datasets for the portfolio-performance generator.

    NOTE(review): this cell looks like an early, unfinished draft. It only
    reads the CSVs and never builds a table, so it returns ``None`` on every
    path. A complete function with the same name is defined further down the
    notebook and replaces this one once that cell runs.

    Args:
        n: Intended number of records; not used by this draft.
        loan_path: CSV file holding the loan-origination data.
        customer_path: CSV file holding the customer-acquisition data.

    Returns:
        ``None``, both when a file is missing (after printing an error) and
        when both files load successfully.
    """
    try:
        loan_df = pd.read_csv(loan_path)
        customer_df = pd.read_csv(customer_path)
    except FileNotFoundError:
        # Report the paths that were actually tried so the message cannot
        # drift away from the locations being read.
        print(f"Error: {loan_path} or {customer_path} not found")
        return None

    # The draft stops after loading: nothing is generated from the frames yet.
    return None

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# Seed all three random sources (stdlib random, NumPy, Faker) with one fixed
# value so the generated data is repeatable from run to run.
SEED = 42

fake = Faker()  # Faker instance for realistic fake values
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Helper function to generate random dates
def random_dates(start_date, end_date, n):
    """Draw ``n`` random datetimes between two dates, both ends inclusive.

    Args:
        start_date: Earliest possible date, as a ``YYYY-MM-DD`` string.
        end_date: Latest possible date, as a ``YYYY-MM-DD`` string.
        n: How many dates to draw.

    Returns:
        A list of ``n`` ``datetime`` objects (time part is midnight), in draw
        order. Draws use the global ``random`` generator, so seed it first for
        repeatable results.
    """
    date_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    span_days = (last_day - first_day).days

    picks = []
    for _ in range(n):
        # randint includes both bounds, so end_date itself can be drawn
        offset = random.randint(0, span_days)
        picks.append(first_day + timedelta(days=offset))
    return picks

# Generate portfolio performance dataset
def generate_portfolio_performance_data(
    n=1000,
    loan_path='/content/drive/MyDrive/loan_origination_data.csv',
    customer_path='/content/drive/MyDrive/customer_acquisition_data.csv',
):
    """Generate a synthetic portfolio-performance table with ``n`` rows.

    IDs are taken from the first ``n`` rows of the two input CSVs; every other
    column is randomly generated. Draws come from the global ``random`` and
    ``np.random`` generators, so seed both beforehand for repeatable output.
    Relies on the notebook-level ``random_dates`` helper.

    Args:
        n: Number of rows to generate.
        loan_path: CSV file providing the ``application_id`` column.
        customer_path: CSV file providing the ``customer_id`` column.

    Returns:
        A DataFrame sorted by ``last_payment_date`` with a fresh 0-based
        index, or ``None`` (after printing an error) when an input file is
        missing or either input has fewer than ``n`` rows.

    Raises:
        KeyError: if an input lacks its ID column (not handled here).
    """
    # Load previous datasets to get application_id and customer_id
    try:
        loan_df = pd.read_csv(loan_path)
        customer_df = pd.read_csv(customer_path)
    except FileNotFoundError:
        # Report the paths that were actually tried so the message cannot
        # drift away from the locations being read.
        print(f"Error: {loan_path} or {customer_path} not found")
        return None

    # Both inputs must be able to supply n IDs
    if len(loan_df) < n or len(customer_df) < n:
        print(f"Error: Insufficient records in input datasets (loan: {len(loan_df)}, customer: {len(customer_df)})")
        return None

    # NOTE(review): the two ID lists are paired purely by row position across
    # two separate files; confirm that row i of each file describes the same
    # loan/customer, otherwise the pairing is arbitrary.
    application_ids = loan_df['application_id'].iloc[:n].tolist()
    customer_ids = customer_df['customer_id'].iloc[:n].tolist()

    delinquency_statuses = ['Current', '30 Days', '60 Days', '90+ Days']
    delinquency_weights = [0.85, 0.10, 0.03, 0.02]  # Most loans are current
    # Higher delinquency, higher default risk
    default_probs = {'Current': 0.01, '30 Days': 0.1, '60 Days': 0.3, '90+ Days': 0.7}

    # The draw order below is deliberate: each generator (stdlib random and
    # np.random) is consumed in a fixed sequence, so a seeded run is repeatable.
    last_payment_dates = random_dates("2024-06-01", "2025-06-11", n)

    loan_balance = np.random.normal(250000, 100000, n).round(2)  # Mean $250k, SD $100k
    delinquency_status = np.random.choice(delinquency_statuses, n, p=delinquency_weights)
    loan_to_value_ratio = np.random.normal(80, 15, n).round(2)  # Mean 80%, SD 15%
    # randint's upper bound is exclusive, so scores fall in 600..849
    credit_score_current = np.random.randint(600, 850, n)
    net_interest_margin = np.random.normal(2.5, 0.5, n).round(2)  # Mean 2.5%, SD 0.5%

    # One uniform draw per row, in row order, compared with the default
    # probability of that row's delinquency bucket.
    default_flag = [
        'Yes' if random.random() < default_probs[status] else 'No'
        for status in delinquency_status
    ]

    df = pd.DataFrame({
        'application_id': application_ids,
        'customer_id': customer_ids,
        'loan_balance': loan_balance,
        'delinquency_status': delinquency_status,
        'default_flag': default_flag,
        'loan_to_value_ratio': loan_to_value_ratio,
        'credit_score_current': credit_score_current,
        'net_interest_margin': net_interest_margin,
        'last_payment_date': last_payment_dates,
    })

    # Normal draws can land outside plausible bounds; clamp them.
    df['loan_balance'] = df['loan_balance'].clip(lower=50000)
    df['loan_to_value_ratio'] = df['loan_to_value_ratio'].clip(lower=50, upper=120)
    df['net_interest_margin'] = df['net_interest_margin'].clip(lower=0.5)

    # Sort by last_payment_date
    return df.sort_values('last_payment_date').reset_index(drop=True)




In [None]:

# Build the 1,000-row portfolio table; the generator returns None on failure
portfolio_df = generate_portfolio_performance_data(1000)

# Persist and preview only when a table was actually produced
if portfolio_df is not None:
    output_csv = '/content/portfolio_performance_data.csv'
    portfolio_df.to_csv(output_csv, index=False)

    # Heading and the first five rows, one per print line
    preview_rows = portfolio_df.head()
    print("Portfolio Performance Dataset Preview:", preview_rows, sep="\n")

Portfolio Performance Dataset Preview:
  application_id customer_id  loan_balance delinquency_status default_flag  \
0      APP000539  CUST000607      65136.43            30 Days           No   
1      APP000534  CUST000934     333630.18            Current           No   
2      APP000030  CUST000296     366884.36            Current           No   
3      APP000840  CUST000817     284921.68            Current           No   
4      APP000252  CUST000810      91681.27            30 Days           No   

   loan_to_value_ratio  credit_score_current  net_interest_margin  \
0                83.86                   701                 2.53   
1                70.89                   698                 1.60   
2                65.23                   778                 2.33   
3                71.37                   784                 1.63   
4                58.39                   781                 2.76   

  last_payment_date  
0        2024-06-02  
1        2024-06-02  
2        20