In [13]:
pip install faker

Collecting faker
  Downloading Faker-27.4.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-27.4.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-27.4.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Credit Score Bias: Adjust the Interest_Rate and Application_Status based on the Credit_Score. Higher credit scores will have lower interest rates and a higher likelihood of approval.
# Geographic Bias: Influence the Promotions field based on the Location. Certain locations will have a higher chance of receiving better promotional offers.
# Increase Sample Size: Update num_samples to 70,000.
# Introduce Missing Data: Use probabilities to assign None values to certain fields at random, representing missing data.

In [30]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
from faker import Faker
import random
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize Faker to generate synthetic data
fake = Faker()

# Number of samples
num_samples = 2000

# Seed for reproducibility.
# Faker draws from its own random generator, so seeding numpy and the stdlib
# `random` module alone leaves fake.uuid4() / fake.city() different on every
# run. Seed it explicitly as well. (Time-relative providers such as
# date_time_this_year() still depend on the wall clock.)
np.random.seed(42)
random.seed(42)  # For random.sample and random.choice in non-numpy contexts
Faker.seed(42)

def generate_correlated_features(num_samples):
    """
    Generate correlated personal and financial features.
    """
    # Generate Age with normal distribution, clipped between 18 and 80
    age = np.random.normal(40, 12, num_samples).clip(18, 80).astype(int)
    
    # Generate Experience based on Age, ensuring non-negative
    experience = (age - 18 - np.random.normal(4, 2, num_samples)).clip(0).astype(int)
    
    # Generate Education Level with predefined probabilities
    education_level = np.random.choice(
        ['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate'], 
        num_samples, 
        p=[0.3, 0.2, 0.3, 0.15, 0.05]
    )
    
    # Impact of Education on Income and Credit Score
    edu_impact = {'High School': 0, 'Associate': 0.1, 'Bachelor': 0.2, 'Master': 0.3, 'Doctorate': 0.4}
    edu_factor = np.array([edu_impact[level] for level in education_level])
    
    # Generate Annual Income using log-normal distribution influenced by education and experience
    base_income = np.random.lognormal(10.5, 0.6, num_samples) * (1 + edu_factor) * (1 + experience / 100)
    income_noise = np.random.normal(0, 0.1, num_samples)
    annual_income = (base_income * (1 + income_noise)).clip(15000, 300000).astype(int)
    
    # Generate Credit Score influenced by education, experience, and income
    credit_score_base = 300 + 300 * stats.beta.rvs(5, 1.5, size=num_samples)
    credit_score = (credit_score_base + edu_factor * 100 + experience * 1.5 + income_noise * 100).clip(300, 850).astype(int)
    
    # Generate Employment Status probabilities based on education
    employment_status_probs = np.column_stack([
        0.9 - edu_factor * 0.3,  # Employed
        0.05 + edu_factor * 0.2,  # Self-Employed
        0.05 + edu_factor * 0.1   # Unemployed
    ])
    # Normalize probabilities to sum to 1
    employment_status_probs = employment_status_probs / employment_status_probs.sum(axis=1, keepdims=True)
    
    # Assign Employment Status based on probabilities
    employment_status = np.array(['Employed', 'Self-Employed', 'Unemployed'])[
        np.argmax(np.random.random((num_samples, 1)) < employment_status_probs.cumsum(axis=1), axis=1)
    ]
    
    return age, experience, education_level, annual_income, credit_score, employment_status

def generate_time_based_features(num_samples):
    """
    Return ``num_samples`` consecutive daily datetimes, the first being
    January 1, 2018 (one application date per sample).
    """
    one_day = timedelta(days=1)
    current = datetime(2018, 1, 1)
    dates = []
    for _ in range(num_samples):
        dates.append(current)
        current = current + one_day
    return dates

# Generate correlated features
# NOTE: the module-level names bound here (age, experience, annual_income, ...)
# are read again when the `data` dictionary is assembled further down, so they
# must not be reused as scratch variables in between.
age, experience, education_level, annual_income, credit_score, employment_status = generate_correlated_features(num_samples)
# One application date per sample, on consecutive days
application_dates = generate_time_based_features(num_samples)

# Define a dictionary mapping vehicle makes to possible models.
# Key order matters: vehicle_makes is built from these keys and the positional
# vehicle_make_weights list below is matched to it index by index.
# Most makes list six models; Bentley, Maserati, Mini and Ram list five.
make_model_mapping = {
    'Toyota': ['Camry', 'Corolla', 'RAV4', 'Prius', 'Highlander', 'Tacoma'],
    'Honda': ['Civic', 'Accord', 'CR-V', 'Pilot', 'Fit', 'Odyssey'],
    'Ford': ['F-150', 'Escape', 'Explorer', 'Mustang', 'Fusion', 'Ranger'],
    'Chevrolet': ['Silverado', 'Equinox', 'Malibu', 'Traverse', 'Camaro', 'Tahoe'],
    'BMW': ['3 Series', '5 Series', 'X3', 'X5', '7 Series', 'X1'],
    'Mercedes-Benz': ['C-Class', 'E-Class', 'GLC', 'GLE', 'S-Class', 'GLA'],
    'Nissan': ['Altima', 'Sentra', 'Rogue', 'Versa', 'Pathfinder', 'Maxima'],
    'Hyundai': ['Elantra', 'Sonata', 'Tucson', 'Santa Fe', 'Accent', 'Kona'],
    'Kia': ['Soul', 'Optima', 'Sportage', 'Sorento', 'Rio', 'Seltos'],
    'Subaru': ['Forester', 'Outback', 'Impreza', 'Crosstrek', 'Legacy', 'Ascent'],
    'Mazda': ['CX-5', 'Mazda3', 'Mazda6', 'MX-5 Miata', 'CX-9', 'Mazda CX-30'],
    'Audi': ['A3', 'A4', 'A6', 'Q5', 'Q7', 'TT'],
    'Volkswagen': ['Golf', 'Passat', 'Tiguan', 'Jetta', 'Atlas', 'Arteon'],
    'Volvo': ['XC90', 'S60', 'S90', 'XC60', 'V60', 'V90'],
    'Porsche': ['911', 'Cayenne', 'Macan', 'Panamera', 'Taycan', 'Boxster'],
    'Jeep': ['Wrangler', 'Grand Cherokee', 'Renegade', 'Compass', 'Cherokee', 'Gladiator'],
    'Lexus': ['RX', 'ES', 'NX', 'GX', 'LS', 'IS'],
    'Acura': ['MDX', 'RDX', 'TLX', 'ILX', 'RLX', 'NSX'],
    'Cadillac': ['Escalade', 'XT5', 'CT5', 'XT4', 'ATS', 'XT6'],
    'Lincoln': ['Navigator', 'Aviator', 'Corsair', 'Nautilus', 'MKZ', 'MKC'],
    'Infiniti': ['Q50', 'QX60', 'QX80', 'Q30', 'QX50', 'QX55'],
    'Genesis': ['G70', 'G80', 'G90', 'GV70', 'GV80', 'G70 Convertible'],
    'Bentley': ['Continental', 'Flying Spur', 'Bentayga', 'Mulsanne', 'Azure'],
    'Maserati': ['Ghibli', 'Quattroporte', 'Levante', 'GranTurismo', 'MC20'],
    'Alfa Romeo': ['Giulia', 'Stelvio', '4C', 'Giulietta', 'Tonale', 'GT'],
    'Fiat': ['500', 'Panda', '124 Spider', 'Tipo', '500X', '500L'],
    'Mitsubishi': ['Outlander', 'Eclipse Cross', 'Mirage', 'Galant', 'Lancer', 'ASX'],
    'Mini': ['Cooper', 'Countryman', 'Clubman', 'Convertible', 'Hardtop'],
    'Ram': ['1500', '2500', '3500', 'ProMaster', 'Chassis Cab'],
    'Suzuki': ['Swift', 'Vitara', 'Jimny', 'Baleno', 'Celerio', 'S-Cross']
}

# Expanded list of vehicle makes with approximate weights (illustrative)
vehicle_makes = list(make_model_mapping.keys())

# Corresponding weights (adjust based on actual market data as needed).
# The list is positional: entry i belongs to vehicle_makes[i], i.e. it follows
# the key order of make_model_mapping.
vehicle_make_weights = [
    10,  # Toyota
    9,   # Honda
    8,   # Ford
    7,   # Chevrolet
    6,   # BMW
    5,   # Mercedes-Benz
    6,   # Nissan
    5,   # Hyundai
    5,   # Kia
    4,   # Subaru
    3,   # Mazda
    3,   # Audi
    2,   # Volkswagen
    2,   # Volvo
    1,   # Porsche
    4,   # Jeep
    3,   # Lexus
    2,   # Acura
    1,   # Cadillac
    1,   # Lincoln
    2,   # Infiniti
    2,   # Genesis
    1,   # Bentley
    1,   # Maserati
    1,   # Alfa Romeo
    1,   # Fiat
    1,   # Mitsubishi
    1,   # Mini
    1,   # Ram
    1    # Suzuki
]

# Fail fast if a make is added or removed without updating the weights; without
# this check the mismatch only surfaces as an opaque error inside np.random.choice.
assert len(vehicle_make_weights) == len(vehicle_makes), "Weights and makes length mismatch."

# Convert weights to probabilities
total_weight = sum(vehicle_make_weights)
vehicle_make_probabilities = [weight / total_weight for weight in vehicle_make_weights]

# Generate Vehicle_Make data based on probabilities
vehicle_make_data = np.random.choice(
    vehicle_makes,
    size=num_samples,
    p=vehicle_make_probabilities
)

# Function to assign a model based on make
def assign_model(make):
    """Return a randomly chosen model for ``make``; unknown makes yield 'Model_Not_Specified'."""
    candidate_models = make_model_mapping.get(make, ['Model_Not_Specified'])
    return random.choice(candidate_models)

# Vehicle_Model: one random model per sampled make
vehicle_model_data = list(map(assign_model, vehicle_make_data))

# Vehicle_Type: 40% New, 60% Used
vehicle_type_probs = [0.4, 0.6]
vehicle_type_data = np.random.choice(['New', 'Used'], size=num_samples, p=vehicle_type_probs)

# Vehicle_Year depends on Vehicle_Type (np.random.randint's upper bound is exclusive):
#   New  -> 2018..2024
#   Used -> 2005..2017
vehicle_year_data = [
    np.random.randint(2018, 2025) if vt == 'New' else np.random.randint(2005, 2018)
    for vt in vehicle_type_data
]

# Generate Vehicle_Mileage based on Vehicle_Year and Vehicle_Type
vehicle_mileage_data = []
current_year = 2024

for year, vt in zip(vehicle_year_data, vehicle_type_data):
    # Deliberately NOT named `age`: that module-level name holds the applicants'
    # Age array that is later written into the DataFrame. Rebinding it here
    # replaced the whole array with a single integer, so every row ended up
    # with the same Age.
    vehicle_age = current_year - year
    if vt == 'New':
        # New vehicles: Low mileage, e.g., 0-30,000 miles
        mileage = np.random.randint(0, 30001)
    else:
        # Used vehicles: about 12,000 miles per year of age, +/- 10,000,
        # and never below 30,000
        avg_mileage = vehicle_age * 12000
        min_mileage = max(avg_mileage - 10000, 30000)
        max_mileage = avg_mileage + 10000
        mileage = np.random.randint(min_mileage, max_mileage + 1)
    vehicle_mileage_data.append(mileage)

# Define base prices for each Vehicle_Make (in USD)
base_price_mapping = {
    'Toyota': 25000,
    'Honda': 24000,
    'Ford': 26000,
    'Chevrolet': 25500,
    'BMW': 45000,
    'Mercedes-Benz': 47000,
    'Nissan': 23000,
    'Hyundai': 22000,
    'Kia': 21000,
    'Subaru': 23500,
    'Mazda': 22500,
    'Audi': 44000,
    'Volkswagen': 20000,
    'Volvo': 42000,
    'Porsche': 60000,
    'Jeep': 28000,
    'Lexus': 43000,
    'Acura': 39000,
    'Cadillac': 50000,
    'Lincoln': 48000,
    'Infiniti': 40000,
    'Genesis': 41000,
    'Bentley': 90000,
    'Maserati': 85000,
    'Alfa Romeo': 37000,
    'Fiat': 18000,
    'Mitsubishi': 19000,
    'Mini': 22000,
    'Ram': 30000,
    'Suzuki': 17000
}

# Unadjusted base price per vehicle (default 20000 if the make is unknown).
# Kept for inspection; the DataFrame uses adjusted_vehicle_price_data instead.
vehicle_price_data = [base_price_mapping.get(make, 20000) for make in vehicle_make_data]

# Adjust Vehicle_Price based on Vehicle_Type, Vehicle_Year, and Vehicle_Mileage
adjusted_vehicle_price_data = []
for make, vt, year, mileage in zip(
    vehicle_make_data, vehicle_type_data, vehicle_year_data, vehicle_mileage_data
):
    base_price = base_price_mapping.get(make, 20000)

    if vt == 'New':
        # New vehicles sell at the base price (before the random +/-5% below)
        price = base_price
    else:
        # Used vehicles: depreciate by age and mileage
        vehicle_age = current_year - year
        # Depreciation rate: 5% per year
        depreciation = 0.05 * vehicle_age
        # Mileage adds up to 20% more, reaching the cap at 150,000 miles
        mileage_factor = min(mileage / 150000, 1)
        mileage_depreciation = 0.2 * mileage_factor

        # Cap total depreciation at 80%
        total_depreciation = min(depreciation + mileage_depreciation, 0.8)

        price = base_price * (1 - total_depreciation)

    # Add some randomness (±5%)
    price *= np.random.uniform(0.95, 1.05)

    # Floor the price at $1,000
    price = max(price, 1000)

    adjusted_vehicle_price_data.append(int(price))

# Loan_Amount: a loan-to-value (LTV) share of the vehicle price.
#   New  vehicles: LTV drawn from 80% - 100%
#   Used vehicles: LTV drawn from 50% - 80%
# A further ±5% jitter is applied and the result is capped at the vehicle price.
loan_amount_data = []
for price, vt in zip(adjusted_vehicle_price_data, vehicle_type_data):
    ltv_low, ltv_high = (0.8, 1.0) if vt == 'New' else (0.5, 0.8)
    ltv = np.random.uniform(ltv_low, ltv_high)
    jitter = np.random.uniform(0.95, 1.05)

    # Never lend more than the vehicle is worth; store whole dollars
    capped_amount = min(price * ltv * jitter, price)
    loan_amount_data.append(int(capped_amount))

# Location: one Faker-generated city name per sample
location_data = [fake.city() for _ in range(num_samples)]

# Down_Payment: uniform integer between 10% and 30% of Vehicle_Price (inclusive)
down_payment_data = [
    np.random.randint(int(price * 0.1), int(price * 0.3) + 1)
    for price in adjusted_vehicle_price_data
]

# Loan_Tenure_Years: larger loans shift towards longer tenures
loan_tenure_data = []
for loan in loan_amount_data:
    if loan > 30000:
        tenure_options, tenure_probs = [5, 6, 7], [0.5, 0.3, 0.2]
    elif loan > 20000:
        tenure_options, tenure_probs = [4, 5, 6], [0.4, 0.4, 0.2]
    else:
        tenure_options, tenure_probs = [3, 4, 5], [0.5, 0.3, 0.2]
    loan_tenure_data.append(np.random.choice(tenure_options, p=tenure_probs))

# Interest_Rate (percent): a 2.0 base rate adjusted by four scaled factors
#   + (850 - credit score) / 2000      lower score  -> higher rate
#   + (loan - 5000) / 100000           bigger loan  -> higher rate
#   + (tenure - 3) * 0.2               longer term  -> higher rate
#   - (income - 50000) / 200000        higher income -> lower rate
# plus uniform noise in [-0.3, 0.3], bounded to [1.9, 6.5] and rounded to 2 dp.
interest_rate_data = []
applicant_rows = zip(credit_score, loan_amount_data, loan_tenure_data, annual_income)
for credit, loan, tenure, income in applicant_rows:
    interest = (
        2.0
        + (850 - credit) / 2000
        + (loan - 5000) / 100000
        + (tenure - 3) * 0.2
        - (income - 50000) / 200000
    )
    interest = interest + np.random.uniform(-0.3, 0.3)

    # Keep the rate within the allowed band
    interest = max(interest, 1.9)
    interest = min(interest, 6.5)
    interest_rate_data.append(round(interest, 2))

    

    
# Initialize the data dictionary with the applicant, financial, vehicle, loan and
# app-session fields. Every value is either a sequence of length num_samples or is
# expected to be one; the order of the np.random calls below fixes the seeded
# random stream, so reordering entries changes the generated values.
data = {
    'User_ID': [fake.uuid4() for _ in range(num_samples)],
    'ApplicationDate': application_dates,
    # `age` must still be the per-applicant array returned by generate_correlated_features
    'Age': age,
    'Gender': np.random.choice(['Male', 'Female'], size=num_samples),
    'Annual_Income': annual_income,
    'Credit_Score': credit_score,
    'Employment_Status': employment_status,
    'Education_Level': education_level,
    'Experience': experience,
    'Loan_Amount': loan_amount_data,  # Derived from Vehicle_Price and Vehicle_Type
    # Drawn independently of Loan_Tenure_Years (values look like months — TODO confirm which field is authoritative)
    'Loan_Duration': np.random.choice(
        [12, 24, 36, 48, 60, 72, 84, 96, 108, 120], 
        num_samples, 
        p=[0.05, 0.1, 0.2, 0.2, 0.2, 0.1, 0.05, 0.05, 0.025, 0.025]
    ),
    'Marital_Status': np.random.choice(
        ['Single', 'Married', 'Divorced', 'Widowed'], 
        num_samples, 
        p=[0.3, 0.5, 0.15, 0.05]
    ),
    'Number_Of_Dependents': np.random.choice(
        [0, 1, 2, 3, 4, 5], 
        num_samples, 
        p=[0.3, 0.25, 0.2, 0.15, 0.07, 0.03]
    ),
    'Home_Ownership_Status': np.random.choice(
        ['Own', 'Rent', 'Mortgage', 'Other'], 
        num_samples, 
        p=[0.2, 0.3, 0.4, 0.1]
    ),
    'Monthly_Debt_Payments': np.random.lognormal(6, 0.5, num_samples).astype(int),
    'Credit_Card_Utilization_Rate': np.random.beta(2, 5, num_samples),
    'Number_Of_Open_CreditLines': np.random.poisson(3, num_samples).clip(0, 15).astype(int),
    'Number_Of_Credit_Inquiries': np.random.poisson(1, num_samples).clip(0, 10).astype(int),
    'Debt_To_IncomeRatio': np.random.beta(2, 5, num_samples),  # Placeholder draw; overwritten once the DataFrame is built
    'Bankruptcy_History': np.random.choice([0, 1], num_samples, p=[0.95, 0.05]),
    'Previous_Loan_Defaults': np.random.choice([0, 1], num_samples, p=[0.9, 0.1]),
    'Payment_History': np.random.poisson(24, num_samples).clip(0, 60).astype(int),
    'Length_Of_CreditHistory': np.random.randint(1, 30, num_samples),  # 1..29 (upper bound exclusive)
    'Savings_Account_Balance': np.random.lognormal(8, 1, num_samples).astype(int),
    'Checking_Account_Balance': np.random.lognormal(7, 1, num_samples).astype(int),
    'Total_Assets': np.random.lognormal(11, 1, num_samples).astype(int),
    'Total_Liabilities': np.random.lognormal(10, 1, num_samples).astype(int),
    'Monthly_Income': annual_income / 12,
    'Utility_Bills_Payment_History': np.random.beta(8, 2, num_samples),
    'Job_Tenure': np.random.poisson(5, num_samples).clip(0, 40).astype(int),
    
    'Location': location_data,
    'Vehicle_Type': vehicle_type_data,
    'Vehicle_Make': vehicle_make_data,
    'Vehicle_Model': vehicle_model_data,
    'Vehicle_Year': vehicle_year_data,
    'Vehicle_Mileage': vehicle_mileage_data,
    'Vehicle_Price': adjusted_vehicle_price_data,
    'Down_Payment': down_payment_data,
    'Loan_Tenure_Years': loan_tenure_data,
    'Interest_Rate': interest_rate_data,
    # The approval outcome is not set here; it is added later as the 'LoanApproved' column
    'Session_Duration_Minutes': np.random.randint(5, 60, size=num_samples),  # 5..59
    'Number_of_Interactions': np.random.randint(10, 100, size=num_samples),  # 10..99
    'Notifications_Responded': np.random.choice([0, 1], size=num_samples, p=[0.7, 0.3]),
    'Support_Queries': np.random.choice([0, 1, 2, 3], size=num_samples, p=[0.5, 0.3, 0.15, 0.05]),
    'Application_Submitted': np.random.choice([True, False], size=num_samples, p=[0.8, 0.2])
}

# Define additional fields to be added
marital_statuses = ['Single', 'Married', 'Divorced', 'Widowed']
device_types = ['iPhone', 'Android', 'Windows Phone']
os_versions = ['iOS 15', 'iOS 14', 'Android 11', 'Android 10', 'Windows 10 Mobile']
app_versions = ['1.0', '1.1', '1.2']
network_types = ['Wi-Fi', '4G', '5G']
dealer_info = ['Dealer A', 'Dealer B', 'Dealer C', 'Dealer D']
promotions = ['0% APR', '$1000 Cashback', 'No Payments for 90 Days', 'Low Down Payment']
event_sequences = ['Application Start', 'Vehicle Selection', 'Loan Calculator', 'Document Upload', 'Credit Check', 'Approval']
screens = ['Home', 'Loan Calculator', 'Vehicle Selection', 'Document Upload', 'Credit Check', 'Approval']

# Session timestamps. The end time is derived from the start time plus the
# already-generated Session_Duration_Minutes. Drawing start and end independently
# made roughly half of the sessions end before they began and contradicted the
# duration column.
session_start_times = [fake.date_time_this_year() for _ in range(num_samples)]
session_end_times = [
    started_at + timedelta(minutes=int(duration_minutes))
    for started_at, duration_minutes in zip(session_start_times, data['Session_Duration_Minutes'])
]

data.update({
    "Monthly_Expenses": np.random.randint(1000, 10000, size=num_samples),
    "Previous_Vehicle_Ownership": np.random.choice([True, False], size=num_samples, p=[0.7, 0.3]),
    "Trade_In_Details": np.random.choice([None, 'Old Car Trade-In'], size=num_samples, p=[0.7, 0.3]),
    "Session_Start_Time": session_start_times,
    "Session_End_Time": session_end_times,
    # A random-length (3..6), randomly ordered subset of the event sequence
    "Navigation_Paths": [random.sample(event_sequences, k=random.randint(3, len(event_sequences))) for _ in range(num_samples)],

    "Device_Type": np.random.choice(device_types, size=num_samples),
    "OS_Version": np.random.choice(os_versions, size=num_samples),
    "App_Version": np.random.choice(app_versions, size=num_samples),
    "Network_Type": np.random.choice(network_types, size=num_samples),
    "Dealer_Info": np.random.choice(dealer_info, size=num_samples),
    "Promotions": np.random.choice(promotions, size=num_samples),

    "Regulatory_Compliance": np.random.choice(['Compliant', 'Non-Compliant'], size=num_samples, p=[0.95, 0.05]),
    "Consent_Provided": np.random.choice([True, False], size=num_samples, p=[0.98, 0.02]),
    "User_Type": np.random.choice(['New', 'Returning'], size=num_samples),
    "Behavioral_Segment": np.random.choice(['Low Engagement', 'Medium Engagement', 'High Engagement'], size=num_samples),
    # Ratings run 1..5; np.random.randint's upper bound is exclusive, so the
    # previous (1, 5) could never produce a 5.
    "User_Feedback_Rating": np.random.randint(1, 6, size=num_samples),
    "Common_Issues_Faced": np.random.choice(
        [None, 'Document Upload Failed', 'Credit Check Issue', 'App Crash'],
        size=num_samples,
        p=[0.7, 0.1, 0.1, 0.1]
    ),
    "User_Satisfaction": np.random.choice(
        ['Very Satisfied', 'Satisfied', 'Neutral', 'Dissatisfied', 'Very Dissatisfied'],
        size=num_samples
    )
})

# Additional Interaction Event data to be added.
# All np.random.randint ranges below exclude their upper bound (e.g. (1, 30) yields 1..29).
# Each field is drawn independently of the others.
data.update({
    "Frequency_of_App_Usage": np.random.randint(1, 30, size=num_samples),  # Frequency of app usage in the past month
    "Clicks": np.random.randint(1, 50, size=num_samples),
    "Taps": np.random.randint(1, 50, size=num_samples),
    "Swipes": np.random.randint(1, 50, size=num_samples),
    "Form_Entries": np.random.randint(1, 20, size=num_samples),
    "Time_Spent_on_Home_Screen_Minutes": np.random.randint(1, 10, size=num_samples),
    "Time_Spent_on_Loan_Calculator_Minutes": np.random.randint(1, 15, size=num_samples),
    "Time_Spent_on_Vehicle_Selection_Minutes": np.random.randint(1, 20, size=num_samples),
    "Time_Spent_on_Document_Upload_Minutes": np.random.randint(1, 10, size=num_samples),
    "Time_Spent_on_Credit_Check_Minutes": np.random.randint(1, 5, size=num_samples),
    "Time_Spent_on_Approval_Screen_Minutes": np.random.randint(1, 5, size=num_samples),
    # A random-length (3..6), randomly ordered subset of the screens
    "Common_Paths": [random.sample(screens, k=random.randint(3, len(screens))) for _ in range(num_samples)],
    # Each screen has a 10% chance of being the drop-off point; None means no drop-off
    "Drop_Off_Point": np.random.choice(
        screens + [None], 
        size=num_samples, 
        p=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.4]  # 40% complete all steps
    ),
    "Comparison_of_Loan_Options": np.random.choice([True, False], size=num_samples, p=[0.6, 0.4])
})

# Create the DataFrame with all existing and new fields
df = pd.DataFrame(data)

# Create NetWorth ensuring a minimum value
min_net_worth = 1000  # Set a minimum net worth
df['NetWorth'] = np.maximum(df['Total_Assets'] - df['Total_Liabilities'], min_net_worth)

# Calculate MonthlyLoanPayment using the loan amortization formula
# Formula: P = (L * c) / (1 - (1 + c)^-n)
# Where:
# P = monthly payment
# L = loan amount
# c = monthly interest rate (Interest_Rate is an annual percentage)
# n = number of payments
monthly_rate = df['Interest_Rate'] / 100 / 12
number_of_payments = df['Loan_Tenure_Years'] * 12

# fillna(0) only matters for a 0% rate (0/0 -> NaN); generated rates are floored at 1.9
df['MonthlyLoanPayment'] = (
    (df['Loan_Amount'] * (df['Interest_Rate'] / 100) / 12) /
    (1 - (1 + monthly_rate) ** (-number_of_payments))
).fillna(0).round(2)

# Debt_To_IncomeRatio = (existing monthly debt + new loan payment) / monthly income.
# Computed once, after MonthlyLoanPayment exists; this replaces the random
# placeholder from the data dictionary. The amortization formula was previously
# duplicated inline and its result immediately overwritten.
df['Debt_To_IncomeRatio'] = (
    df['Monthly_Debt_Payments'] + df['MonthlyLoanPayment']
) / df['Monthly_Income']
# Define a function to calculate approval probability based on multiple factors, with increased weight for DTI and Credit Score
def calculate_approval_probability(employment_status, credit_score, dti, loan_amount, vehicle_type, annual_income):
    """
    Score a loan application and return its status.

    Six factors (employment status, credit score, debt-to-income ratio, loan
    amount, annual income, vehicle type) each add to or subtract from a running
    score; credit score and DTI carry the largest weights (+/-0.35). The score
    is clamped to [0, 1] and bucketed:
        >= 0.75 -> 'Approved', >= 0.45 -> 'Pending', otherwise 'Rejected'.

    Despite the function name, the return value is the status string, not the
    numeric score.
    """
    def weight_at_least(value, tiers, fallback):
        # tiers: (minimum, weight) pairs in descending order of minimum
        for minimum, weight in tiers:
            if value >= minimum:
                return weight
        return fallback

    def weight_at_most(value, tiers, fallback):
        # tiers: (maximum, weight) pairs in ascending order of maximum
        for maximum, weight in tiers:
            if value <= maximum:
                return weight
        return fallback

    # Unrecognised employment statuses contribute nothing
    employment_weights = {'Employed': 0.2, 'Self-Employed': 0.15, 'Unemployed': -0.25}

    # Contributions are accumulated in a fixed order: employment, credit score,
    # DTI, loan amount, income, vehicle type.
    score = 0.0
    score += employment_weights.get(employment_status, 0.0)
    score += weight_at_least(credit_score, ((750, 0.35), (700, 0.25), (650, 0.15)), -0.35)
    score += weight_at_most(dti, ((0.25, 0.35), (0.35, 0.25), (0.45, 0.15)), -0.35)
    score += weight_at_most(loan_amount, ((20000, 0.15), (40000, 0.1)), -0.25)
    score += weight_at_least(annual_income, ((100000, 0.25), (75000, 0.2), (50000, 0.1)), -0.25)
    # New vehicles get a small bonus; any other type has no effect
    score += 0.1 if vehicle_type == 'New' else 0.0

    # Clamp to [0, 1] before applying the status thresholds
    score = max(min(score, 1.0), 0.0)

    if score >= 0.75:
        return 'Approved'
    if score >= 0.45:
        return 'Pending'
    return 'Rejected'

# Apply the loan approval rule to each row using DataFrame.apply
df['LoanApproved'] = df.apply(
    lambda row: calculate_approval_probability(
        row['Employment_Status'],
        row['Credit_Score'],
        row['Debt_To_IncomeRatio'],
        row['Loan_Amount'],
        row['Vehicle_Type'],
        row['Annual_Income']
    ),
    axis=1
)

# Ensure that if "LoanApproved" is "Approved", then "Drop_Off_Point" should only show "Approval"
df.loc[df['LoanApproved'] == 'Approved', 'Drop_Off_Point'] = 'Approval'

# Define possible treatments with a higher probability for "Ads" when Drop_Off_Point is "Approval"
treatments = ['Ads', 'No-Ads']

# Create a new column 'Treatment_Assignment' initialized with None
df['Treatment_Assignment'] = None

# Rows that reached the 'Approval' screen, and rows that dropped off late in the funnel
approval_condition = df['Drop_Off_Point'] == 'Approval'
non_approval_condition = df['Drop_Off_Point'].isin(['Document Upload', 'Credit Check'])

# Assign treatments with higher probability for "Ads" when 'Drop_Off_Point' is 'Approval'
df.loc[approval_condition, 'Treatment_Assignment'] = np.random.choice(
    treatments, size=approval_condition.sum(), p=[0.8, 0.2]
)

# Assign random treatment to the non-approval filtered rows
df.loc[non_approval_condition, 'Treatment_Assignment'] = np.random.choice(
    treatments, size=non_approval_condition.sum(), p=[0.5, 0.5]
)

# Assign 'No-Ads' to all other rows where 'Treatment_Assignment' is still None.
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the chained in-place form is deprecated and does not update df when pandas
# copy-on-write is enabled.
df['Treatment_Assignment'] = df['Treatment_Assignment'].fillna('No-Ads')

# Ensure Total_Assets is always greater than or equal to the sum of Savings_Account_Balance and Checking_Account_Balance
df['Total_Assets'] = np.maximum(df['Total_Assets'], df['Savings_Account_Balance'] + df['Checking_Account_Balance'])

# Recompute NetWorth now that Total_Assets may have been raised
df['NetWorth'] = np.maximum(df['Total_Assets'] - df['Total_Liabilities'], min_net_worth)

# Add some noise and outliers: inflate Annual_Income by 1.5x-2x for ~1% of rows.
# This happens after Monthly_Income, Debt_To_IncomeRatio and LoanApproved were
# computed, so those columns intentionally keep the pre-noise income.
noise_mask = np.random.choice([True, False], num_samples, p=[0.01, 0.99])
df.loc[noise_mask, 'Annual_Income'] = (
    df.loc[noise_mask, 'Annual_Income'] * np.random.uniform(1.5, 2.0, noise_mask.sum())
).astype(int)

# Spread rows sitting exactly on the net-worth floor over [floor, floor + 10000)
low_net_worth_mask = df['NetWorth'] == min_net_worth
df.loc[low_net_worth_mask, 'NetWorth'] += np.random.randint(0, 10000, size=low_net_worth_mask.sum())

# Save the updated DataFrame to a CSV file
csv_file_path = "Synthetic_Auto_Loan_Application_Data_jz3.csv"
df.to_csv(csv_file_path, index=False)



In [32]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
from faker import Faker
import random
import matplotlib.pyplot as plt
import seaborn as sns

# ==========================
# Initialization and Constants
# ==========================

# Initialize Faker for synthetic data generation
fake = Faker()

# Set random seeds for reproducibility.
# Faker keeps its own random generator, so it must be seeded separately from
# numpy and the stdlib `random` module; otherwise fake.* values change per run.
np.random.seed(42)
random.seed(42)
Faker.seed(42)

# Number of samples
NUM_SAMPLES = 20000

# Current year for calculations
CURRENT_YEAR = 2024

# Vehicle make to models mapping.
# Key order matters: VEHICLE_MAKES is built from these keys and the positional
# VEHICLE_MAKE_WEIGHTS list is matched to it index by index.
MAKE_MODEL_MAPPING = {
    'Toyota': ['Camry', 'Corolla', 'RAV4', 'Prius', 'Highlander', 'Tacoma'],
    'Honda': ['Civic', 'Accord', 'CR-V', 'Pilot', 'Fit', 'Odyssey'],
    'Ford': ['F-150', 'Escape', 'Explorer', 'Mustang', 'Fusion', 'Ranger'],
    'Chevrolet': ['Silverado', 'Equinox', 'Malibu', 'Traverse', 'Camaro', 'Tahoe'],
    'BMW': ['3 Series', '5 Series', 'X3', 'X5', '7 Series', 'X1'],
    'Mercedes-Benz': ['C-Class', 'E-Class', 'GLC', 'GLE', 'S-Class', 'GLA'],
    'Nissan': ['Altima', 'Sentra', 'Rogue', 'Versa', 'Pathfinder', 'Maxima'],
    'Hyundai': ['Elantra', 'Sonata', 'Tucson', 'Santa Fe', 'Accent', 'Kona'],
    'Kia': ['Soul', 'Optima', 'Sportage', 'Sorento', 'Rio', 'Seltos'],
    'Subaru': ['Forester', 'Outback', 'Impreza', 'Crosstrek', 'Legacy', 'Ascent'],
    'Mazda': ['CX-5', 'Mazda3', 'Mazda6', 'MX-5 Miata', 'CX-9', 'Mazda CX-30'],
    'Audi': ['A3', 'A4', 'A6', 'Q5', 'Q7', 'TT'],
    'Volkswagen': ['Golf', 'Passat', 'Tiguan', 'Jetta', 'Atlas', 'Arteon'],
    'Volvo': ['XC90', 'S60', 'S90', 'XC60', 'V60', 'V90'],
    'Porsche': ['911', 'Cayenne', 'Macan', 'Panamera', 'Taycan', 'Boxster'],
    'Jeep': ['Wrangler', 'Grand Cherokee', 'Renegade', 'Compass', 'Cherokee', 'Gladiator'],
    'Lexus': ['RX', 'ES', 'NX', 'GX', 'LS', 'IS'],
    'Acura': ['MDX', 'RDX', 'TLX', 'ILX', 'RLX', 'NSX'],
    'Cadillac': ['Escalade', 'XT5', 'CT5', 'XT4', 'ATS', 'XT6'],
    'Lincoln': ['Navigator', 'Aviator', 'Corsair', 'Nautilus', 'MKZ', 'MKC'],
    'Infiniti': ['Q50', 'QX60', 'QX80', 'Q30', 'QX50', 'QX55'],
    'Genesis': ['G70', 'G80', 'G90', 'GV70', 'GV80', 'G70 Convertible'],
    'Bentley': ['Continental', 'Flying Spur', 'Bentayga', 'Mulsanne', 'Azure'],
    'Maserati': ['Ghibli', 'Quattroporte', 'Levante', 'GranTurismo', 'MC20'],
    'Alfa Romeo': ['Giulia', 'Stelvio', '4C', 'Giulietta', 'Tonale', 'GT'],
    'Fiat': ['500', 'Panda', '124 Spider', 'Tipo', '500X', '500L'],
    'Mitsubishi': ['Outlander', 'Eclipse Cross', 'Mirage', 'Galant', 'Lancer', 'ASX'],
    'Mini': ['Cooper', 'Countryman', 'Clubman', 'Convertible', 'Hardtop'],
    'Ram': ['1500', '2500', '3500', 'ProMaster', 'Chassis Cab'],
    'Suzuki': ['Swift', 'Vitara', 'Jimny', 'Baleno', 'Celerio', 'S-Cross']
}

# Vehicle make weights for probability distribution (relative, not normalised).
# Positional, in MAKE_MODEL_MAPPING key order:
#   row 1: Toyota, Honda, Ford, Chevrolet, BMW, Mercedes-Benz, Nissan, Hyundai,
#          Kia, Subaru, Mazda, Audi, Volkswagen, Volvo, Porsche
#   row 2: Jeep, Lexus, Acura, Cadillac, Lincoln, Infiniti, Genesis, Bentley,
#          Maserati, Alfa Romeo, Fiat, Mitsubishi, Mini, Ram, Suzuki
VEHICLE_MAKE_WEIGHTS = [
    10, 9, 8, 7, 6, 5, 6, 5, 5, 4, 3, 3, 2, 2, 1,
    4, 3, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1
]

VEHICLE_MAKES = list(MAKE_MODEL_MAPPING.keys())

# Ensure VEHICLE_MAKE_WEIGHTS matches the length of VEHICLE_MAKES
# (catches a make being added or removed without updating the weights)
assert len(VEHICLE_MAKE_WEIGHTS) == len(VEHICLE_MAKES), "Weights and makes length mismatch."

# Base price mapping for vehicle makes: one price per make, independent of
# model (currency presumably USD — confirm). Keys mirror MAKE_MODEL_MAPPING.
BASE_PRICE_MAPPING = {
    'Toyota': 25000,
    'Honda': 24000,
    'Ford': 26000,
    'Chevrolet': 25500,
    'BMW': 45000,
    'Mercedes-Benz': 47000,
    'Nissan': 23000,
    'Hyundai': 22000,
    'Kia': 21000,
    'Subaru': 23500,
    'Mazda': 22500,
    'Audi': 44000,
    'Volkswagen': 20000,
    'Volvo': 42000,
    'Porsche': 60000,
    'Jeep': 28000,
    'Lexus': 43000,
    'Acura': 39000,
    'Cadillac': 50000,
    'Lincoln': 48000,
    'Infiniti': 40000,
    'Genesis': 41000,
    'Bentley': 90000,
    'Maserati': 85000,
    'Alfa Romeo': 37000,
    'Fiat': 18000,
    'Mitsubishi': 19000,
    'Mini': 22000,
    'Ram': 30000,
    'Suzuki': 17000
}

# Treatment options
TREATMENTS = ['Ads', 'No-Ads']

# Event sequences and screens for user interactions.
# The two lists differ only in their first entry ('Application Start' vs 'Home')
# and in the order of 'Vehicle Selection' / 'Loan Calculator'.
EVENT_SEQUENCES = ['Application Start', 'Vehicle Selection', 'Loan Calculator', 'Document Upload', 'Credit Check', 'Approval']
SCREENS = ['Home', 'Loan Calculator', 'Vehicle Selection', 'Document Upload', 'Credit Check', 'Approval']

# ==========================
# Helper Functions
# ==========================

def generate_correlated_features(num_samples):
    """
    Generate correlated personal and financial features.
    """
    # Age: Normal distribution, clipped between 18 and 80
    age = np.random.normal(40, 12, num_samples).clip(18, 80).astype(int)
    
    # Experience: Based on Age, ensuring non-negative
    experience = (age - 18 - np.random.normal(4, 2, num_samples)).clip(0).astype(int)
    
    # Education Level with predefined probabilities
    education_levels = ['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']
    education_probs = [0.3, 0.2, 0.3, 0.15, 0.05]
    education_level = np.random.choice(education_levels, num_samples, p=education_probs)
    
    # Education Impact on Income and Credit Score
    edu_impact = {'High School': 0, 'Associate': 0.1, 'Bachelor': 0.2, 'Master': 0.3, 'Doctorate': 0.4}
    edu_factor = np.array([edu_impact[level] for level in education_level])
    
    # Annual Income: Log-normal distribution influenced by education and experience
    base_income = np.random.lognormal(10.5, 0.6, num_samples) * (1 + edu_factor) * (1 + experience / 100)
    income_noise = np.random.normal(0, 0.1, num_samples)
    annual_income = (base_income * (1 + income_noise)).clip(15000, 300000).astype(int)
    
    # Credit Score: Influenced by education, experience, and income
    credit_score_base = 300 + 300 * stats.beta.rvs(5, 1.5, size=num_samples)
    credit_score = (credit_score_base + edu_factor * 100 + experience * 1.5 + income_noise * 100).clip(300, 850).astype(int)
    
    # Employment Status based on education
    employment_status_probs = np.column_stack([
        0.9 - edu_factor * 0.3,  # Employed
        0.05 + edu_factor * 0.2,  # Self-Employed
        0.05 + edu_factor * 0.1   # Unemployed
    ])
    employment_status_probs /= employment_status_probs.sum(axis=1, keepdims=True)  # Normalize
    
    employment_status = np.array(['Employed', 'Self-Employed', 'Unemployed'])[
        np.argmax(np.random.random((num_samples, 1)) < employment_status_probs.cumsum(axis=1), axis=1)
    ]
    
    return age, experience, education_level, annual_income, credit_score, employment_status

def generate_application_dates(num_samples):
    """
    Build a list of `num_samples` consecutive daily datetimes.

    The first entry is midnight on 2018-01-01; each following entry is one
    day later than the previous one.
    """
    first_day = datetime(2018, 1, 1)
    application_days = []
    for day_offset in range(num_samples):
        application_days.append(first_day + timedelta(days=day_offset))
    return application_days

def assign_vehicle_model(make):
    """
    Pick one model at random for the given make.

    A make that is not a key of MAKE_MODEL_MAPPING gets the placeholder
    'Model_Not_Specified'.
    """
    if make in MAKE_MODEL_MAPPING:
        candidate_models = MAKE_MODEL_MAPPING[make]
    else:
        candidate_models = ['Model_Not_Specified']
    return random.choice(candidate_models)

def generate_vehicle_features(num_samples, vehicle_makes):
    """
    Generate type, model, year and mileage for each vehicle.

    Returns (vehicle_types, vehicle_models, vehicle_years, vehicle_mileage).
    `vehicle_models` is a list with one entry per element of `vehicle_makes`;
    the other three are arrays of length `num_samples`.
    """
    # Model: one random pick per make
    vehicle_models = list(map(assign_vehicle_model, vehicle_makes))

    # Condition: 40% New, 60% Used
    vehicle_types = np.random.choice(['New', 'Used'], size=num_samples, p=[0.4, 0.6])
    is_new = vehicle_types == 'New'

    # Year: both candidate arrays are drawn for every row, then the mask picks
    # 2018..CURRENT_YEAR for new vehicles and 2005..2017 for used ones
    years_if_new = np.random.randint(2018, CURRENT_YEAR + 1, num_samples)
    years_if_used = np.random.randint(2005, 2018, num_samples)
    vehicle_years = np.where(is_new, years_if_new, years_if_used)

    # Mileage: new vehicles get 0-30,000; used vehicles get 30,000 plus
    # 12,000 per year of age with a +/-10,000 jitter, never below 30,000
    vehicle_age = CURRENT_YEAR - vehicle_years
    mileage_if_new = np.random.randint(0, 30001, num_samples)
    used_jitter = np.random.randint(-10000, 10001, num_samples)
    mileage_if_used = np.maximum(30000 + (vehicle_age * 12000) + used_jitter, 30000)
    vehicle_mileage = np.where(is_new, mileage_if_new, mileage_if_used).astype(int)

    return vehicle_types, vehicle_models, vehicle_years, vehicle_mileage

def generate_vehicle_price(num_samples, vehicle_makes, vehicle_types, vehicle_years, vehicle_mileage):
    """
    Price each vehicle from its make, condition, year and mileage.

    New vehicles keep the make's base price. Used vehicles lose 5% per year
    of age plus up to 20% for mileage (reached at 150,000), with the combined
    loss capped at 80%. Every price then gets a uniform +/-5% jitter, is
    floored at 1,000 and truncated to an integer.
    """
    # Makes without an entry in BASE_PRICE_MAPPING fall back to 20,000
    list_prices = np.array([BASE_PRICE_MAPPING.get(make, 20000) for make in vehicle_makes])

    # Fraction of value lost to age and mileage
    years_old = CURRENT_YEAR - vehicle_years
    age_loss = 0.05 * years_old
    mileage_loss = 0.2 * np.minimum(vehicle_mileage / 150000, 1)
    total_loss = np.minimum(age_loss + mileage_loss, 0.8)

    # Depreciation applies to used vehicles only
    depreciated_prices = list_prices * (1 - total_loss)
    prices = np.where(vehicle_types == 'New', list_prices, depreciated_prices)

    # Uniform +/-5% jitter, then floor and integer truncation
    jitter = np.random.uniform(0.95, 1.05, num_samples)
    return np.maximum(prices * jitter, 1000).astype(int)

def generate_loan_amount(prices, vehicle_types):
    """
    Generate loan amounts based on vehicle price and type.

    Each loan is the vehicle price times a loan-to-value ratio drawn from
    0.8-1.0 for 'New' vehicles and 0.5-0.8 for every other type, then
    jittered by a uniform +/-5%, capped at the vehicle price and truncated
    to an integer.

    Parameters:
        prices: 1-D array-like of vehicle prices.
        vehicle_types: array-like of the same length; entries equal to 'New'
            get the new-vehicle ratio range.

    Returns:
        Integer ndarray of loan amounts, one per price.
    """
    prices = np.asarray(prices)
    vehicle_types = np.asarray(vehicle_types)

    # Size every draw from the input rather than the module-level NUM_SAMPLES,
    # so the helper works for inputs of any length.
    num_rows = len(prices)

    ltv_new = np.random.uniform(0.8, 1.0, num_rows)
    ltv_used = np.random.uniform(0.5, 0.8, num_rows)
    ltv = np.where(vehicle_types == 'New', ltv_new, ltv_used)

    loan_amount = prices * ltv
    loan_amount = loan_amount * np.random.uniform(0.95, 1.05, num_rows)  # Add randomness ±5%

    # The jitter can push a high-LTV loan above the price; cap it there
    return np.minimum(loan_amount, prices).astype(int)

def generate_interest_rate(credit_scores, loan_amounts, loan_tenures, annual_incomes):
    """
    Generate interest rates based on multiple financial factors.

    Starts from a 2.0 base rate and adds a term for credit score below 850,
    loan amount above 5,000 and tenure above 3 years, subtracts a term for
    income above 50,000, adds uniform noise in [-0.3, 0.3), then clips to
    1.9-6.5 and rounds to two decimals.

    Parameters:
        credit_scores, loan_amounts, loan_tenures, annual_incomes:
            arrays of matching shape, one entry per applicant.

    Returns:
        Array of interest rates with the same shape as the inputs.
    """
    base_rate = 2.0
    credit_factor = (850 - credit_scores) / 2000
    loan_factor = (loan_amounts - 5000) / 100000
    tenure_factor = (loan_tenures - 3) * 0.2
    income_factor = (annual_incomes - 50000) / 200000

    interest = base_rate + credit_factor + loan_factor + tenure_factor - income_factor

    # Shape the noise from the computed rates rather than the module-level
    # NUM_SAMPLES, so the helper works for inputs of any length.
    interest = interest + np.random.uniform(-0.3, 0.3, np.shape(interest))  # Add randomness

    return np.clip(interest, 1.9, 6.5).round(2)

def calculate_monthly_loan_payment(loan_amount, interest_rate, tenure_years):
    """
    Calculate monthly loan payment using the amortization formula.

    For a positive rate the payment is P * r / (1 - (1 + r) ** -n), with r
    the monthly rate and n the number of monthly payments. A loan whose rate
    is not positive has no interest to amortize, so its payment is the
    principal spread evenly over the term (P / n).

    Parameters:
        loan_amount: principal; scalar or array-like.
        interest_rate: annual rate in percent (e.g. 6.0 for 6%); scalar or array-like.
        tenure_years: loan term in years; scalar or array-like.

    Returns:
        ndarray of monthly payments rounded to two decimals (0-d for scalar inputs).
    """
    principal = np.asarray(loan_amount, dtype=float)
    monthly_rate = np.asarray(interest_rate, dtype=float) / 100 / 12
    num_payments = np.asarray(tenure_years) * 12

    has_interest = monthly_rate > 0

    # Substitute a harmless rate on interest-free rows so the amortization
    # formula never evaluates 0 / 0; those rows are replaced below anyway.
    safe_rate = np.where(has_interest, monthly_rate, 1.0)
    amortized = (principal * safe_rate) / (1 - (1 + safe_rate) ** -num_payments)

    interest_free = principal / num_payments

    return np.where(has_interest, amortized, interest_free).round(2)

def calculate_debt_to_income_ratio(monthly_debt, monthly_loan_payment, monthly_income):
    """
    Return the share of monthly income taken up by debt service.

    The numerator is the existing monthly debt plus the new loan payment;
    the denominator is the monthly income.
    """
    total_monthly_obligations = monthly_debt + monthly_loan_payment
    return total_monthly_obligations / monthly_income

def calculate_net_worth(total_assets, total_liabilities, min_net_worth=1000):
    """
    Return assets minus liabilities, floored at `min_net_worth`.
    """
    assets_minus_liabilities = total_assets - total_liabilities
    return np.maximum(assets_minus_liabilities, min_net_worth)

def calculate_approval_status(row):
    """
    Determine the approval status for one application row.

    Reads the relevant columns from `row` and hands them to
    calculate_approval_probability as keyword arguments.
    """
    # keyword argument name -> column it is read from
    column_for_argument = {
        'employment_status': 'Employment_Status',
        'credit_score': 'Credit_Score',
        'dti': 'Debt_To_IncomeRatio',
        'loan_amount': 'Loan_Amount',
        'vehicle_type': 'Vehicle_Type',
        'annual_income': 'Annual_Income',
    }
    arguments = {name: row[column] for name, column in column_for_argument.items()}
    return calculate_approval_probability(**arguments)

def calculate_approval_probability(employment_status, credit_score, dti, loan_amount, vehicle_type, annual_income):
    """
    Score an application and map the score to an approval status.

    Each factor contributes a fixed adjustment. The adjustments are added up,
    the total is clamped to [0, 1] and bucketed: 0.75 or more is 'Approved',
    0.45 or more is 'Pending', anything lower is 'Rejected'. Despite the
    function name, the return value is the status string, not the score.
    """
    # Employment status (an unrecognised status contributes nothing)
    employment_points = {
        'Employed': 0.2,
        'Self-Employed': 0.15,
        'Unemployed': -0.25,
    }.get(employment_status, 0.0)

    # Credit score bands, highest first
    if credit_score >= 750:
        credit_points = 0.35
    elif credit_score >= 700:
        credit_points = 0.25
    elif credit_score >= 650:
        credit_points = 0.15
    else:
        credit_points = -0.35

    # Debt-to-income bands, lowest first
    if dti <= 0.25:
        dti_points = 0.35
    elif dti <= 0.35:
        dti_points = 0.25
    elif dti <= 0.45:
        dti_points = 0.15
    else:
        dti_points = -0.35

    # Loan amount bands
    if loan_amount <= 20000:
        loan_points = 0.15
    elif loan_amount <= 40000:
        loan_points = 0.1
    else:
        loan_points = -0.25

    # Annual income bands, highest first
    if annual_income >= 100000:
        income_points = 0.25
    elif annual_income >= 75000:
        income_points = 0.2
    elif annual_income >= 50000:
        income_points = 0.1
    else:
        income_points = -0.25

    # New vehicles get a small bonus
    vehicle_points = 0.1 if vehicle_type == 'New' else 0.0

    # Add the adjustments one at a time, in a fixed order, so totals that sit
    # on a threshold always round the same way
    score = 0.0
    for points in (employment_points, credit_points, dti_points,
                   loan_points, income_points, vehicle_points):
        score += points
    score = np.clip(score, 0.0, 1.0)

    if score < 0.45:
        return 'Rejected'
    if score < 0.75:
        return 'Pending'
    return 'Approved'

# ==========================
# Data Generation
# ==========================

# NOTE: the statements below consume the random generators in sequence, so
# reordering them changes every value generated afterwards for a given seed.

# Generate correlated personal and financial features
# (six parallel arrays, one entry per applicant)
age, experience, education_level, annual_income, credit_score, employment_status = generate_correlated_features(NUM_SAMPLES)

# Generate application dates
application_dates = generate_application_dates(NUM_SAMPLES)

# Generate Vehicle Make based on weighted probabilities
# The raw weights are divided by their total so that they sum to 1 before
# being passed as `p`.
vehicle_make_probabilities = np.array(VEHICLE_MAKE_WEIGHTS) / sum(VEHICLE_MAKE_WEIGHTS)
vehicle_makes = np.random.choice(VEHICLE_MAKES, size=NUM_SAMPLES, p=vehicle_make_probabilities)

# Generate vehicle-related features
# (type, model, year and mileage for each sampled make)
vehicle_types, vehicle_models, vehicle_years, vehicle_mileage = generate_vehicle_features(NUM_SAMPLES, vehicle_makes)

# Generate vehicle prices
vehicle_prices = generate_vehicle_price(NUM_SAMPLES, vehicle_makes, vehicle_types, vehicle_years, vehicle_mileage)

# Generate loan amounts
# (a fraction of each vehicle price that depends on the vehicle type)
loan_amounts = generate_loan_amount(vehicle_prices, vehicle_types)

# Generate loan tenures based on loan amounts
def generate_loan_tenures(loan_amounts):
    """
    Generate loan tenure in years based on loan amount.

    Larger loans draw from longer terms:
      - above 30,000: 5, 6 or 7 years (weights 0.5 / 0.3 / 0.2)
      - above 20,000 up to 30,000: 4, 5 or 6 years (weights 0.4 / 0.4 / 0.2)
      - 20,000 or less: 3, 4 or 5 years (weights 0.5 / 0.3 / 0.2)

    Parameters:
        loan_amounts: 1-D array-like of loan amounts.

    Returns:
        Integer ndarray of tenures, one per loan amount.
    """
    loan_amounts = np.asarray(loan_amounts)

    # Size the output from the input rather than the module-level NUM_SAMPLES,
    # so the helper works for inputs of any length.
    tenure = np.zeros(len(loan_amounts), dtype=int)

    # (row mask, candidate tenures, weights)
    tenure_bands = [
        (loan_amounts > 30000, [5, 6, 7], [0.5, 0.3, 0.2]),
        ((loan_amounts > 20000) & (loan_amounts <= 30000), [4, 5, 6], [0.4, 0.4, 0.2]),
        (loan_amounts <= 20000, [3, 4, 5], [0.5, 0.3, 0.2]),
    ]
    for in_band, candidate_years, weights in tenure_bands:
        tenure[in_band] = np.random.choice(candidate_years, size=in_band.sum(), p=weights)

    return tenure

loan_tenures = generate_loan_tenures(loan_amounts)

# Generate interest rates
interest_rates = generate_interest_rate(credit_score, loan_amounts, loan_tenures, annual_income)

# Generate Loan Duration in months for amortization calculations
# NOTE(review): this name is not referenced again in the visible part of this file.
loan_duration_months = loan_tenures * 12

# Generate monthly loan payments
# NOTE(review): not referenced again in the visible part of this file, apart
# from the ratio computed below; the DataFrame section recomputes the payment
# from its own columns.
monthly_loan_payments = calculate_monthly_loan_payment(loan_amounts, interest_rates, loan_tenures)

# Generate monthly debt payments
# (log-normal draw truncated to whole numbers)
monthly_debt_payments = np.random.lognormal(6, 0.5, NUM_SAMPLES).astype(int)

# Calculate Debt-To-Income Ratio
# NOTE(review): not referenced again in the visible part of this file; the
# DataFrame section recomputes the ratio from its own columns.
dti_ratio = calculate_debt_to_income_ratio(monthly_debt_payments, monthly_loan_payments, annual_income / 12)

# Generate location data
# (one city name from Faker per sample)
locations = [fake.city() for _ in range(NUM_SAMPLES)]

# ==========================
# Additional Features
# ==========================

# Value pools for the categorical columns generated below
marital_statuses = [
    'Single',
    'Married',
    'Divorced',
    'Widowed',
]
device_types = [
    'iPhone',
    'Android',
    'Windows Phone',
]
os_versions = [
    'iOS 15',
    'iOS 14',
    'Android 11',
    'Android 10',
    'Windows 10 Mobile',
]
app_versions = [
    '1.0',
    '1.1',
    '1.2',
]
network_types = [
    'Wi-Fi',
    '4G',
    '5G',
]
dealer_info = [
    'Dealer A',
    'Dealer B',
    'Dealer C',
    'Dealer D',
]
promotions = [
    '0% APR',
    '$1000 Cashback',
    'No Payments for 90 Days',
    'Low Down Payment',
]
behavioral_segments = [
    'Low Engagement',
    'Medium Engagement',
    'High Engagement',
]
user_types = [
    'New',
    'Returning',
]
# None stands for "no issue reported"
common_issues = [
    None,
    'Document Upload Failed',
    'Credit Check Issue',
    'App Crash',
]
user_satisfactions = [
    'Very Satisfied',
    'Satisfied',
    'Neutral',
    'Dissatisfied',
    'Very Dissatisfied',
]

# Generate the data dictionary
# Every value is a sequence with NUM_SAMPLES entries; the keys become the
# DataFrame's column names. Entries are evaluated top to bottom, so their
# order also fixes the order in which random numbers are consumed.
data = {
    'User_ID': [fake.uuid4() for _ in range(NUM_SAMPLES)],
    'ApplicationDate': application_dates,
    'Age': age,
    'Gender': np.random.choice(['Male', 'Female'], size=NUM_SAMPLES),
    'Annual_Income': annual_income,
    'Credit_Score': credit_score,
    'Employment_Status': employment_status,
    'Education_Level': education_level,
    'Experience': experience,
    'Loan_Amount': loan_amounts,
    'Loan_Tenure_Years': loan_tenures,
    # Duration in months is derived from the tenure so the two columns always
    # agree; an independent random draw here could contradict
    # Loan_Tenure_Years on the same row.
    'Loan_Duration': loan_tenures * 12,
    'Marital_Status': np.random.choice(marital_statuses, NUM_SAMPLES, p=[0.3, 0.5, 0.15, 0.05]),
    'Number_Of_Dependents': np.random.choice([0, 1, 2, 3, 4, 5], NUM_SAMPLES, p=[0.3, 0.25, 0.2, 0.15, 0.07, 0.03]),
    'Home_Ownership_Status': np.random.choice(['Own', 'Rent', 'Mortgage', 'Other'], NUM_SAMPLES, p=[0.2, 0.3, 0.4, 0.1]),
    'Monthly_Debt_Payments': monthly_debt_payments,
    'Credit_Card_Utilization_Rate': np.random.beta(2, 5, NUM_SAMPLES),
    'Number_Of_Open_CreditLines': np.random.poisson(3, NUM_SAMPLES).clip(0, 15).astype(int),
    'Number_Of_Credit_Inquiries': np.random.poisson(1, NUM_SAMPLES).clip(0, 10).astype(int),
    'Bankruptcy_History': np.random.choice([0, 1], NUM_SAMPLES, p=[0.95, 0.05]),
    'Previous_Loan_Defaults': np.random.choice([0, 1], NUM_SAMPLES, p=[0.9, 0.1]),
    'Payment_History': np.random.poisson(24, NUM_SAMPLES).clip(0, 60).astype(int),
    'Length_Of_CreditHistory': np.random.randint(1, 30, NUM_SAMPLES),
    'Savings_Account_Balance': np.random.lognormal(8, 1, NUM_SAMPLES).astype(int),
    'Checking_Account_Balance': np.random.lognormal(7, 1, NUM_SAMPLES).astype(int),
    'Total_Assets': np.random.lognormal(11, 1, NUM_SAMPLES).astype(int),
    'Total_Liabilities': np.random.lognormal(10, 1, NUM_SAMPLES).astype(int),
    'Monthly_Income': annual_income / 12,
    'Utility_Bills_Payment_History': np.random.beta(8, 2, NUM_SAMPLES),
    'Job_Tenure': np.random.poisson(5, NUM_SAMPLES).clip(0, 40).astype(int),

    # Vehicle-related features
    'Location': locations,
    'Vehicle_Type': vehicle_types,
    'Vehicle_Make': vehicle_makes,
    'Vehicle_Model': vehicle_models,
    'Vehicle_Year': vehicle_years,
    'Vehicle_Mileage': vehicle_mileage,
    'Vehicle_Price': vehicle_prices,
    # Per-row bounds: between 10% and 30% of each vehicle's price
    'Down_Payment': np.random.randint(0.1 * vehicle_prices, 0.3 * vehicle_prices + 1).astype(int),

    # Interest Rate
    'Interest_Rate': interest_rates,

    # Session and Interaction Features
    'Session_Duration_Minutes': np.random.randint(5, 60, size=NUM_SAMPLES),
    'Number_of_Interactions': np.random.randint(10, 100, size=NUM_SAMPLES),
    'Notifications_Responded': np.random.choice([0, 1], size=NUM_SAMPLES, p=[0.7, 0.3]),
    'Support_Queries': np.random.choice([0, 1, 2, 3], size=NUM_SAMPLES, p=[0.5, 0.3, 0.15, 0.05]),
    'Application_Submitted': np.random.choice([True, False], size=NUM_SAMPLES, p=[0.8, 0.2])
}

# Additional Features

# Session window: each end time is the start time plus that row's
# Session_Duration_Minutes, so a session can never end before it starts and
# the three session columns stay consistent with each other.
session_start_times = [fake.date_time_this_year() for _ in range(NUM_SAMPLES)]
session_end_times = [
    started_at + timedelta(minutes=int(duration_minutes))
    for started_at, duration_minutes in zip(session_start_times, data['Session_Duration_Minutes'])
]

additional_data = {
    "Monthly_Expenses": np.random.randint(1000, 10000, size=NUM_SAMPLES),
    "Previous_Vehicle_Ownership": np.random.choice([True, False], size=NUM_SAMPLES, p=[0.7, 0.3]),
    "Trade_In_Details": np.random.choice([None, 'Old Car Trade-In'], size=NUM_SAMPLES, p=[0.7, 0.3]),
    "Session_Start_Time": session_start_times,
    "Session_End_Time": session_end_times,
    "Navigation_Paths": [random.sample(EVENT_SEQUENCES, k=random.randint(3, len(EVENT_SEQUENCES))) for _ in range(NUM_SAMPLES)],
    "Device_Type": np.random.choice(device_types, size=NUM_SAMPLES),
    "OS_Version": np.random.choice(os_versions, size=NUM_SAMPLES),
    "App_Version": np.random.choice(app_versions, size=NUM_SAMPLES),
    "Network_Type": np.random.choice(network_types, size=NUM_SAMPLES),
    "Dealer_Info": np.random.choice(dealer_info, size=NUM_SAMPLES),
    "Promotions": np.random.choice(promotions, size=NUM_SAMPLES),
    "Regulatory_Compliance": np.random.choice(['Compliant', 'Non-Compliant'], size=NUM_SAMPLES, p=[0.95, 0.05]),
    "Consent_Provided": np.random.choice([True, False], size=NUM_SAMPLES, p=[0.98, 0.02]),
    "User_Type": np.random.choice(user_types, size=NUM_SAMPLES),
    "Behavioral_Segment": np.random.choice(behavioral_segments, size=NUM_SAMPLES),
    # randint's upper bound is exclusive: 6 is needed for ratings 1 through 5
    "User_Feedback_Rating": np.random.randint(1, 6, size=NUM_SAMPLES),
    "Common_Issues_Faced": np.random.choice(common_issues, size=NUM_SAMPLES, p=[0.7, 0.1, 0.1, 0.1]),
    "User_Satisfaction": np.random.choice(user_satisfactions, size=NUM_SAMPLES)
}

# Update main data dictionary with additional features
data.update(additional_data)

# Interaction Event Data
# Integer counters come first: each is drawn uniformly from 1 up to, but not
# including, the bound listed next to it. Columns are generated in the order
# listed.
interaction_data = {
    column: np.random.randint(1, upper_bound, size=NUM_SAMPLES)
    for column, upper_bound in (
        ("Frequency_of_App_Usage", 30),
        ("Clicks", 50),
        ("Taps", 50),
        ("Swipes", 50),
        ("Form_Entries", 20),
        ("Time_Spent_on_Home_Screen_Minutes", 10),
        ("Time_Spent_on_Loan_Calculator_Minutes", 15),
        ("Time_Spent_on_Vehicle_Selection_Minutes", 20),
        ("Time_Spent_on_Document_Upload_Minutes", 10),
        ("Time_Spent_on_Credit_Check_Minutes", 5),
        ("Time_Spent_on_Approval_Screen_Minutes", 5),
    )
}

# Screens visited: a random ordering of 3 to len(SCREENS) distinct screens
interaction_data["Common_Paths"] = [
    random.sample(SCREENS, k=random.randint(3, len(SCREENS)))
    for _ in range(NUM_SAMPLES)
]

# Where the user abandoned the flow; None means no drop-off
interaction_data["Drop_Off_Point"] = np.random.choice(
    SCREENS + [None],
    size=NUM_SAMPLES,
    p=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.4],  # 40% complete all steps
)

interaction_data["Comparison_of_Loan_Options"] = np.random.choice(
    [True, False], size=NUM_SAMPLES, p=[0.6, 0.4]
)

# Merge the interaction columns into the main data dictionary
data.update(interaction_data)

# ==========================
# Create DataFrame
# ==========================

# One row per sample, one column per key of `data`
df = pd.DataFrame(data)

# ==========================
# Feature Engineering
# ==========================

# Calculate Net Worth
# Total assets are first raised to at least savings + checking, so the total
# is never smaller than the two account balances combined.
df['Total_Assets'] = np.maximum(df['Total_Assets'], df['Savings_Account_Balance'] + df['Checking_Account_Balance'])
df['NetWorth'] = calculate_net_worth(df['Total_Assets'], df['Total_Liabilities'])

# Calculate Monthly Loan Payment
df['MonthlyLoanPayment'] = calculate_monthly_loan_payment(df['Loan_Amount'], df['Interest_Rate'], df['Loan_Tenure_Years'])

# Recalculate Debt-To-Income Ratio
df['Debt_To_IncomeRatio'] = calculate_debt_to_income_ratio(df['Monthly_Debt_Payments'], df['MonthlyLoanPayment'], df['Monthly_Income'])

# Add Noise and Outliers
# Add noise to Annual Income
# About 1% of rows have their income multiplied by a factor between 1.5 and 2.0.
# NOTE(review): Monthly_Income and Debt_To_IncomeRatio were computed above and
# are not refreshed here, so on these rows they no longer match Annual_Income.
# Confirm this is intended.
noise_mask = np.random.choice([True, False], NUM_SAMPLES, p=[0.01, 0.99])
df.loc[noise_mask, 'Annual_Income'] = (
    df.loc[noise_mask, 'Annual_Income'] * np.random.uniform(1.5, 2.0, noise_mask.sum())
).astype(int)

# Add random net worth to low net worth cases
# Rows sitting exactly at 1000 (the default floor of calculate_net_worth) get
# a random amount from 0 to 9999 added so they do not all share one value.
low_net_worth_mask = df['NetWorth'] == 1000
df.loc[low_net_worth_mask, 'NetWorth'] += np.random.randint(0, 10000, size=low_net_worth_mask.sum())

# ==========================
# Loan Approval Processing
# ==========================

# Score every application row and store the resulting status
df['LoanApproved'] = df.apply(calculate_approval_status, axis=1)

# An approved application reached the end of the funnel, so its drop-off
# point is overwritten with 'Approval'
df.loc[df['LoanApproved'].eq('Approved'), 'Drop_Off_Point'] = 'Approval'

# Assign Treatment Based on Drop-Off Point
def assign_treatment(row):
    """
    Choose the treatment arm for one row from its drop-off point.

    'Approval' rows draw from TREATMENTS with weights 0.8 / 0.2, rows that
    dropped off at 'Document Upload' or 'Credit Check' draw with weights
    0.5 / 0.5, and every other row gets 'No-Ads' without a random draw.
    """
    drop_off = row['Drop_Off_Point']
    if drop_off == 'Approval':
        arm_weights = [0.8, 0.2]
    elif drop_off in ('Document Upload', 'Credit Check'):
        arm_weights = [0.5, 0.5]
    else:
        return 'No-Ads'
    return np.random.choice(TREATMENTS, p=arm_weights)

df['Treatment_Assignment'] = df.apply(assign_treatment, axis=1)

# ==========================
# Final Adjustments
# ==========================

# Ensure Total Assets >= Savings + Checking
# NOTE(review): the same adjustment is applied at the start of the feature
# engineering section, and none of these three columns is modified in
# between, so this looks like a no-op safeguard. Confirm before removing.
df['Total_Assets'] = np.maximum(df['Total_Assets'], df['Savings_Account_Balance'] + df['Checking_Account_Balance'])

# ==========================
# Save to CSV
# ==========================

# Written relative to the current working directory; the DataFrame index is
# not included in the file.
csv_file_path = "Synthetic_Auto_Loan_Application_Data_jz3.csv"
df.to_csv(csv_file_path, index=False)

# ==========================
# Optional: Display DataFrame Info
# ==========================

# Uncomment the following lines if you want to see a summary of the generated DataFrame
# print(df.head())
# print(df.info())
