# Generate synthetic data for the Citi Workshop

This notebook uses the Faker library to generate synthetic data. See the [Faker documentation](https://faker.readthedocs.io/en/master/).

In [1]:
# Code to generate the synthetic dataset.
# Usage: generate_dataset(num_records, seed)
#    - num_records defaults to 1000
#    - seed defaults to 1212
#    - returns a pandas DataFrame

import pandas as pd
import random
import uuid
import numpy as np
from faker import Faker

# Module-level Faker instance used by generate_dataset for the identity
# fields (name, address, phone, email, SSN, customer key).
# Alternative multi-locale setup, kept for reference:
# fake = Faker(['en_US', 'en_CA'])
fake = Faker('en_US')

def generate_flag(probability):
    """Return "Y" with the given probability, otherwise "N".

    Consumes exactly one draw from the global `random` generator.
    """
    if random.random() < probability:
        return "Y"
    return "N"

def generate_random_value(probability, min_val, max_val):
    """Return a random integer in [min_val, max_val] with the given probability, else 0.

    The probability gate is drawn first; the integer is only drawn when the
    gate passes, so a 0 result consumes a single random number.
    """
    if random.random() < probability:
        return random.randint(min_val, max_val)
    return 0

def generate_documentation_completed_flag(call_id):
    """Return the documentation-completed flag ("Y"/"N") for a call id.

    - call_id 3 always yields "Y" (no random draw).
    - call_id 1 or 2 yields "Y" with probability 0.3.
    - any other call_id yields "N" (no random draw).
    """
    if call_id == 3:
        return "Y"
    if call_id in [1, 2] and random.random() < 0.3:
        return "Y"
    return "N"

def generate_employment_type():
    """Return a one-letter employment type code.

    With probability 0.95 the code is picked uniformly from the common
    codes; otherwise it is picked uniformly from the rare codes.
    """
    frequent_codes = ["P", "S", "L", "N", "B"]
    infrequent_codes = ["E", "O", "F", "M", "U", "R", "H", "T"]

    # Share of records that receive a frequent code (95%); tune to change the mix.
    frequent_share = 0.95

    pool = frequent_codes if random.random() < frequent_share else infrequent_codes
    return random.choice(pool)

def generate_wps_lifetime():
    """Return "0" or "L", weighted 99:1 in favour of "0"."""
    outcomes = ["0", "L"]
    odds = [99, 1]
    (picked,) = random.choices(outcomes, weights=odds)
    return picked

def generate_customer_status():
    """Return a customer status code: "O", "C" or "S", weighted 75:20:5."""
    status_codes = ["O", "C", "S"]
    status_weights = [75, 20, 5]
    (status,) = random.choices(status_codes, weights=status_weights)
    return status

def generate_bad_flag():
    """Return 1 ("Bad" / delinquent customer) with probability 0.06, else 0."""
    bad_rate = 0.06  # 6% bad customers
    return int(random.random() < bad_rate)

def generate_documented_income():
    """Return an integer income drawn from a normal distribution, kept within [3000, 15000].

    Values are drawn from gauss(mean=9000, std=3000), truncated to int, and
    redrawn until one lands inside the allowed range. Out-of-range draws are
    discarded rather than clamped.
    """
    mean_income, income_std = 9000, 3000
    while True:
        candidate = int(random.gauss(mean_income, income_std))
        if 3000 <= candidate <= 15000:
            return candidate

def generate_customer_age():
    """Return an integer age drawn from gauss(mean=40, std=12), kept within [16, 75].

    The accepted range is deliberately wider than [18, 70] so that some
    records fall outside it for the cutoff demo. Out-of-range draws are
    discarded and redrawn.
    """
    while True:
        candidate = int(random.gauss(40, 12))  # Mean = 40, Std Dev = 12
        if 16 <= candidate <= 75:
            return candidate

def generate_pledged_amount():
    """Return an integer pledged amount in the range 0-2000, skewed toward low values.

    The amount is a Beta(3, 10) draw scaled by 2000. Because alpha < beta the
    distribution is right-skewed: most of the mass sits near the low end
    (the Beta mean is 3 / 13, i.e. roughly 460 after scaling).

    The draw uses the stdlib `random` generator rather than `np.random`, so
    the value is reproducible whenever the caller has called `random.seed`.
    """
    right_skewed_value = random.betavariate(3, 10)  # alpha < beta -> right skew
    return int(right_skewed_value * 2000)  # Scale to range 0-2000

def generate_dataset(num_records=1000, seed=1212):
    """Generate a synthetic customer dataset with a conditional BAD label.

    Args:
        num_records: Number of customer records (rows) to generate.
        seed: Seed applied to both Faker and the stdlib `random` module
            before any record is generated.

    Returns:
        pd.DataFrame with one row per customer. The last column, BAD, is 1
        or 0, drawn with odds `bad_weight : 50`, where `bad_weight` sums the
        risk factors below.

    Risk factors contributing to `bad_weight`:
        +1   verified income below documented income
        +5   age below 28 or above 65
        +1   home inconsistency flag set
        +1   business inconsistency flag set
        +1   customer is not an employee
        +10  bureau score below 580
        +10  monthly expenses exceed verified income
        +10  fraud flag set

    NOTE: `fake.unique` keeps track of the values it has already handed out
    on the module-level `fake` instance, so CUSTOMER_RK / NATIONAL_ID values
    drawn by an earlier call are not reused by a later call in the same
    kernel.
    """
    Faker.seed(seed)
    random.seed(seed)
    data = []
    for _ in range(num_records):
        call_id = random.choices([1, 2, 3], weights=[20, 20, 60])[0]
        customer_age = generate_customer_age()
        documented_income = generate_documented_income()
        customer_monthly_expenses = int(documented_income * random.uniform(0.2, 1.2))
        total_cc_amount = generate_random_value(0.75, 600, 12000)
        verified_income = int(documented_income * random.uniform(0.9, 1.1))
        home_inconsistency_flg = generate_flag(0.07)
        business_inconsistency_flg = generate_flag(0.02)
        fraud_flg = generate_flag(0.02)
        is_employee_flg = generate_flag(0.5)  # 50% Y, 50% N
        bureau_score = random.randint(601, 850) if random.random() < 0.6 else random.randint(250, 600)

        # Make BAD conditional on various risk factors. Each comparison is
        # parenthesised so the multiplier applies to the boolean result;
        # without the parentheses `*` binds tighter than `<` / `or` and the
        # multiplier ends up scaling the left operand of the comparison.
        bad_weight = int(verified_income < documented_income)
        bad_weight += 5 * (customer_age < 28 or customer_age > 65)
        bad_weight += home_inconsistency_flg == 'Y'
        bad_weight += business_inconsistency_flg == 'Y'
        bad_weight += is_employee_flg != 'Y'
        bad_weight += 10 * (bureau_score < 580)
        bad_weight += 10 * (customer_monthly_expenses > verified_income)
        bad_weight += 10 * (fraud_flg == 'Y')
        # Odds of good : bad are 50 : bad_weight.
        bad = random.choices([0, 1], weights=[50, bad_weight])[0]

        record = {
            # "RANDOM_ID": str(uuid.uuid4()),
            "CUSTOMER_RK": fake.unique.random_int(min=100000, max=999999),
            "NATIONAL_ID": fake.unique.ssn(),
            "NAME": fake.name(),
            "ADDRESS": fake.address().replace("\n", " "),  # keep the address on one line
            "PHONE": fake.phone_number(),
            "EMAIL": fake.email(),
            "PERMANENT_RESIDENT_FLG": generate_flag(0.97),
            "CUSTOMER_AGE": customer_age,
            "CUSTOMER_STATUS": generate_customer_status(),
            "BUREAU_SCORE": bureau_score,
            "TOTAL_NON_CARD_BALANCE": random.randint(0, 50000),
            "BUREAU_INVESTIGATION_DATE_SINCE": random.randint(0, 90),
            "BUREAU_CARD_UTILIZATION": round(random.uniform(0.0, 1.0), 2),
            "DOCUMENTATION_COMPLETED_FLG": generate_documentation_completed_flag(call_id),
            "CUSTOMER_EMPLOYMENT_TYPE": generate_employment_type(),
            "DOCUMENTATION_TYPE": random.choices(["ID", "SC", "BS"], weights=[80, 10, 10])[0],
            "DPD30_AUTO_LOANS_CNT": generate_random_value(0.01, 1, 12),
            "DPD30_CASH_COLLATERAL_CNT": generate_random_value(0.01, 1, 12),
            "DPD30_CC_LOANS_CNT": generate_random_value(0.01, 1, 12),
            "DPD30_MORTGAGE_LOANS_CNT": generate_random_value(0.01, 1, 12),
            "DPD30_OVERDRAFT_LOANS_CNT": generate_random_value(0.01, 1, 12),
            "DPD30_PERSONAL_LOANS_CNT": generate_random_value(0.01, 1, 12),
            "LAST_MONTH_30": random.randint(0, 2),
            "NUM_CHARGE_OFF_36M": random.randint(0, 5),
            "HOME_INCONSISTENCY_FLG": home_inconsistency_flg,
            "INTEREST_INCOME": random.randint(0, 950),
            "MAIDEN_NAME_INCONSISTENCY_FLG": generate_flag(0.05),
            "OTHER_INCOME": generate_random_value(0.05, 250, 1800),
            "RENT_INCOME": generate_random_value(0.10, 500, 3000),
            "REQUESTED_LOAN_TENOR": random.choice([6, 12, 18, 24, 36]),
            "REQUESTED_LOAN_AMOUNT": random.randint(1000, 50000),
            "REQUESTED_PAYMENT_PLAN": random.choice(["B", "F", "U"]),
            "SALARY_TIE_UP_AMOUNT": random.randint(0, 400),
            "TOTAL_CC_AMOUNT": total_cc_amount,
            "TOTAL_LOAN_INSTALLMENT": random.randint(150, 1000),
            "TOTAL_OPEN_CARD_BALANCE": random.randint(200, 1500),
            "TOTAL_OPEN_CC_BALANCE": random.randint(200, 1500),
            "TOTAL_OPEN_LOAN_INSTALLMENT": random.randint(150, 1000),
            "TOTAL_OVERDRAFTS_AMOUNT": random.randint(150, 400),
            "TOTAL_PERS_LOANS_BAL_UNSECURE": random.randint(500, 3000),
            "IB_WPS_CARD_LAST_12M": random.randint(0, 10),
            "IB_WPS_MORTGAGE_LAST_12M": random.randint(0, 3),
            "IB_WPS_OD_LAST_12M": random.randint(0, 10),
            "IB_WPS_AUTO_LOAN_LAST_12M": random.randint(0, 10),
            "IB_WPS_LOAN_LAST_12M": random.randint(0, 10),
            "MAX_DUR_SINCE_CC_OPEN": random.randint(1, 120),
            "EXISTING_CUSTOMER": generate_flag(0.5),
            "FRAUD_FLG": fraud_flg,
            "BUSINESS_INCONSISTENCY_FLG": business_inconsistency_flg,
            "VERIFICATION_COMPLETED_FLG": "Y" if call_id == 3 else "N",
            "VERIFIED_DOCUMENT_INCOME_FLG": "Y" if call_id == 3 else "N",
            "PRODUCT_CODE": random.choice(["PL1", "PL2"]),
            "EMAIL_PROVIDED_FLG": generate_flag(0.9),
            "EMAIL_INCONSISTENCY_FLG": generate_flag(0.02),
            "IS_EMPLOYEE_FLG": is_employee_flg,
            "DECLARED_INCOME": random.randint(3000, 15000),
            "DOCUMENTED_INCOME": documented_income,
            "TOTAL_EMPLOYMENT_DURATION": random.randint(6, 240),  # in months
            "PLEDGED_AMOUNT": generate_pledged_amount(),
            "VERIFIED_INCOME": verified_income,
            "WPS_LIFETIME": generate_wps_lifetime(),
            "CUSTOMER_MONTHLY_EXPENSES": customer_monthly_expenses,
            # "CALL_ID": call_id,
            "BAD": bad
        }
        data.append(record)

    return pd.DataFrame(data)

## Output to CSV

In [2]:
# Build the training set: 5000 records, using generate_dataset's default seed.
train_df = generate_dataset(num_records=5000)
# Uncomment to preview the first rows:
# train_df.head()

# Write the training set to CSV in the working directory, without the index column.
train_df.to_csv("synthetic_customer_data_train.csv", index=False)
print("Synthetic customer data generation completed for training set.")

Synthetic customer data generation completed for training set.


In [3]:
# Build the test set: generate_dataset's default record count, with a seed
# different from the one used for the training set.
test_df = generate_dataset(seed=4567)
# Uncomment to preview the first rows:
# test_df.head()

# Write the test set to CSV in the working directory, without the index column.
test_df.to_csv("synthetic_customer_data_test.csv", index=False)
print("Synthetic customer data generation completed for testing set.")

Synthetic customer data generation completed for testing set.
