# Synthetic Korean Customer Dataset Generator

This notebook generates realistic Korean-style customer data for testing, analytics, and ML training.

## Features

- **Korean demographics**: Names, addresses, phone numbers
- **Financial profiles**: Income, credit scores, spending patterns
- **Behavioral data**: Activity status, churn risk, loyalty tiers
- **Payment methods**: Korean banks, card brands, payment preferences

## Output

- CSV file: `customers_ko_YYYYMMDD-HHMMSS.csv`
- Excel file: `customers_ko_YYYYMMDD-HHMMSS.xlsx` (with formatting)

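Files are saved to `../data/raw/`, resolved relative to the notebook's working directory; the directory is created automatically if it does not exist.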

## Configuration

Adjust these parameters to customize your dataset:

In [None]:
# Dataset configuration
# Module-level knobs read by the later cells; edit them before running the notebook.
N = 1000            # number of customers
SEED = 42           # random seed for reproducibility
# NOTE(review): EXCHANGE_RATE is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
EXCHANGE_RATE = 1350  # USD to KRW conversion rate

## Setup and Imports

In [None]:
from __future__ import annotations
import random
import math
import string
import json
from datetime import datetime, timedelta, date
from pathlib import Path

import pandas as pd

# Output location (relative to the working directory); created on demand
OUTDIR = Path("..") / "data" / "raw"
OUTDIR.mkdir(parents=True, exist_ok=True)

# A single run timestamp is shared by both export files so they pair up on disk
TS = f"{datetime.now():%Y%m%d-%H%M%S}"
CSV_PATH = OUTDIR.joinpath("customers_ko_" + TS + ".csv")
XLSX_PATH = OUTDIR.joinpath("customers_ko_" + TS + ".xlsx")

# Seed the global RNG so the random draws repeat for a given SEED
random.seed(SEED)

print(
    "\n".join(
        [
            "✅ Setup complete",
            f"📂 Output directory: {OUTDIR.absolute()}",
            f"🎲 Random seed: {SEED}",
            f"👥 Target records: {N:,}",
        ]
    )
)

## Data Pools and Constants

Korean-specific data pools for realistic generation:

In [None]:
# Korean name components
# Each character of the string becomes one surname entry.
# NOTE(review): "조" and "강" each appear twice in this string, so they are
# drawn twice as often by random.choice and are counted twice in the
# "last names" total printed below — confirm whether that weighting is intended.
last_names = list("김이박최정강조윤장임한오서신권황안송류홍전고문양손배조백허유남심노하곽성차주우구민진지엄채원천방공강현")
# Single syllables that ko_name() combines into two- or three-syllable given names
given_syllables = [
    "민","서","윤","준","현","우","지","하","도","건","수","연",
    "아","유","예","슬","나","리","태","은","재","성","승","시","소"
]

# Geographic data
# address_kr() samples city, district, and street independently, so a
# generated combination is not guaranteed to exist in reality.
cities = [
    "서울","부산","대구","인천","광주","대전","울산","세종",
    "수원","성남","의정부","춘천","청주","전주","포항","창원","제주"
]
districts = [
    "중구","서초구","강남구","송파구","마포구","성동구",
    "분당구","팔달구","해운대구","수영구","북구","동구","남구","서구"
]
streets = [
    "테헤란로","세종대로","충무로","광안로","해운대로","백제고분로",
    "중앙대로","시장로","문화로","백범로","정자일로","분당로","월드컵로"
]

# Contact and communication
email_domains = [
    "example.com","mail.com","gmail.com",
    "naver.com","daum.net","kakao.com","company.co.kr"
]
# NOTE(review): `channels` is not read by the visible generator code;
# preferred_channel() hard-codes the same five channel names.
channels = ["email","sms","kakao","push","phone"]

# Business categories
segments = ["Retail","Finance","Edu","IT","FMCG","Travel","Healthcare"]
# NOTE(review): `loyalty` is not read by the visible generator code;
# loyalty_from_spend() returns the tier names as literals.
loyalty = ["Basic","Silver","Gold","Platinum","Diamond"]

# Payment methods (Korean market)
payment_primary = [
    "신용카드","체크카드","계좌이체",
    "간편결제(카카오/네이버)","현금"
]
banks = ["신한","국민","우리","하나","농협","카카오뱅크","토스뱅크"]
card_brands = ["Visa","Mastercard","Amex","UnionPay","JCB","BC"]

# Demographics
genders = ["M","F"]

# Report pool sizes as a quick sanity check that the cell ran
print(f"✅ Data pools initialized")
print(f"   - {len(last_names)} last names")
print(f"   - {len(cities)} cities")
print(f"   - {len(banks)} banks")
print(f"   - {len(segments)} customer segments")

## Generator Functions

Utility functions to generate realistic Korean customer attributes:

In [None]:
def ko_name() -> str:
    """Return a random Korean full name: one surname plus a given name.

    The given name has two syllables, with a 20% chance of a third one.
    Surname and syllables are drawn from the module-level pools.
    """
    surname = random.choice(last_names)
    # Two syllables are always drawn first; the optional third comes last
    syllables = [random.choice(given_syllables), random.choice(given_syllables)]
    if random.random() < 0.2:
        syllables.append(random.choice(given_syllables))
    return surname + "".join(syllables)


def email_from_name(name: str) -> str:
    """Build a pseudo e-mail address whose local part mirrors *name*'s length.

    Every alphabetic character of *name* contributes one character:
    non-ASCII letters (e.g. Hangul) are replaced by a random lowercase ASCII
    letter, ASCII letters are lowercased. A random two-digit number is
    appended and a domain is picked from ``email_domains``.
    """
    letters = [
        random.choice(string.ascii_lowercase) if ord(ch) > 127 else ch.lower()
        for ch in name
        if ch.isalpha()
    ]
    local_part = "".join(letters) + str(random.randint(10, 99))
    domain = random.choice(email_domains)
    return local_part + "@" + domain


def phone_kr() -> str:
    """Return a random mobile number formatted as ``010-XXXX-XXXX``.

    Both four-digit groups are drawn from 1000-9999, so neither starts with 0.
    """
    middle = random.randint(1000, 9999)
    tail = random.randint(1000, 9999)
    return "-".join(("010", str(middle), str(tail)))


def address_kr() -> tuple[str, str, str]:
    """Return a random ``(city, district, street detail)`` address triple.

    Each part is sampled independently from its pool; the street detail is
    the street name followed by a building number between 1 and 300.
    """
    city = random.choice(cities)
    district = random.choice(districts)
    street = random.choice(streets)
    building_no = random.randint(1, 300)
    return city, district, " ".join((street, str(building_no)))


def birthdate_and_age() -> tuple[date, int]:
    """Generate a birthdate and the matching age as of today.

    The age is drawn from a three-component mixture (60% around 34,
    30% around 45, 10% around 58) and clamped to 18-75. The birthdate is
    then chosen so that the age computed from it on ``date.today()``
    equals the returned age.

    Returns:
        ``(birthdate, age)``.
    """
    r = random.random()
    if r < 0.6:
        age = int(random.gauss(34, 6))
    elif r < 0.9:
        age = int(random.gauss(45, 7))
    else:
        age = int(random.gauss(58, 8))
    age = max(18, min(75, age))

    today = date.today()
    month = random.randint(1, 12)
    day = random.randint(1, 28)  # capped at 28 so every month is valid

    # If this year's birthday has not happened yet, the person must have been
    # born one calendar year earlier to already be `age` years old today.
    birth_year = today.year - age
    if (month, day) > (today.month, today.day):
        birth_year -= 1
    return date(birth_year, month, day), age


def signup_dates() -> tuple[date, date, bool]:
    """Generate signup date, last-active date, and an activity flag.

    The signup date is uniform over the last ``5 * 365`` days (inclusive of
    today). The last-active date is uniform between the signup date and
    today, so it is never in the future. The customer counts as active when
    the last activity is within a randomly chosen window of 7, 14, 30, 60,
    or 90 days.

    Returns:
        ``(signup_date, last_active_date, is_active)``.
    """
    today = date.today()
    window_days = 5 * 365
    start = today - timedelta(days=window_days)
    sign = start + timedelta(days=random.randint(0, window_days))

    # Days elapsed since signup; 0 when the signup is today. randint(0, 0)
    # is valid, so no lower bound is forced on the range (forcing it to 1
    # could place the last-active date one day after today).
    tenure_days = (today - sign).days
    last = sign + timedelta(days=random.randint(0, tenure_days))

    is_active = (today - last).days <= random.choice([7, 14, 30, 60, 90])
    return sign, last, is_active


def monthly_income_krw(age: int) -> int:
    """Return a random monthly income in KRW that depends on *age*.

    Under 25 the base is 1,800,000. From 25 the base is 2,000,000 plus
    120,000 per year of age above 25, with the age capped at 45. Random
    noise between -400,000 and +800,000 is added and the result is floored
    at 1,200,000.
    """
    if age < 25:
        base = 1_800_000
    else:
        capped_age = min(age, 45)
        base = 2_000_000 + (capped_age - 25) * 120_000
    jitter = random.randint(-400_000, 800_000)
    return int(max(1_200_000, base + jitter))


def credit_score(age: int, income: int, active: bool) -> int:
    """Return a random credit score clamped to the range 350-950.

    The score starts at 600, gains one point per full 200,000 KRW of
    *income*, is adjusted by +10 for active or -20 for inactive customers,
    and receives random noise between -60 and +60. *age* is accepted but
    does not influence the result.
    """
    activity_adjustment = 10 if active else -20
    raw = 600 + (income // 200_000) + activity_adjustment
    raw += random.randint(-60, 60)
    if raw < 350:
        return 350
    if raw > 950:
        return 950
    return raw


def spend_profile(income: int, active: bool) -> tuple[int, int, int]:
    """Return ``(avg monthly spend, order count, last order value)``.

    Average spend is 25-60% of *income*. The order count is that spend
    divided by a random basket size of 30,000-80,000 (at least 1). The last
    order value is uniform in 8,000-400,000 and, for inactive customers,
    scaled down by a random factor of 0.4-0.8.
    """
    spend_ratio = random.uniform(0.25, 0.6)
    monthly_spend = int(income * spend_ratio)

    basket_size = random.uniform(30_000, 80_000)
    order_count = max(1, int(monthly_spend / basket_size))

    recent_order = int(random.uniform(8_000, 400_000))
    if active:
        return monthly_spend, order_count, recent_order

    # Inactive customers get a reduced last-order value
    damping = random.uniform(0.4, 0.8)
    return monthly_spend, order_count, int(recent_order * damping)


def churn_risk_from_activity(active: bool, last_active: date, orders: int) -> float:
    """Return a churn risk score in 0.01-0.95, rounded to three decimals.

    Starts from 0.15 (active) or 0.45 (inactive), adds up to 0.4 for time
    since *last_active* (0.5 per 365 days), subtracts up to 0.15 for order
    volume (full credit at 200 orders), then applies noise of +/-0.05
    before clamping.
    """
    idle_days = (date.today() - last_active).days

    score = 0.15 if active else 0.45
    recency_penalty = min(0.4, idle_days / 365 * 0.5)
    score = score + recency_penalty
    order_credit = min(0.15, orders / 200 * 0.15)
    score = score - order_credit

    noisy = score + random.uniform(-0.05, 0.05)
    clamped = max(0.01, min(0.95, noisy))
    return round(clamped, 3)


def loyalty_from_spend(spend: int) -> str:
    """Map average monthly spend to a loyalty tier name.

    A tier applies when *spend* is strictly greater than its threshold;
    anything at or below 1,200,000 is "Basic".
    """
    # Ordered from highest threshold to lowest; the first match wins
    tier_floors = (
        (6_000_000, "Diamond"),
        (4_000_000, "Platinum"),
        (2_500_000, "Gold"),
        (1_200_000, "Silver"),
    )
    for floor, tier in tier_floors:
        if spend > floor:
            return tier
    return "Basic"


def preferred_channel() -> str:
    """Pick a communication channel with fixed probabilities.

    kakao 35%, sms 25%, email 20%, push 15%, phone 5%.
    """
    draw = random.random()
    # Cumulative upper bounds; the first bound above the draw selects the channel
    cumulative_bounds = (
        (0.35, "kakao"),
        (0.6, "sms"),
        (0.8, "email"),
        (0.95, "push"),
    )
    for upper, channel in cumulative_bounds:
        if draw < upper:
            return channel
    return "phone"

print("✅ Generator functions defined")

## Data Generation

Generate the customer dataset:

In [None]:
# Accumulates one dict per customer; converted to a DataFrame in a later cell
rows = []

print(f"🔄 Generating {N:,} customer records...\n")

# NOTE: every generator call below draws from the shared, seeded `random`
# state, so the order of these statements determines the generated values.
# Reordering or inserting calls changes the output for a given SEED.
for i in range(1, N+1):
    # Progress report every 200 records
    if i % 200 == 0:
        print(f"   Progress: {i:,} / {N:,} ({i/N*100:.1f}%)")
    
    # ID = "CUST" + run timestamp + 5-digit zero-padded sequence number
    cid = f"CUST{TS}{i:05d}"
    name = ko_name()
    gender = random.choice(genders)
    birth, age = birthdate_and_age()
    email = email_from_name(name)
    phone = phone_kr()
    city, district, addr_detail = address_kr()
    sign, last, active = signup_dates()
    # Financial attributes are derived in a chain: age -> income -> score/spend -> churn/tier
    income = monthly_income_krw(age)
    cs = credit_score(age, income, active)
    avg_spend, orders, last_order = spend_profile(income, active)
    churn = churn_risk_from_activity(active, last, orders)
    loy = loyalty_from_spend(avg_spend)
    # Payment attributes and segment are sampled independently of each other
    pay = random.choice(payment_primary)
    bank = random.choice(banks)
    card = random.choice(card_brands)
    seg = random.choice(segments)
    # Marketing opt-in is True with probability 0.72
    mk_opt = random.random() < 0.72

    rows.append({
        "customer_id": cid,
        "name_ko": name,
        "gender": gender,
        "birthdate": birth.isoformat(),
        "age": age,
        "email": email,
        "phone": phone,
        "address_city": city,
        "address_district": district,
        "address_detail": addr_detail,
        "signup_date": sign.isoformat(),
        "last_active_date": last.isoformat(),
        "is_active": active,
        "churn_risk": churn,
        "income_monthly_krw": income,
        "credit_score": cs,
        # Drawn here, while the dict is built, i.e. after the mk_opt draw
        "preferred_channel": preferred_channel(),
        "segment": seg,
        "marketing_opt_in": mk_opt,
        "loyalty_tier": loy,
        "avg_monthly_spend": avg_spend,
        "total_orders": orders,
        "last_order_value": last_order,
        "payment_method_primary": pay,
        "bank_name": bank,
        "card_brand": card,
    })

print(f"\n✅ Generated {len(rows):,} records")

## Create DataFrame

Convert to pandas DataFrame with ordered columns:

In [None]:
# Final column order for the exported files
cols = [
    "customer_id","name_ko","gender","birthdate","age",
    "email","phone",
    "address_city","address_district","address_detail",
    "signup_date","last_active_date","is_active","churn_risk",
    "income_monthly_krw","avg_monthly_spend","total_orders","last_order_value",
    "credit_score","loyalty_tier","preferred_channel","segment","marketing_opt_in",
    "payment_method_primary","bank_name","card_brand"
]

# Build the frame from the row dicts, then select the columns in export order
df = pd.DataFrame(rows)
df = df.loc[:, cols]

n_rows, n_cols = df.shape
print(f"✅ DataFrame created: {n_rows:,} rows × {n_cols} columns")
print("\nColumn list:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"   {position:2d}. {column_name}")

## Preview Dataset

Display sample records and summary statistics:

In [None]:
# Banner followed by the first ten generated records
print(f"\n{'=' * 80}")
print("DATASET PREVIEW (First 10 Records)")
print(f"{'=' * 80}\n")
display(df.head(10))

In [None]:
# Section banner
rule = "=" * 80
print(f"\n{rule}")
print("SUMMARY STATISTICS")
print(f"{rule}\n")

# Demographic summary
ages = df["age"]
print("👥 Demographics:")
print(f"   Gender distribution:\n{df['gender'].value_counts()}\n")
print(f"   Age: min={ages.min()}, max={ages.max()}, mean={ages.mean():.1f}\n")

# Activity summary
active_total = df["is_active"].sum()
active_share = active_total / len(df) * 100
print("📊 Activity:")
print(f"   Active customers: {active_total:,} ({active_share:.1f}%)")
print(f"   Avg churn risk: {df['churn_risk'].mean():.3f}\n")

# Financial summary
print("💰 Financial:")
print(f"   Avg monthly income: ₩{df['income_monthly_krw'].mean():,.0f}")
print(f"   Avg monthly spend: ₩{df['avg_monthly_spend'].mean():,.0f}")
print(f"   Avg credit score: {df['credit_score'].mean():.0f}\n")

# Loyalty distribution, listed by tier name
print("🏆 Loyalty Tiers:")
print(df["loyalty_tier"].value_counts().sort_index())

# Channel preferences and segment distribution, most frequent first
for heading, column in (
    ("\n📱 Preferred Channels:", "preferred_channel"),
    ("\n🎯 Customer Segments:", "segment"),
):
    print(heading)
    print(df[column].value_counts())

## Save to Files

Export dataset to CSV and formatted Excel:

In [None]:
# Save CSV
df.to_csv(CSV_PATH, index=False, encoding="utf-8")
print(f"✅ CSV saved: {CSV_PATH}")
print(f"   Size: {CSV_PATH.stat().st_size / 1024:.1f} KB\n")

# Save Excel with formatting
with pd.ExcelWriter(XLSX_PATH, engine="xlsxwriter") as writer:
    df.to_excel(writer, index=False, sheet_name="customers")
    
    # Add formatting
    wb = writer.book
    fmt_money = wb.add_format({"num_format": "#,##0"})
    ws = writer.sheets["customers"]
    
    # Format money columns (P, Q, R, S = income, avg_spend, orders, last_order)
    money_cols = ["P", "Q", "R", "S"]
    for col in money_cols:
        ws.set_column(f"{col}:{col}", 14, fmt_money)
    
    # Freeze header row
    ws.freeze_panes(1, 0)

print(f"✅ Excel saved: {XLSX_PATH}")
print(f"   Size: {XLSX_PATH.stat().st_size / 1024:.1f} KB")

print("\n" + "="*80)
print("✅ GENERATION COMPLETE")
print("="*80)

## Advanced Analysis (Optional)

Run additional analysis on the generated dataset:

In [None]:
# Scatter plot of income against spending, plus their correlation coefficient
import matplotlib.pyplot as plt
import seaborn as sns

income_series = df["income_monthly_krw"]
spend_series = df["avg_monthly_spend"]

plt.figure(figsize=(10, 6))
plt.scatter(income_series, spend_series, alpha=0.3)
plt.xlabel("Monthly Income (KRW)")
plt.ylabel("Average Monthly Spend (KRW)")
plt.title("Income vs. Spending Pattern")
plt.tight_layout()
plt.show()

corr_value = income_series.corr(spend_series)
print(f"Correlation (income vs spend): {corr_value:.3f}")

In [None]:
# Churn risk distribution by loyalty tier
plt.figure(figsize=(10, 6))
df.boxplot(column='churn_risk', by='loyalty_tier', grid=False)
plt.suptitle('')
plt.title('Churn Risk by Loyalty Tier')
plt.xlabel('Loyalty Tier')
plt.ylabel('Churn Risk Score')
plt.tight_layout()
plt.show()

In [None]:
# Age distribution histogram
plt.figure(figsize=(10, 6))
plt.hist(df['age'], bins=30, edgecolor='black', alpha=0.7)
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Customer Age Distribution')
plt.axvline(df['age'].mean(), color='red', linestyle='--', label=f'Mean: {df["age"].mean():.1f}')
plt.legend()
plt.tight_layout()
plt.show()