# Synthetic Churn Data
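This notebook contains a Python generator for a realistic synthetic churn dataset with a fixed feature schema: an identifier, numeric, categorical, ordinal, and boolean features, plus a binary churn target.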
It creates plausible distributions and makes churn depend on behavior (tenure, usage, segment, etc.), so it feels close to real-world data.

## Data generator

In [9]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every run reproduces the same dataset.
# NOTE: the generated values depend on the order of the np.random calls
# below; reordering them changes the data.
np.random.seed(42)

# -----------------------------
# Configuration
# -----------------------------
N = 6000  # number of customer rows to generate

# Column names grouped by feature type. The generation code below
# hard-codes the same names rather than reading these lists.
identifier_features = ["customer_id"]
target = "churn"

numeric_features = [
    "age",
    "income",
    "monthly_charges",
    "tenure_years",
    "avg_monthly_usage",
]
categorical_features = ["contract_type", "payment_method", "region"]
ordinal_features = {"customer_segment": ["Low", "Medium", "High"]}
boolean_features = ["has_dependents", "paperless_billing"]

# -----------------------------
# Generate numeric features
# -----------------------------
# Draw every raw sample first, then shape each one into its final column.
# Age: normal around 42.
age_raw = np.random.normal(42, 13, N)
# Income: log-normal, hence right-skewed.
income_raw = np.random.lognormal(mean=10.6, sigma=0.55, size=N)
# Tenure: exponential, so many new customers and few very loyal ones.
tenure_raw = np.random.exponential(scale=3.5, size=N)
# Usage (GB / hours / units): normal around 55.
usage_raw = np.random.normal(55, 18, N)

data = pd.DataFrame(
    {
        "customer_id": np.arange(1, N + 1),
        # Truncate to integers, then bound to the range 18..85.
        "age": np.clip(age_raw.astype(int), 18, 85),
        "income": income_raw.astype(int),
        "tenure_years": np.round(tenure_raw, 1),
        "avg_monthly_usage": np.clip(usage_raw, 5, 250),
    }
)

# Monthly charges: a flat base of 25, plus usage multiplied by a
# per-customer rate, plus Gaussian noise. The result is bounded to
# 20..250 and rounded to 2 decimals.
per_unit_rate = np.random.uniform(0.4, 0.7, N)
charge_noise = np.random.normal(0, 12, N)
unbounded_charges = 25 + data["avg_monthly_usage"] * per_unit_rate + charge_noise
data["monthly_charges"] = unbounded_charges.clip(lower=20, upper=250).round(2)

# -----------------------------
# Categorical, ordinal and boolean features
# -----------------------------
# column name -> (levels, sampling probabilities). Columns are sampled
# independently, in the order listed here.
level_specs = {
    "contract_type": (
        ["Month-to-Month", "One Year", "Two Year"],
        [0.55, 0.25, 0.20],
    ),
    "payment_method": (
        ["Electronic Check", "Credit Card", "Bank Transfer", "Mailed Check"],
        [0.35, 0.30, 0.25, 0.10],
    ),
    "region": (
        ["North", "South", "East", "West"],
        [0.25, 0.30, 0.25, 0.20],
    ),
    # Ordinal feature
    "customer_segment": (
        ["Low", "Medium", "High"],
        [0.45, 0.35, 0.20],
    ),
    # Boolean features, stored as 0/1
    "has_dependents": ([0, 1], [0.42, 0.58]),
    "paperless_billing": ([0, 1], [0.30, 0.70]),
}
for column_name, (levels, weights) in level_specs.items():
    data[column_name] = np.random.choice(levels, size=N, p=weights)

# -----------------------------
# Churn probability logic
# -----------------------------
contract = data["contract_type"]
tenure = data["tenure_years"]
segment_shift = {"Low": 0.15, "Medium": 0.00, "High": -0.15}

# Additive per-row shifts, applied in this order on top of a 0.22 base.
# Positive values raise the churn probability, negative values lower it.
probability_shifts = [
    # Contract type effect
    np.where(contract == "Month-to-Month", 0.20, 0.0),
    np.where(contract == "Two Year", -0.15, 0.0),
    # Tenure effect
    np.where(tenure < 1, 0.18, 0.0),
    np.where(tenure > 5, -0.10, 0.0),
    # High charges increase churn
    np.where(data["monthly_charges"] > 120, 0.12, 0.0),
    # Low usage increases churn
    np.where(data["avg_monthly_usage"] < 30, 0.10, 0.0),
    # Customer segment effect
    data["customer_segment"].map(segment_shift).to_numpy(),
    # Payment method risk
    np.where(data["payment_method"] == "Electronic Check", 0.05, 0.0),
    # Dependents & paperless billing reduce churn
    data["has_dependents"].to_numpy() * -0.05,
    data["paperless_billing"].to_numpy() * -0.04,
]

churn_prob = np.full(N, 0.22)
for shift in probability_shifts:
    churn_prob += shift

# Keep every probability inside [0.02, 0.90] before sampling.
churn_prob = np.clip(churn_prob, 0.02, 0.90)

# One Bernoulli draw per row using that row's probability.
data["churn"] = np.random.binomial(1, churn_prob)

# -----------------------------
# Final output
# -----------------------------
data.head()

Unnamed: 0,customer_id,age,income,tenure_years,avg_monthly_usage,monthly_charges,contract_type,payment_method,region,customer_segment,has_dependents,paperless_billing,churn
0,1,48,21747,1.6,39.953084,32.88,Two Year,Bank Transfer,South,High,0,0,0
1,2,40,28367,3.0,26.016944,59.44,Month-to-Month,Credit Card,North,Low,0,1,0
2,3,50,23905,2.0,48.391416,56.08,One Year,Bank Transfer,North,High,0,0,0
3,4,61,29691,0.1,75.213449,87.44,Month-to-Month,Credit Card,South,Low,1,0,1
4,5,38,35675,17.9,66.80729,52.78,Month-to-Month,Credit Card,South,Low,0,0,1


In [10]:
# Share of rows with churn == 1, i.e. the mean of the 0/1 churn column.
churn_rate = data["churn"].mean()
print("\nChurn rate:", churn_rate)


Churn rate: 0.31983333333333336


# Optional: save

In [11]:
# Write the dataset to CSV without the DataFrame index column.
# The target directory is created first (including parents) because
# to_csv raises OSError when the parent directory does not exist.
from pathlib import Path

output_path = Path("../datasets/synthetic_churn_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)