In [14]:
!pip install faker

import pandas as pd
import numpy as np
import random
from faker import Faker

# 1. Setup
# One seed shared by every random source so a rerun regenerates the same data.
SEED = 42
fake = Faker()
# Faker draws from its own random generator, so seeding NumPy alone leaves the
# Faker-generated join dates different on every run. Seed it explicitly.
# NOTE(review): join dates are requested relative to 'today' (step 2), so they
# will presumably still shift when the script is run on a different day.
Faker.seed(SEED)
np.random.seed(SEED)
num_users = 100000  # <--- Change this to 1 Million if you want huge data!

# 2. Generate Base Data
# Categorical columns are sampled with fixed shares. The NumPy draws are made
# in column order (city first, then membership) so a seeded run is repeatable.
city_names = ['Mumbai', 'Delhi', 'Bangalore', 'Pune', 'Chennai']
city_shares = [0.3, 0.3, 0.2, 0.1, 0.1]
tier_names = ['Gold', 'Regular']
tier_shares = [0.4, 0.6]

# Sequential ids: U1000, U1001, ...
first_id = 1000
user_ids = [f'U{i}' for i in range(first_id, first_id + num_users)]
cities = np.random.choice(city_names, size=num_users, p=city_shares)
tiers = np.random.choice(tier_names, size=num_users, p=tier_shares)
# One Faker-generated date per user, between two years ago and today.
join_dates = [
    fake.date_between(start_date='-2y', end_date='today')
    for _ in range(num_users)
]

data = {
    'user_id': user_ids,
    'city': cities,
    'membership': tiers,
    'join_date': join_dates,
}

df = pd.DataFrame(data)

# 3. Add Behavioral Metrics (The Logic)
# Gold members order more frequently: mean 8 (sd 2) vs. mean 3 (sd 1) for
# Regular members. Both candidate arrays are drawn for every user and np.where
# picks the one matching the membership tier.
df['avg_monthly_orders'] = np.where(df['membership'] == 'Gold',
                                    np.random.normal(8, 2, num_users),
                                    np.random.normal(3, 1, num_users))
# Floor at 1 before truncating to int so no user ends up with 0 or negative orders.
df['avg_monthly_orders'] = df['avg_monthly_orders'].clip(lower=1).astype(int)

# Rain Fees (Higher in Mumbai/Bangalore)
# Per-city probability that any single one of the 5 trials incurs a rain fee.
rain_prob = df['city'].map({'Mumbai': 0.8, 'Bangalore': 0.6, 'Delhi': 0.2, 'Pune': 0.3, 'Chennai': 0.4})
# Binomial(5, p) counts the fee events; each one costs 25 Rs.
df['rain_fees_paid'] = (np.random.binomial(n=5, p=rain_prob, size=num_users) * 25)

# Delivery Issues (Proxy for bad experience)
df['late_deliveries_last_month'] = np.random.poisson(lam=0.5, size=num_users)

# Average Order Value (AOV)
# A normal draw with mean 450 and sd 150 falls at or below zero for roughly
# 0.1% of users, which is meaningless for an order value. Floor the draw before
# truncating to int, mirroring the clip applied to avg_monthly_orders above.
MIN_AOV = 50  # lowest plausible average order value in Rs; adjust as needed
df['aov'] = np.clip(np.random.normal(450, 150, num_users), MIN_AOV, None).astype(int)

# 4. Define Churn Logic (Ground Truth)
# Three risk rules, each with its own churn probability:
#   1. Gold member who paid more than 50 Rs in rain fees -> 0.65
#   2. More than 2 late deliveries last month            -> 0.80
#   3. Fewer than 2 orders per month                     -> 0.50
# np.select uses the FIRST rule that matches, so a user who matches rule 1
# gets 0.65 even when rule 2 also applies. Users matching no rule get the base rate.
gold_high_rain_fees = (df['rain_fees_paid'] > 50) & (df['membership'] == 'Gold')
many_late_deliveries = df['late_deliveries_last_month'] > 2
low_order_volume = df['avg_monthly_orders'] < 2

conditions = [gold_high_rain_fees, many_late_deliveries, low_order_volume]
choices = [0.65, 0.80, 0.50]
base_churn = 0.10  # Base churn is 10%
df['churn_prob'] = np.select(conditions, choices, default=base_churn)

# Turn each probability into a 0/1 outcome with one Bernoulli draw per user,
# so the label is noisy rather than a deterministic function of the rules.
df['churn_status'] = np.random.binomial(n=1, p=df['churn_prob'])

# 5. Export
# Report the row count, preview the first rows, then write everything to CSV
# without the DataFrame index.
# NOTE(review): the exported file includes churn_prob, the probability used to
# generate churn_status - presumably it should be dropped before model training.
output_csv = 'zomato_gold_churn_dataset.csv'
print(f"Generated {len(df)} rows.")
print(df.head())
df.to_csv(output_csv, index=False)

Generated 100000 rows.
  user_id       city membership   join_date  avg_monthly_orders  \
0   U1000      Delhi    Regular  2024-01-17                   4   
1   U1001    Chennai    Regular  2025-01-02                   2   
2   U1002  Bangalore       Gold  2025-09-10                   7   
3   U1003      Delhi    Regular  2025-01-13                   2   
4   U1004     Mumbai       Gold  2025-10-26                   6   

   rain_fees_paid  late_deliveries_last_month  aov  churn_prob  churn_status  
0              25                           0  583        0.10             0  
1              25                           0  309        0.10             0  
2             100                           0  546        0.65             1  
3              25                           0  493        0.10             0  
4              75                           0  459        0.65             0  
