In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; files then live under /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import sys
print(sys.executable)
!{sys.executable} -m pip install --upgrade Faker

/usr/bin/python3
Collecting Faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-38.2.0


In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from faker import Faker
import random
from datetime import datetime, timedelta, date

# One seed shared by every random source so the synthetic data is repeatable.
SEED = 42

spark = SparkSession.builder.appName("subscription_churn_fake").getOrCreate()

fake = Faker()
random.seed(SEED)
Faker.seed(SEED)

# Anchor the window to midnight of the run date rather than the exact
# wall-clock instant. With a sub-second END_DATE every run samples from a
# slightly different interval, so the seeds above could never reproduce the
# same dates, even for two runs on the same day.
END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
START_DATE = END_DATE - timedelta(days=365)  # last 12 months


In [4]:
def build_users_df(n_users=2000):
    """Generate a synthetic users table as a Spark DataFrame.

    Each user gets a sequential id starting at 1, a signup date drawn between
    the module-level START_DATE and END_DATE, a weighted-random plan, a
    uniformly random country and a weighted-random acquisition channel.

    Args:
        n_users: Number of users to generate. Zero or a negative value yields
            an empty DataFrame that still carries the full schema.

    Returns:
        Spark DataFrame with columns user_id (bigint), signup_date (date),
        plan, country and acquisition_channel (strings).
    """
    plans = ["free", "basic", "premium"]
    plan_weights = [0.45, 0.40, 0.15]
    countries = ["US", "BR", "UK", "CA", "DE", "FR", "IN", "AU"]
    channels = ["organic", "paid_search", "social_ads", "referral", "partner"]
    channel_weights = [0.35, 0.20, 0.20, 0.20, 0.05]

    rows = []
    for user_id in range(1, n_users + 1):
        # Draw order is kept fixed so a given seed always yields the same rows.
        signup_dt = fake.date_time_between(start_date=START_DATE, end_date=END_DATE)
        plan = random.choices(plans, weights=plan_weights, k=1)[0]
        country = random.choice(countries)
        acquisition_channel = random.choices(channels, weights=channel_weights, k=1)[0]

        rows.append((user_id, signup_dt.date(), plan, country, acquisition_channel))

    # An explicit schema avoids per-row type inference, which cannot work on an
    # empty list and would make the function fail for n_users <= 0.
    schema = (
        "user_id bigint, signup_date date, plan string, "
        "country string, acquisition_channel string"
    )
    return spark.createDataFrame(rows, schema)

# Build the user base and preview the first rows without truncating values.
df_users = build_users_df(n_users=2000)
df_users.show(n=5, truncate=False)


+-------+-----------+-----+-------+-------------------+
|user_id|signup_date|plan |country|acquisition_channel|
+-------+-----------+-----+-------+-------------------+
|1      |2025-08-06 |basic|US     |social_ads         |
|2      |2024-12-25 |free |UK     |social_ads         |
|3      |2025-03-26 |basic|BR     |social_ads         |
|4      |2025-03-07 |free |BR     |organic            |
|5      |2025-09-10 |basic|US     |social_ads         |
+-------+-----------+-----+-------+-------------------+
only showing top 5 rows


In [5]:
def build_churn_events_df(df_users, churn_rate_by_plan=None):
    """Generate synthetic churn events for a subset of users.

    Every user is churned with the probability configured for their plan.
    A churned user receives a churn date at least 14 days after signup and no
    later than one day before the module-level END_DATE, plus a weighted-random
    reason.

    Note: a user selected for churn whose signup is too recent to leave a
    valid date window is silently skipped, so the realised churn rate can be
    lower than the configured one.

    Args:
        df_users: Spark DataFrame with at least the columns user_id,
            signup_date and plan. It is collected to the driver.
        churn_rate_by_plan: Optional mapping of plan name to churn probability.
            Defaults to free=0.28, basic=0.16, premium=0.10; plans missing
            from the mapping use 0.15.

    Returns:
        Spark DataFrame with columns user_id (bigint), churn_date (date) and
        reason (string). It is empty, but correctly typed, when nobody churns.
    """
    if churn_rate_by_plan is None:
        churn_rate_by_plan = {"free": 0.28, "basic": 0.16, "premium": 0.10}

    reasons = ["price", "not_using", "missing_features", "bugs", "competitor", "other"]
    reason_weights = [0.22, 0.26, 0.18, 0.10, 0.14, 0.10]
    default_churn_rate = 0.15
    min_tenure = timedelta(days=14)

    # Loop-invariant upper bound of the churn window.
    latest_churn_dt = END_DATE - timedelta(days=1)

    rows = []
    for user in df_users.select("user_id", "signup_date", "plan").collect():
        churn_prob = churn_rate_by_plan.get(user["plan"], default_churn_rate)
        if not (random.random() < churn_prob):
            continue

        signup_midnight = datetime.combine(user["signup_date"], datetime.min.time())
        earliest_churn_dt = signup_midnight + min_tenure
        if not (earliest_churn_dt < latest_churn_dt):
            # Signed up too recently: no room for a churn date in the window.
            continue

        churn_date = fake.date_time_between(
            start_date=earliest_churn_dt, end_date=latest_churn_dt
        ).date()
        reason = random.choices(reasons, weights=reason_weights, k=1)[0]
        rows.append((int(user["user_id"]), churn_date, reason))

    # An explicit schema is required here: when no user churns, the list is
    # empty and Spark cannot infer column types from it.
    schema = "user_id bigint, churn_date date, reason string"
    return spark.createDataFrame(rows, schema)

# Derive churn events from the generated users and preview the first rows.
df_churn = build_churn_events_df(df_users=df_users)
df_churn.show(n=5, truncate=False)


+-------+----------+---------+
|user_id|churn_date|reason   |
+-------+----------+---------+
|5      |2025-10-05|price    |
|10     |2025-08-21|price    |
|23     |2025-06-21|price    |
|24     |2025-05-23|not_using|
|26     |2025-05-14|not_using|
+-------+----------+---------+
only showing top 5 rows
