In [43]:
import numpy as np
import pandas as pd
import uuid, base64
import json
import itertools
from datetime import timedelta

In [3]:
# Helper: shortuuid-like generator
def short_uuid():
    """Return a random 8-character identifier.

    A fresh UUID4 is URL-safe base64 encoded, trailing padding/newline
    characters are stripped, and the first 8 characters are kept.
    """
    raw_bytes = uuid.uuid4().bytes
    encoded = base64.urlsafe_b64encode(raw_bytes).decode('utf-8')
    trimmed = encoded.rstrip('=\n')
    return trimmed[:8]

In [107]:
# Shared integer id source: yields 1, 2, 3, ... on successive next() calls.
id_generator = itertools.count(start=1)

In [79]:
# Seeded NumPy generator used for every random draw made through `rng`.
rng = np.random.default_rng(42)

0.7739560485559633 0.4388784397520523


In [81]:
# Create accounts and subscription events
n_accounts = 30
start_date = pd.to_datetime("2024-01-01")
end_date = pd.to_datetime("2025-06-30")

# Parameters
scale_months = 15  # mean of exponential distribution for number of months before cancelation
tier_names = ["Tier1", "Tier2", "Tier3"]
subscription_events = []


def tier_at(ts, tier_change_ts):
    """Return the tier in force at timestamp `ts`.

    `tier_change_ts[0]` is the subscribe timestamp (Tier1); every later entry
    is the timestamp at which the next tier started. A tier only counts as
    active strictly after its start timestamp.
    """
    n_upgrades = sum(1 for change_ts in tier_change_ts[1:] if ts > change_ts)
    return tier_names[n_upgrades]


accounts = []
for _ in range(n_accounts):
    account_id = short_uuid()

    n_subaccounts = np.clip(rng.poisson(1), 1, 3)
    # start ts of each tier
    tier_change_ts = []
    # Generate subaccounts and subscribe/upgrade events
    for i in range(n_subaccounts):
        subaccount_id = short_uuid()
        if i == 0:
            # First subaccount: created uniformly at random inside the window.
            creation_ts = start_date + (end_date - start_date) * rng.random()
            accounts.append({"account_id": account_id, "subaccount_id": subaccount_id, "creation_ts": creation_ts})

            tier_change_ts.append(creation_ts)
            # Subscribe event
            subscription_events.append({
                "event_id": next(id_generator),
                "account_id": account_id,
                "timestamp": creation_ts,
                "event": "subscribe",
                "tier": "Tier1"
            })
            # Determine cancel timestamp (at least one month after subscribing)
            duration_months = int(rng.exponential(scale=scale_months))
            duration_months = max(duration_months, 1)
            cancel_ts = creation_ts + pd.DateOffset(months=duration_months)
        else:
            # Later subaccounts: created somewhere between the previous one and end_date.
            creation_ts += (end_date - creation_ts) * rng.random()
            # Only keep the subaccount (and its upgrade) if it predates the cancellation.
            if creation_ts < cancel_ts:
                accounts.append({"account_id": account_id, "subaccount_id": subaccount_id, "creation_ts": creation_ts})

                tier_change_ts.append(creation_ts)
                subscription_events.append({
                    "event_id": next(id_generator),
                    "account_id": account_id,
                    # The upgrade is stamped one second before the subaccount it enables.
                    "timestamp": creation_ts - timedelta(seconds=1),
                    "event": "upgrade",
                    "tier": "Tier2" if i == 1 else "Tier3"
                })

    # Renewals fall on the monthly anniversaries of the subscribe timestamp.
    # Each one is computed from that anchor instead of by repeatedly adding a
    # month: DateOffset clips to the month end (Jan 31 -> Feb 29 -> Mar 29), and
    # the accumulated drift could slip an extra renewal in just before cancel_ts,
    # which is itself computed from the anchor.
    subscribe_ts = tier_change_ts[0]
    months_elapsed = 1
    renew_ts = subscribe_ts + pd.DateOffset(months=months_elapsed)
    while renew_ts < cancel_ts and renew_ts < end_date:
        # Renew event
        subscription_events.append({
            "event_id": next(id_generator),
            "account_id": account_id,
            "timestamp": renew_ts,
            "event": "renew",
            "tier": tier_at(renew_ts, tier_change_ts)
        })
        months_elapsed += 1
        renew_ts = subscribe_ts + pd.DateOffset(months=months_elapsed)

    # Cancel event (only if it happens inside the simulated window)
    if cancel_ts < end_date:
        subscription_events.append({
            "event_id": next(id_generator),
            "account_id": account_id,
            "timestamp": cancel_ts,
            "event": "cancel",
            # Evaluate the tier at cancel time so an upgrade made after the
            # last renewal (or with no renewal at all) is reflected.
            "tier": tier_at(cancel_ts, tier_change_ts)
        })


df_accounts = pd.DataFrame(accounts)
df_accounts.to_csv("../seeds/accounts.csv", index=False)

df_sub_events = pd.DataFrame(subscription_events).sort_values(["account_id", "timestamp"])
df_sub_events.to_csv("../seeds/subscription_events.csv", index=False)

In [70]:
# Generate Content
# Parameters
n_rows = 30
locales = ["en-US", "ja-JP", "ko-KR", "zh-CN"]
lang_probs = [0.7, 0.1, 0.05, 0.15]
content_types = ["movie", "series", "show"]
content_probs = [1/3, 1/3, 1/3]

# Generate rows
# Collected under a dedicated name so the `accounts` list of account records
# built by the accounts cell is not rebound to hold content rows.
content_rows = []
for _ in range(n_rows):
    content_id = short_uuid()
    locale = rng.choice(locales, p=lang_probs)
    content_type = rng.choice(content_types, p=content_probs)
    content_rows.append({"id": content_id, "locale": locale, "content_type": content_type})

df_content = pd.DataFrame(content_rows)

# Save to CSV
df_content.to_csv("../seeds/content.csv", index=False)

df_content.head(10)

Unnamed: 0,id,locale,content_type
0,PCchwvVI,en-US,movie
1,4-f4Bsvh,zh-CN,series
2,wtEJMROi,en-US,series
3,7R_7uH-2,en-US,show
4,juWFskrN,en-US,show
5,GA6n3i0p,en-US,series
6,xHO9n33N,en-US,show
7,ScpDeNT9,en-US,series
8,Z9TzEVtb,en-US,movie
9,HlkZx7vb,en-US,series


In [82]:
# Draw one country per distinct account (60% US / 40% CA), then broadcast it
# onto every subaccount row belonging to that account.
account_countries = dict(
    (acc, rng.choice(["US", "CA"], p=[0.6, 0.4]))
    for acc in df_accounts["account_id"].unique()
)
df_accounts["country"] = df_accounts["account_id"].map(account_countries)

# One freshly generated device_id for each subaccount row
df_accounts["device_id"] = [short_uuid() for _ in df_accounts.index]

In [83]:
# Simulation of app events
# For every subaccount: pick a login pattern, generate login timestamps up to
# end_date, and emit one session per login (Log-in, PlayProgress every
# 10 minutes, Log-out) plus a matching row in `sessions`.
events = []
sessions = []

for _, row in df_accounts.iterrows():
    sub_id = row["subaccount_id"]
    device_id = row["device_id"]
    country = row["country"]
    creation_ts = row["creation_ts"]

    # Choose login pattern
    pattern = rng.choice(["monthly", "weekend", "weekly"], p=[0.25, 0.40, 0.35])

    # Generate login times
    login_times = []
    if pattern == "monthly":
        for m in pd.date_range(creation_ts, end_date, freq="MS"):
            rand_day = m + pd.to_timedelta(rng.integers(0, 28), unit="D")
            rand_time = rand_day + pd.to_timedelta(rng.integers(0, 24*60*60), unit="s")
            if rand_time <= end_date:
                login_times.append(rand_time)
    elif pattern == "weekend":
        for d in pd.date_range(creation_ts, end_date, freq="W-SAT"):
            # date_range keeps creation_ts's time-of-day on every anchor, so the
            # anchor is reset to midnight first; otherwise "Sunday + up to 24h"
            # could land on Monday.
            chosen = d.normalize() + pd.to_timedelta(rng.integers(0, 2), unit="D")  # Sat or Sun
            rand_time = chosen + pd.to_timedelta(rng.integers(0, 24*60*60), unit="s")
            # Normalizing can move the first anchor before the subaccount exists.
            if creation_ts <= rand_time <= end_date:
                login_times.append(rand_time)
    elif pattern == "weekly":
        for d in pd.date_range(creation_ts, end_date, freq="W"):
            rand_time = d + pd.to_timedelta(rng.integers(0, 7*24*60*60), unit="s")
            if rand_time <= end_date:
                login_times.append(rand_time)

    # Simulate sessions
    for login_ts in login_times:
        session_id = short_uuid()

        # Log-in event
        events.append({
            "subaccount_id": sub_id, "timestamp": login_ts, "session_id": session_id,
            "device_id": device_id, "country": country,
            "event_name": "Log-in", "properties": {}
        })

        # Pick content with the seeded generator; DataFrame.sample() without a
        # random_state would draw from pandas' global state and ignore the seed.
        content = df_content.iloc[rng.integers(len(df_content))]

        # Watch duration ~ Exp(60 mins), at least one minute, capped at end_date
        duration_min = max(1, int(rng.exponential(scale=60)))
        logout_ts = login_ts + timedelta(minutes=duration_min)
        if logout_ts > end_date:
            logout_ts = end_date

        # PlayProgress every 10 minutes, strictly before the log-out
        for t in range(10, duration_min+1, 10):
            prog_ts = login_ts + timedelta(minutes=t)
            if prog_ts >= logout_ts or prog_ts > end_date:
                break

            events.append({
                "subaccount_id": sub_id, "timestamp": prog_ts, "session_id": session_id,
                "device_id": device_id, "country": country,
                "event_name": "PlayProgress",
                "properties": {"content_id": content["id"], "content_type": str(content["content_type"])}
            })

        # Log-out event
        events.append({
            "subaccount_id": sub_id, "timestamp": logout_ts, "session_id": session_id,
            "device_id": device_id, "country": country,
            "event_name": "Log-out", "properties": {}
        })

        sessions.append({
            "id": session_id, "subaccount_id": sub_id,
            "started_at": login_ts, "ended_at": logout_ts
        })

df_events = pd.DataFrame(events).sort_values("timestamp")
df_sessions = pd.DataFrame(sessions)

# Convert dicts to JSON strings with double quotes inside
df_events["properties"] = df_events["properties"].apply(json.dumps)

df_events.to_csv("../seeds/app_events.csv", index=False)
df_sessions.to_csv("../seeds/sessions.csv", index=False)

df_events.sort_values(by=['subaccount_id', 'timestamp']).head(20)

Unnamed: 0,subaccount_id,timestamp,session_id,device_id,country,event_name,properties
7475,0m3vTeIp,2024-03-09 06:50:52.805014037,hJp_TMtx,Ak37zV-C,US,Log-in,{}
7476,0m3vTeIp,2024-03-09 07:00:52.805014037,hJp_TMtx,Ak37zV-C,US,PlayProgress,"{""content_id"": ""G4RoRl8_"", ""content_type"": ""mo..."
7477,0m3vTeIp,2024-03-09 07:10:52.805014037,hJp_TMtx,Ak37zV-C,US,PlayProgress,"{""content_id"": ""G4RoRl8_"", ""content_type"": ""mo..."
7478,0m3vTeIp,2024-03-09 07:20:52.805014037,hJp_TMtx,Ak37zV-C,US,Log-out,{}
7479,0m3vTeIp,2024-04-25 21:42:27.805014037,Io5PDBEH,Ak37zV-C,US,Log-in,{}
7480,0m3vTeIp,2024-04-25 21:52:27.805014037,Io5PDBEH,Ak37zV-C,US,PlayProgress,"{""content_id"": ""iWhroHCs"", ""content_type"": ""sh..."
7481,0m3vTeIp,2024-04-25 22:02:27.805014037,Io5PDBEH,Ak37zV-C,US,PlayProgress,"{""content_id"": ""iWhroHCs"", ""content_type"": ""sh..."
7482,0m3vTeIp,2024-04-25 22:12:27.805014037,Io5PDBEH,Ak37zV-C,US,PlayProgress,"{""content_id"": ""iWhroHCs"", ""content_type"": ""sh..."
7483,0m3vTeIp,2024-04-25 22:22:27.805014037,Io5PDBEH,Ak37zV-C,US,PlayProgress,"{""content_id"": ""iWhroHCs"", ""content_type"": ""sh..."
7484,0m3vTeIp,2024-04-25 22:32:27.805014037,Io5PDBEH,Ak37zV-C,US,PlayProgress,"{""content_id"": ""iWhroHCs"", ""content_type"": ""sh..."


In [108]:
# Build one successful payment for every billable subscription event
# (subscribe, upgrade, renew); cancel events produce no payment.
payments = []
tier_prices = [9, 17, 23]  # price for Tier1, Tier2, Tier3 respectively
billable_events = ('subscribe', 'upgrade', 'renew')

# Country of the first row per account, looked up once instead of filtering
# df_accounts again for every event.
country_by_account = (
    df_accounts.drop_duplicates('account_id')
    .set_index('account_id')['country']
    .to_dict()
)

for _, row in df_sub_events.iterrows():
    if row['event'] not in billable_events:
        continue

    account_id = row['account_id']
    currency = 'CAD' if country_by_account[account_id] == 'CA' else 'USD'

    # Tier labels end in their 1-based number ("Tier2" -> 2).
    tier_index = int(row['tier'][-1])
    amount = tier_prices[tier_index - 1]

    payments.append({
        # Payment ids come from the same id_generator counter as event ids.
        'id': next(id_generator),
        'event_id': row['event_id'],
        'account_id': account_id,
        'amount': amount,
        'currency': currency,
        'status': 'success',
        # The row already carries the event timestamp; the payment is stamped one minute later.
        'created_at': row['timestamp'] + timedelta(minutes=1),
    })

df_payments = pd.DataFrame(payments)
df_payments.to_csv("../seeds/payments.csv", index=False)

: 