# In [23]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Number of synthetic rows to generate.
num_samples = 20_000_000

# Device label -> sampling probability; the three weights sum to 1.
device_weights = {"PS4": 0.1, "PS5": 0.4, "PS5_PRO": 0.5}
devices = list(device_weights)

# One device label drawn independently for every row.
device_choices = np.random.choice(
    devices,
    size=num_samples,
    p=list(device_weights.values()),
)

def generate_join_date(device):
    """Return a random join date for ``device`` formatted as ``DD-MM-YY``.

    The date is drawn uniformly from a window that starts on a
    device-specific date and ends at 20 Mar 2025. ``"PS5"`` starts on
    20 Nov 2020 and ``"PS5_PRO"`` on 7 Nov 2024; any other value uses
    17 Dec 2013.

    Args:
        device: Device label used to select the window start.

    Returns:
        The sampled date as a string in ``%d-%m-%y`` format.
    """
    start_by_device = {
        "PS5": datetime(2020, 11, 20),
        "PS5_PRO": datetime(2024, 11, 7),
    }
    # Every label without its own entry shares the 2013 start date.
    window_start = start_by_device.get(device, datetime(2013, 12, 17))
    window_end = datetime(2025, 3, 20)

    span_days = (window_end - window_start).days
    # randint's upper bound is exclusive, so the offset lies in
    # [0, span_days) and the end date itself is never produced.
    offset_days = np.random.randint(0, span_days)

    join_date = window_start + timedelta(days=offset_days)
    return join_date.strftime('%d-%m-%y')

# Gender label per row: 90% 'Male', 10% 'Female'.
genders = np.random.choice(['Male', 'Female'], size=num_samples, p=[0.9, 0.1])

# Candidate ages, 14 through 50 inclusive.
age_range = list(range(14, 51))

# Relative weight per age: 20-39 is the most likely band, 14-19 and 40-45
# are in the middle, and the remaining ages (46-50) are the least likely.
raw_probs = [
    0.035 if 20 <= age <= 39
    else 0.02 if (14 <= age <= 19 or 40 <= age <= 45)
    else 0.01
    for age in age_range
]

# Normalize so the weights form a probability distribution for np.random.choice.
weight_total = sum(raw_probs)
age_probs = [weight / weight_total for weight in raw_probs]

ages = np.random.choice(age_range, size=num_samples, p=age_probs)

# Country labels; position i pairs with raw_country_probs[i] below, so the two
# lists must stay the same length and in the same order.
countries = [
    'United States', 'United Kingdom', 'Germany', 'France', 'Italy', 'Spain',
    'Japan', 'Canada', 'Australia', 'South Korea',
    'Netherlands', 'Sweden', 'Norway', 'Finland', 'Poland', 'Austria', 'Switzerland', 'Belgium',
    'Mexico', 'Brazil', 'Argentina', 'India', 'Thailand', 'Indonesia', 'Malaysia', 'Philippines',
    'Russia', 'Turkey', 'China', 'South Africa', 'Morocco'
]

# Unnormalized sampling weights, one per country above (same row layout as the
# labels). The final weight is tiny, so the last country is almost never drawn.
raw_country_probs = [
    0.112, 0.063, 0.052, 0.045, 0.039, 0.035, 0.071, 0.027, 0.022, 0.021,
    0.019, 0.018, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
    0.011, 0.010, 0.0095, 0.009, 0.0085, 0.008, 0.0075, 0.0072,
    0.0068, 0.0065, 0.0062, 0.006, 0.0000000003
]

# Rescale the weights to sum to 1, as np.random.choice requires for `p`.
country_probs = np.array(raw_country_probs) / sum(raw_country_probs)
country_choices = np.random.choice(countries, size=num_samples, p=country_probs)

# Plan length in months, drawn uniformly from the three listed durations.
plan_durations = np.random.choice([1, 3, 12], size=num_samples)

# One join-date string (DD-MM-YY) per sampled device.
join_dates = [generate_join_date(d) for d in device_choices]

# Last payment date = join date + a random 60..365 day offset (both ends
# inclusive; randint's upper bound is exclusive, hence 366).
# The offsets come from np.random like every other draw in this script, so a
# single np.random.seed() call governs the whole dataset, and the date
# arithmetic is one vectorized pandas operation instead of a per-row
# strptime/strftime round trip in Python.
# NOTE(review): the offset is not capped, so a last payment date can fall after
# the 20 Mar 2025 upper bound used for join dates -- confirm this is intended.
payment_offsets = np.random.randint(60, 366, size=len(join_dates))
last_payment_dates = (
    pd.to_datetime(join_dates, format="%d-%m-%y")
    + pd.to_timedelta(payment_offsets, unit="D")
).strftime('%d-%m-%y').tolist()

def get_revenue(plan):
    """Return the ``(monthly, quarterly, annual)`` revenue figures for a plan.

    Args:
        plan: Plan duration in months. ``1`` and ``3`` have dedicated
            figures; every other value gets the third set of figures.

    Returns:
        A 3-tuple of integers: monthly, quarterly and annual revenue.
    """
    revenue_by_plan = {
        1: (7500, 18800, 60000),
        3: (8000, 20000, 62000),
    }
    # Any duration not listed above falls through to the last tier.
    return revenue_by_plan.get(plan, (9000, 22000, 65000))

# Look up the revenue triple for every row's plan, then transpose the rows
# into three parallel column tuples.
monthly_rev, quarterly_rev, annual_rev = zip(*map(get_revenue, plan_durations))

# Subscription tier label -> sampling probability; the weights sum to 1.
# NOTE(review): "Delux" may be intended as "Deluxe"; the label is emitted into
# the CSV as-is, so confirm with downstream consumers before renaming it.
subscription_weights = {"Delux": 0.6, "Special": 0.15, "Essential": 0.25}
subscription_types = np.random.choice(
    list(subscription_weights),
    size=num_samples,
    p=list(subscription_weights.values()),
)

# Assemble the dataset: one row per generated user, with a sequential
# 0-based 'User ID' and the columns sampled above.
df = pd.DataFrame({
    'User ID': range(num_samples),
    'Subscription Type': subscription_types,
    'Monthly Revenue': monthly_rev,
    'Quarterly Revenue': quarterly_rev,
    'Annual Revenue': annual_rev,
    'Join Date': join_dates,
    'Last Payment Date': last_payment_dates,
    'Country': country_choices,
    'Age': ages,
    'Gender': genders,
    'Device': device_choices,
    'Plan Duration': [f"{p} Month" for p in plan_durations]
})

# Create the target directory before writing: to_csv raises OSError when the
# parent directory does not exist, which would throw away the whole generated
# dataset at the very last step.
output_path = Path("./data/psn.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)