In [1]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

# ----------------------------
# 1. Generate USERS dataset
# ----------------------------

# Seed NumPy's legacy global RNG so every draw in this cell is reproducible.
np.random.seed(42)

num_users = 1000

# Category vocabularies; the sampling weights are given inline where each
# vocabulary is drawn from.
countries = ["IN", "US", "UK", "BR"]
subscriptions = ["Basic", "Standard", "Premium"]
preferred_devices = ["Mobile", "Laptop", "TV"]

# Start from the sequential ids (1..num_users) and attach one sampled
# attribute per column. Keyword arguments are evaluated top to bottom, so the
# RNG is consumed in the order country -> subscription -> device -> signup.
users = pd.DataFrame({"user_id": np.arange(1, num_users + 1)}).assign(
    country=np.random.choice(
        countries, size=num_users, p=[0.40, 0.30, 0.15, 0.15]
    ),
    subscription_type=np.random.choice(
        subscriptions, size=num_users, p=[0.30, 0.40, 0.30]
    ),
    device_preferred=np.random.choice(
        preferred_devices, size=num_users, p=[0.50, 0.25, 0.25]
    ),
    # Signup date = 2021-01-01 plus a uniform offset of 0..899 whole days.
    signup_date=pd.Timestamp("2021-01-01")
    + pd.to_timedelta(np.random.randint(0, 900, size=num_users), unit="D"),
)

# ----------------------------
# 2. Generate SESSIONS dataset
# ----------------------------

num_sessions = 5000

genres = ["Action", "Thriller", "Comedy", "Drama", "Sci-Fi", "Horror"]
content_types = ["Movie", "Series"]
# Vocabulary of time-of-day labels. The loop below assigns the labels through
# explicit hour thresholds and does not read this list.
time_of_day_bins = ["Morning", "Afternoon", "Evening", "Night"]
# The trailing None stands for "no bounce"; it is excluded when a reason is
# sampled for a bounced session.
bounce_reasons = ["UI Confusion", "No Interest", "Preview Too Short", "Playback Error", None]

session_list = []

# Build one record per session. The order of the np.random calls inside the
# loop determines the generated data for a given RNG state, so keep it stable.
for i in range(1, num_sessions + 1):

    # Randomly pick a user id in 1..num_users for this session. Only the id is
    # recorded; no other user attribute influences the session, so no look-up
    # into the users table is needed here.
    uid = np.random.randint(1, num_users + 1)

    # A/B group assignment.
    # NOTE(review): the group is drawn independently for every session, so the
    # same user_id can appear in both A and B. Confirm that session-level
    # randomisation (rather than user-level) is what the analysis expects.
    group = np.random.choice(["A", "B"])

    # Controlled bias for A/B behaviour: B has a higher watch probability and
    # a higher completion probability than A.
    base_watch_prob = 0.65 if group == "A" else 0.75
    base_completion_prob = 0.52 if group == "A" else 0.62

    watched = np.random.choice([0, 1], p=[1 - base_watch_prob, base_watch_prob])
    # Completion is only sampled when the session was watched; otherwise it is 0.
    watch_completion = np.random.choice([0, 1], p=[1 - base_completion_prob, base_completion_prob]) if watched == 1 else 0

    # Session properties: length is a normal draw (mean 12 for A, 14 for B,
    # std 4) floored at 5 and rounded to 2 decimals; clicks are Poisson with
    # rate 2 for A and 1.5 for B.
    session_length = round(max(5, np.random.normal(12 if group == "A" else 14, 4)), 2)
    clicks_before_watch = np.random.poisson(2 if group == "A" else 1.5)

    # Device is sampled independently of the user's preferred device, so the
    # two may differ.
    device_type = np.random.choice(["Mobile", "Laptop", "TV"], p=[0.55, 0.20, 0.25])

    content_type = np.random.choice(content_types)
    genre = np.random.choice(genres)

    # Session start: 2024-01-01 plus 0..89 days and 0..23 whole hours.
    session_start = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 90),
                                                     hours=np.random.randint(0, 24))
    # Bucket the start hour: 0-11 Morning, 12-16 Afternoon, 17-20 Evening,
    # 21-23 Night.
    # NOTE(review): hours 0-5 are therefore labelled "Morning" -- confirm intended.
    hour = session_start.hour
    if hour < 12:
        tod = "Morning"
    elif hour < 17:
        tod = "Afternoon"
    elif hour < 21:
        tod = "Evening"
    else:
        tod = "Night"

    # A bounce reason is recorded only when the session was NOT watched.
    b_reason = None
    if watched == 0:
        b_reason = np.random.choice(bounce_reasons[:-1])  # exclude None

    session_list.append([
        i,
        uid,
        group,
        session_start,
        session_length,
        watched,
        watch_completion,
        clicks_before_watch,
        device_type,
        content_type,
        genre,
        tod,
        b_reason
    ])

# Materialise all records at once; column order matches the record layout above.
sessions = pd.DataFrame(session_list, columns=[
    "session_id", "user_id", "group", "session_start", "session_length",
    "watched", "watch_completion", "clicks_before_watch",
    "device_type", "content_type", "genre", "time_of_day", "bounce_reason"
])

# ----------------------------
# 3. Save both CSVs
# ----------------------------

# Write each table to its CSV file in the working directory, leaving out the
# pandas index column.
for csv_name, frame in (("users.csv", users), ("sessions.csv", sessions)):
    frame.to_csv(csv_name, index=False)

# Cell output: the first five rows of both tables, shown as a tuple.
(users.head(), sessions.head())


(   user_id country subscription_type device_preferred signup_date
 0        1      IN             Basic           Mobile  2023-06-02
 1        2      BR          Standard           Mobile  2021-11-09
 2        3      UK           Premium               TV  2023-04-20
 3        4      US           Premium           Mobile  2022-06-03
 4        5      IN           Premium           Mobile  2021-07-25,
    session_id  user_id group       session_start  session_length  watched  \
 0           1      849     B 2024-02-17 17:00:00           13.80        1   
 1           2      912     A 2024-01-04 06:00:00            6.38        1   
 2           3       72     A 2024-02-26 05:00:00           12.93        1   
 3           4      206     B 2024-03-09 19:00:00           13.12        1   
 4           5      345     A 2024-02-18 19:00:00           13.63        0   
 
    watch_completion  clicks_before_watch device_type content_type     genre  \
 0                 0                    2      

In [2]:
# Offer the generated CSVs as browser downloads when running in Google Colab.
# The import is guarded so that running the whole notebook on a kernel without
# the google.colab package does not stop with an ImportError at this cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('users.csv')
    files.download('sessions.csv')
else:
    print("google.colab is unavailable; skipping browser download of users.csv and sessions.csv.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>