In [1]:
# pip install duckdb

In [2]:
# pip install torch-directml

### This file is used to synthesize data that mimics the existing patterns of a dataset. For project purposes sake, we can synthesize data to more realistically represent real-world data. 

This currently only applies to the feature_usage.csv

In [3]:
import duckdb 
import os
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
import uuid
from sdv.metadata import Metadata
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.sampling import Condition
import torch
import torch_directml



con = duckdb.connect()

base_dir = Path("C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis")
data_dir = Path(base_dir / "raw_data/Kaggle")

In [4]:
pd.set_option('display.max_rows', 10)

In [5]:
df = con.execute(f"""
    SELECT *
    FROM '{data_dir}/ravenstack_feature_usage.csv'
                 """).df()

In [6]:
# Check if DirectML is available
if torch_directml.is_available():
    device = torch_directml.device()
    print(f"Success! Using device: {device}")
    
    # Create a tensor on your AMD GPU
    x = torch.ones(3).to(device)
    print("Tensor on AMD GPU:", x)
else:
    print("DirectML not found. Check your installation.")

Success! Using device: privateuseone:0
Tensor on AMD GPU: tensor([1., 1., 1.], device='privateuseone:0')


### Introduce synthetic data to feature_usage since we do not have enough rows to replicate realistic data

In [7]:
RELEASE_TS = pd.Timestamp("2025-10-01")
INTRODUCED_FEATURE = "feature_new_ai"
TOTAL_ROWS = 50000
POST_RELEASE_LIFT = 3.0
TABLE = "usage_events"

df = df.copy()

# types
df["usage_date"] = pd.to_datetime(df["usage_date"], errors="coerce")
df["feature_name"] = df["feature_name"].replace("None", pd.NA)
df = df.dropna(subset=["feature_name"])

df["subscription_id"] = df["subscription_id"].astype(str)
df["feature_name"] = df["feature_name"].astype(str)
df["is_beta_feature"] = df["is_beta_feature"].astype(bool)

for c in ["usage_count", "usage_duration_secs", "error_count"]:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

# release flag
df["post_release"] = (df["usage_date"] >= RELEASE_TS)

# ensure introduced feature exists
if INTRODUCED_FEATURE not in set(df["feature_name"].unique()):
    seed_n = max(10, int(0.002 * len(df)))
    seed = df.sample(seed_n, random_state=42).copy()
    seed["feature_name"] = INTRODUCED_FEATURE
    seed["post_release"] = True
    seed["is_beta_feature"] = False
    df = pd.concat([df, seed], ignore_index=True)

# CRITICAL: create guaranteed-unique PK AFTER all concatenations
df["usage_pk"] = [uuid.uuid4().hex for _ in range(len(df))]

# metadata
metadata = Metadata.detect_from_dataframe(data=df, table_name=TABLE)
metadata.update_column(table_name=TABLE, column_name="usage_pk", sdtype="id")
metadata.set_primary_key(table_name=TABLE, column_name="usage_pk")
metadata.update_column(table_name=TABLE, column_name="usage_date", sdtype="datetime")

# fit
synth = GaussianCopulaSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True)
synth.fit(df)

# sample baseline
syn_base = synth.sample(num_rows=TOTAL_ROWS)

# oversample introduced feature post-release
post_df = df[df["post_release"] == True]
post_share = len(post_df) / len(df) if len(df) else 0.5
post_n = int(TOTAL_ROWS * post_share)

base_feat_share_post = (post_df["feature_name"] == INTRODUCED_FEATURE).mean() if len(post_df) else 0.0
base_feat_n_post = int(post_n * base_feat_share_post)
target_feat_n_post = int(base_feat_n_post * POST_RELEASE_LIFT)
add_n = max(0, target_feat_n_post - base_feat_n_post)

if add_n > 0:
    cond = Condition({"post_release": True, "feature_name": INTRODUCED_FEATURE}, num_rows=add_n)
    syn_lift = synth.sample_from_conditions([cond])
    synthetic_df = pd.concat([syn_base, syn_lift], ignore_index=True)
else:
    synthetic_df = syn_base.copy()

# final unique id for synthetic dataset
synthetic_df["usage_id"] = [f"syn_usage_{i}" for i in range(len(synthetic_df))]


Sampling conditions: 100%|██████████| 198/198 [00:00<00:00, 2211.73it/s]


In [8]:
# ----------------------------
# A) Baseline feature adoption table (from df)
# ----------------------------
df_base = df.copy()

# Clean feature names robustly (handles "None", None, "", whitespace)
df_base["feature_name"] = (
    df_base["feature_name"]
    .astype("string")
    .str.strip()
    .replace({"None": pd.NA, "": pd.NA})
)
df_base = df_base.dropna(subset=["feature_name"])

# Safety: ensure numerics are numeric (prevents groupby sum/median issues)
df_base["usage_count"] = pd.to_numeric(df_base["usage_count"], errors="coerce").fillna(0)
df_base["usage_duration_secs"] = pd.to_numeric(df_base["usage_duration_secs"], errors="coerce").fillna(0)

# Denominator
total_subs = df_base["subscription_id"].nunique()

# Aggregate per (feature, subscription) so power users don't dominate
feature_sub_agg = (
    df_base
    .groupby(["feature_name", "subscription_id"], as_index=False)
    .agg(
        total_usage_count=("usage_count", "sum"),
        total_usage_duration=("usage_duration_secs", "sum"),
    )
)

# Feature-level baseline table
feature_baseline = (
    feature_sub_agg
    .groupby("feature_name", as_index=False)
    .agg(
        subscriptions_used=("subscription_id", "nunique"),
        median_usage_count_per_sub=("total_usage_count", "median"),
        median_usage_duration_secs_per_sub=("total_usage_duration", "median"),
    )
)

# % of subscriptions used
feature_baseline["pct_subscriptions_used"] = (
    feature_baseline["subscriptions_used"] / total_subs * 100
).round(2)

# Optional rounding/typing
feature_baseline["median_usage_duration_secs_per_sub"] = (
    feature_baseline["median_usage_duration_secs_per_sub"]
    .fillna(0)
    .round(0)
    .astype(int)
)

feature_baseline = feature_baseline.sort_values("pct_subscriptions_used", ascending=False)

# ----------------------------
# B) Synthetic df cleanup + realistic usage_date generation
# ----------------------------
# Fix: str methods break if columns aren't strings; also "sdv-" replacement should be literal (regex=False)
for col, prefix in [("usage_id", "syn_"), ("usage_pk", "sdv-")]:
    if col in synthetic_df.columns:
        synthetic_df[col] = (
            synthetic_df[col]
            .astype("string")
            .str.replace(prefix, "", regex=False)
        )

# Create realistic usage_date based on one anchor per subscription + random offset
start = pd.Timestamp("2023-06-12")
end   = pd.Timestamp("2024-12-31")

# one anchor date per subscription (normalized to date)
subs = synthetic_df[["subscription_id"]].drop_duplicates()

anchors = subs.assign(
    anchor=pd.to_datetime(
        np.random.randint(
            start.value // 10**9,
            end.value   // 10**9,
            size=len(subs)
        ),
        unit="s",
        utc=True,  # avoids timezone surprises
    ).tz_convert(None).normalize()  # back to naive midnight
)

synthetic_df = synthetic_df.merge(anchors, on="subscription_id", how="left")

# each event occurs 0–60 days after the anchor
synthetic_df["usage_date"] = (
    synthetic_df["anchor"] + pd.to_timedelta(np.random.randint(0, 61, size=len(synthetic_df)), unit="D")
)

synthetic_df = synthetic_df.drop(columns=["anchor"])

synthetic_df['usage_id'] = synthetic_df['usage_id'].str.replace('syn_', "", regex=False)
synthetic_df['usage_pk'] = synthetic_df['usage_pk'].str.replace('sdv-',"")

### Ensure that we have enough usage dates for our new feature, just so that we have realistic dataset

In [9]:
df = synthetic_df

users_once = (
    df[df["feature_name"] == "feature_newai"]
    ["subscription_id"]
    .unique()
)

synthetic_rows = []

for user in users_once:
    base_date = df.loc[
        df["subscription_id"] == user, "usage_day"
    ].dropna().min()

    if pd.isna(base_date):
        continue

    roll = np.random.rand()

    if roll < 0.6:
        days = 1
    elif roll < 0.85:
        days = 2
    else:
        days = np.random.randint(3, 6)

    for d in range(1, days):
        synthetic_rows.append({
            "subscription_id": user,
            "feature_name": "feature_newai",
            "usage_day": base_date + timedelta(days=d),
            "synthetic": True
        })

synthetic_df = pd.DataFrame(synthetic_rows)
df_augmented = pd.concat([df, synthetic_df], ignore_index=True)

In [10]:
# -----------------------------
# SETTINGS YOU CAN TWEAK
# -----------------------------
FEATURE = "feature_newai"

ADD_NEW_USERS = 200        # reach: how many brand-new subscription_ids to add with 1-day usage
PROMOTE_USERS = 150        # depth: how many users (existing + new) to push into 2/3/4 days

# adoption distribution for promoted users (must sum to 1.0)
# "Most 1 day" applies naturally; promotion assigns targets for a subset
P_2DAY = 0.70              # most promoted -> 2 days
P_3DAY = 0.25              # fewer -> 3 days
P_4DAY = 0.05              # only a few -> 4 days

SEED = 42
rng = np.random.default_rng(SEED)


# -----------------------------
# 1) CLEAN DATES
# -----------------------------
feats = df_augmented.copy()  # (overwrites variable feats; keeps your original object safe if needed)
feats["synthetic"] = feats.get("synthetic", False)

feats["usage_date"] = feats["usage_date"].astype(str).str.strip()
feats["usage_date"] = pd.to_datetime(feats["usage_date"], errors="coerce")
feats["_usage_day"] = feats["usage_date"].dt.normalize()


# -----------------------------
# 2) TRAIN SDV ON EXISTING feature_newai ROWS (fallback to all rows if too few)
# -----------------------------
train = feats[(feats["feature_name"] == FEATURE) & feats["subscription_id"].notna() & feats["_usage_day"].notna()].copy()

# If too sparse, SDV will learn garbage. Fallback to all rows (still SDV, just broader behavior).
if len(train) < 50:
    train = feats[feats["subscription_id"].notna() & feats["_usage_day"].notna()].copy()
    print(f"Warning: feature-specific rows are sparse. Training SDV on broader data: {len(train)} rows.")

exclude_cols = {"subscription_id", "usage_id", "usage_pk", "synthetic"}
model_cols = [c for c in train.columns if c not in exclude_cols and c != "_usage_day"]

model_df = train[model_cols].copy()

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(model_df)

synth = GaussianCopulaSynthesizer(metadata)
synth.fit(model_df)


# -----------------------------
# 3) DATE WINDOW FOR REALISM
# -----------------------------
global_min = feats["_usage_day"].min()
global_max = feats["_usage_day"].max()
if pd.isna(global_min) or pd.isna(global_max):
    raise ValueError("No valid date window after parsing usage_date.")

date_range = pd.date_range(global_min, global_max, freq="D")


# -----------------------------
# 4) HELPERS
# -----------------------------
def make_sub_ids(n, existing):
    existing = set(existing)
    new_ids = []
    while len(new_ids) < n:
        cid = "S-" + uuid.uuid4().hex[:6]
        if cid not in existing:
            existing.add(cid)
            new_ids.append(cid)
    return new_ids

def make_usage_id():
    return "U-syn" + uuid.uuid4().hex[:8]

def make_usage_pk():
    return "id_syn_" + uuid.uuid4().hex[:10]

def pick_target_days(n):
    # promoted users get 2/3/4 day targets only, per distribution
    choices = np.array([2, 3, 4])
    probs = np.array([P_2DAY, P_3DAY, P_4DAY])
    probs = probs / probs.sum()
    return rng.choice(choices, size=n, replace=True, p=probs)


# -----------------------------
# 5) REACH: ADD NEW USERS (each gets exactly 1 day of feature usage)
# -----------------------------
existing_all_users = feats["subscription_id"].dropna().unique()
new_users = make_sub_ids(ADD_NEW_USERS, existing_all_users)

new_user_rows = synth.sample(ADD_NEW_USERS)

# enforce feature
new_user_rows["feature_name"] = FEATURE

# assign each new user exactly one usage day
new_days = rng.choice(date_range, size=ADD_NEW_USERS, replace=True)
new_user_rows["usage_date"] = pd.to_datetime(new_days)

# assign identifiers
new_user_rows["subscription_id"] = new_users
new_user_rows["usage_id"] = [make_usage_id() for _ in range(ADD_NEW_USERS)]
new_user_rows["usage_pk"] = [make_usage_pk() for _ in range(ADD_NEW_USERS)]
new_user_rows["synthetic"] = True


# -----------------------------
# 6) DEPTH: PROMOTE USERS TO 2/3/4 DISTINCT DAYS (NEVER > 4)
# -----------------------------
# Build current used-day sets for feature users
feat_only = feats[(feats["feature_name"] == FEATURE) & feats["subscription_id"].notna() & feats["_usage_day"].notna()].copy()
used_days = feat_only.groupby("subscription_id")["_usage_day"].apply(lambda s: set(s.tolist())).to_dict()

# include new users as having 1 day already
for uid, day in zip(new_users, pd.to_datetime(new_days).normalize()):
    used_days[uid] = set([day])

# promotion pool: users with <4 distinct days (existing + new)
promotion_pool = [u for u, days in used_days.items() if len(days) < 4]
if len(promotion_pool) == 0:
    print("No users eligible for promotion (<4 days). Skipping depth synthesis.")
    promo_rows = pd.DataFrame()
else:
    rng.shuffle(promotion_pool)
    promote_n = min(PROMOTE_USERS, len(promotion_pool))
    promote_users = promotion_pool[:promote_n]

    targets = pick_target_days(promote_n)

    promo_records = []
    for uid, target in zip(promote_users, targets):
        current_days = used_days.get(uid, set())
        # cap target at 4 and never below current
        target = int(min(4, max(target, len(current_days))))
        needed = target - len(current_days)
        if needed <= 0:
            continue

        # sample event templates via SDV
        templates = synth.sample(needed)
        templates["feature_name"] = FEATURE
        templates["subscription_id"] = uid
        templates["synthetic"] = True
        templates["usage_id"] = [make_usage_id() for _ in range(needed)]
        templates["usage_pk"] = [make_usage_pk() for _ in range(needed)]

        # choose NEW distinct days for this user (after their earliest day)
        base_day = min(current_days) if current_days else global_min
        new_distinct = set()

        attempts = 0
        while len(new_distinct) < needed and attempts < 5000:
            # keep repeats realistic: within 1–30 days after first use
            offset = int(rng.integers(1, 31))
            cand = (base_day + pd.Timedelta(days=offset)).normalize()
            if cand > global_max:
                cand = global_max.normalize()

            if cand not in current_days and cand not in new_distinct:
                new_distinct.add(cand)
            attempts += 1

        new_distinct = sorted(new_distinct)
        templates["usage_date"] = pd.to_datetime(new_distinct)

        # update tracking
        used_days[uid] = current_days.union(new_distinct)

        promo_records.append(templates)

    promo_rows = pd.concat(promo_records, ignore_index=True) if promo_records else pd.DataFrame()


# -----------------------------
# 7) APPEND SYNTHETIC ROWS INTO feats (overwrite feats variable)
# -----------------------------
to_add = pd.concat([new_user_rows, promo_rows], ignore_index=True)

# align columns
for c in feats.columns:
    if c not in to_add.columns:
        to_add[c] = np.nan
to_add = to_add[feats.columns]

feats = pd.concat([feats.drop(columns=["_usage_day"]), to_add], ignore_index=True)

# recompute day helper (optional, but nice)
feats["usage_date"] = pd.to_datetime(feats["usage_date"], errors="coerce")
feats["usage_day"] = feats["usage_date"].dt.normalize()


# -----------------------------
# 8) VALIDATE DISTRIBUTION
# -----------------------------
user_days = (
    feats[feats["feature_name"] == FEATURE]
    .dropna(subset=["subscription_id", "usage_day"])
    .groupby("subscription_id")["usage_day"]
    .nunique()
    .reset_index(name="distinct_days")
)





In [11]:
feats.to_csv(data_dir / 'feature_usage.csv')

In [12]:
df_augmented

Unnamed: 0,usage_id,subscription_id,usage_date,feature_name,usage_count,usage_duration_secs,error_count,is_beta_feature,post_release,usage_pk
0,usage_0,S-38131a,2024-12-11,feature_25,14,3782,2,False,False,id-gKDvFC
1,usage_1,S-712e69,2023-10-09,feature_30,8,2553,0,True,False,id-GPcfRg
2,usage_2,S-9c09cb,2024-08-18,feature_13,13,1860,0,False,False,id-DFPSav
3,usage_3,S-2dbf26,2024-09-19,feature_33,9,834,2,False,False,id-QgdjPG
4,usage_4,S-d114e2,2024-06-12,feature_4,9,1300,1,False,False,id-xbbuep
...,...,...,...,...,...,...,...,...,...,...
50193,usage_50193,S-104941,2023-12-20,feature_new_ai,8,2211,2,False,True,id-BqttoS
50194,usage_50194,S-5cb8a6,2024-01-23,feature_new_ai,11,5980,0,False,True,id-nYuaKD
50195,usage_50195,S-b7ef4c,2023-12-07,feature_new_ai,5,702,1,True,True,id-VBoAzR
50196,usage_50196,S-d11408,2024-11-17,feature_new_ai,6,3684,0,True,True,id-IVdsls


In [13]:
percent_interacted = df_augmented.loc[
    df_augmented['feature_name'] == 'feature_new_ai',
    'subscription_id'
].nunique() / df_augmented['subscription_id'].nunique()

percent_interacted

0.05918119173084718

In [14]:
df_augmented['subscription_id'].nunique()

4934

About 6% of subscriptions interacetd with the new feature 'feature_new_ai'. This is sufficient enough to be considered realistic

In [15]:
df = pd.read_csv(data_dir / 'ravenstack_subscriptions.csv')

In [16]:
len(df)

5000

In [17]:
df_augmented.loc[
    ~df_augmented['subscription_id'].isin(df['subscription_id']),
    'subscription_id'
].unique()

array([], dtype=object)

I see that each subscription_id that is in synthetic appears at least once in the subscriptions.csv . Therefore, we can get link any subscription_id from feature_usage.csv to subcriptions.csv