In [11]:
# pip install duckdb

In [12]:
# pip install torch-directml

### This notebook synthesizes data that mimics the existing patterns of a dataset. For the purposes of this project, synthesizing data lets us represent real-world usage more realistically.

This currently applies only to the feature usage data (`feature_usage.csv`).

In [13]:
import duckdb 
import os
import pandas as pd
import numpy as np
from pathlib import Path


# In-memory DuckDB connection (":memory:" is the default database when none is given).
con = duckdb.connect(database=":memory:")

# Project root. Defaults to the author's local checkout, but can be overridden with the
# REVENUE_SUSTAINABILITY_DIR environment variable so the notebook also runs on other machines.
base_dir = Path(
    os.environ.get(
        "REVENUE_SUSTAINABILITY_DIR",
        "C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis",
    )
)
# Folder with the raw Kaggle CSV files that this notebook reads from and writes to.
data_dir = base_dir / "raw_data" / "Kaggle"

In [14]:
# Keep rendered DataFrame previews short: show at most 10 rows.
pd.options.display.max_rows = 10

In [15]:
# Read the raw feature-usage CSV through DuckDB and materialise it as a pandas DataFrame.
feature_usage_csv = f"{data_dir}/ravenstack_feature_usage.csv"
usage_query = f"""
    SELECT *
    FROM '{feature_usage_csv}'
                 """
df = con.execute(usage_query).df()

In [16]:
import torch
import torch_directml
import pandas as pd
import uuid
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.sampling import Condition

# Smoke test for the DirectML backend: report whether it is usable and, if so,
# place a small tensor on the DirectML device to prove the round trip works.
if not torch_directml.is_available():
    print("DirectML not found. Check your installation.")
else:
    device = torch_directml.device()
    print(f"Success! Using device: {device}")

    # Allocate a tiny tensor of ones and move it onto the DirectML device.
    x = torch.ones(3).to(device)
    print("Tensor on AMD GPU:", x)

Success! Using device: privateuseone:0
Tensor on AMD GPU: tensor([1., 1., 1.], device='privateuseone:0')


In [17]:
# --- Scenario parameters -------------------------------------------------
RELEASE_TS = pd.Timestamp("2025-10-01")   # events on/after this timestamp count as post-release
INTRODUCED_FEATURE = "feature_new_ai"     # feature whose post-release usage gets boosted
TOTAL_ROWS = 50_000                       # number of baseline synthetic rows to sample
POST_RELEASE_LIFT = 3.0                   # multiplier applied to the feature's post-release row count
TABLE = "usage_events"                    # table name used in the SDV metadata

# Work on a copy so the frame loaded above is not mutated in place.
df = df.copy()

# Parse dates; unparseable values become NaT.
df["usage_date"] = pd.to_datetime(df["usage_date"], errors="coerce")

# Treat the literal string "None" as missing and drop rows without a feature name.
df["feature_name"] = df["feature_name"].replace("None", pd.NA)
df = df.dropna(subset=["feature_name"])

# Fix the dtypes of the identifier / flag columns.
df = df.astype({"subscription_id": str, "feature_name": str, "is_beta_feature": bool})

# Counters: coerce to numbers, fill gaps with 0, store as integers.
for numeric_col in ("usage_count", "usage_duration_secs", "error_count"):
    as_number = pd.to_numeric(df[numeric_col], errors="coerce")
    df[numeric_col] = as_number.fillna(0).astype(int)

# Flag events dated on or after the release (NaT compares as False).
df["post_release"] = df["usage_date"].ge(RELEASE_TS)

# Ensure the introduced feature exists in the training frame: if no row carries it yet,
# relabel a random sample of existing rows as post-release usage of that feature.
if not df["feature_name"].eq(INTRODUCED_FEATURE).any():
    # Seed roughly 0.2% of the rows, at least 10, but never more rows than the frame holds
    # (DataFrame.sample without replacement raises if asked for more rows than exist).
    seed_n = min(len(df), max(10, int(0.002 * len(df))))
    seed = df.sample(seed_n, random_state=42).copy()
    seed["feature_name"] = INTRODUCED_FEATURE
    seed["post_release"] = True
    seed["is_beta_feature"] = False
    # NOTE(review): usage_date on the seed rows is left as sampled, so it may predate
    # RELEASE_TS even though post_release is forced to True — confirm this is intended.
    df = pd.concat([df, seed], ignore_index=True)

# CRITICAL: create guaranteed-unique PK AFTER all concatenations.
# Every row gets a fresh random uuid4 hex string, so the key differs on every run and
# rows appended by the seeding step above cannot collide with the rows they were copied from.
df["usage_pk"] = [uuid.uuid4().hex for _ in range(len(df))]

# metadata: start from SDV's auto-detected column types, then override the two columns
# that matter here — usage_pk is an id and the table's primary key, usage_date is a datetime.
metadata = Metadata.detect_from_dataframe(data=df, table_name=TABLE)
metadata.update_column(table_name=TABLE, column_name="usage_pk", sdtype="id")
metadata.set_primary_key(table_name=TABLE, column_name="usage_pk")
metadata.update_column(table_name=TABLE, column_name="usage_date", sdtype="datetime")

# fit the Gaussian copula synthesizer on the prepared frame.
# NOTE(review): enforce_min_max_values / enforce_rounding presumably keep sampled values
# within the observed ranges and precision — confirm against the SDV documentation.
synth = GaussianCopulaSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True)
synth.fit(df)

# sample baseline: TOTAL_ROWS unconditioned synthetic rows.
syn_base = synth.sample(num_rows=TOTAL_ROWS)

# Oversample the introduced feature among post-release rows.
# Step 1: how many of the TOTAL_ROWS baseline rows are expected to be post-release.
post_df = df.loc[df["post_release"]]
if len(df):
    post_share = len(post_df) / len(df)
else:
    post_share = 0.5
post_n = int(TOTAL_ROWS * post_share)

# Step 2: expected baseline count of the introduced feature within those rows,
# and the extra rows needed to reach POST_RELEASE_LIFT times that count.
if len(post_df):
    base_feat_share_post = (post_df["feature_name"] == INTRODUCED_FEATURE).mean()
else:
    base_feat_share_post = 0.0
base_feat_n_post = int(post_n * base_feat_share_post)
target_feat_n_post = int(base_feat_n_post * POST_RELEASE_LIFT)
add_n = max(0, target_feat_n_post - base_feat_n_post)

# Step 3: sample the extra rows conditionally and append them to the baseline sample.
if add_n <= 0:
    synthetic_df = syn_base.copy()
else:
    cond = Condition({"post_release": True, "feature_name": INTRODUCED_FEATURE}, num_rows=add_n)
    syn_lift = synth.sample_from_conditions([cond])
    synthetic_df = pd.concat([syn_base, syn_lift], ignore_index=True)

# Give every synthetic row a sequential identifier: syn_usage_0, syn_usage_1, ...
synthetic_df["usage_id"] = [f"syn_usage_{row_no}" for row_no in range(len(synthetic_df))]


Sampling conditions: 100%|██████████| 198/198 [00:00<00:00, 2186.17it/s]


In [18]:
# ============================
# FIXED: baseline table + synthetic cleanup + realistic usage_date
# ============================

import pandas as pd
import numpy as np

# ----------------------------
# A) Baseline feature adoption table (from df)
# ----------------------------
df_base = df.copy()

# Normalise feature names: trim whitespace, then treat "None" and "" as missing.
cleaned_names = df_base["feature_name"].astype("string").str.strip()
df_base["feature_name"] = cleaned_names.replace({"None": pd.NA, "": pd.NA})
df_base = df_base.dropna(subset=["feature_name"])

# Make sure the measures are numeric before summing / taking medians.
for numeric_col in ("usage_count", "usage_duration_secs"):
    df_base[numeric_col] = pd.to_numeric(df_base[numeric_col], errors="coerce").fillna(0)

# Denominator for the adoption percentage: distinct subscriptions in the cleaned frame.
total_subs = df_base["subscription_id"].nunique()

# First roll events up to one row per (feature, subscription) so that a few
# heavy-usage subscriptions cannot dominate the per-feature medians.
per_sub_totals = {
    "total_usage_count": ("usage_count", "sum"),
    "total_usage_duration": ("usage_duration_secs", "sum"),
}
feature_sub_agg = (
    df_base
    .groupby(["feature_name", "subscription_id"], as_index=False)
    .agg(**per_sub_totals)
)

# Then summarise per feature: adopting subscriptions and median per-subscription usage.
per_feature_stats = {
    "subscriptions_used": ("subscription_id", "nunique"),
    "median_usage_count_per_sub": ("total_usage_count", "median"),
    "median_usage_duration_secs_per_sub": ("total_usage_duration", "median"),
}
feature_baseline = (
    feature_sub_agg
    .groupby("feature_name", as_index=False)
    .agg(**per_feature_stats)
)

# Share of all subscriptions that used the feature, in percent with two decimals.
feature_baseline["pct_subscriptions_used"] = (
    feature_baseline["subscriptions_used"].div(total_subs).mul(100).round(2)
)

# Report the median duration as whole seconds.
duration_col = "median_usage_duration_secs_per_sub"
feature_baseline[duration_col] = feature_baseline[duration_col].fillna(0).round(0).astype(int)

# Most widely adopted features first.
feature_baseline = feature_baseline.sort_values("pct_subscriptions_used", ascending=False)

# ----------------------------
# B) Synthetic df cleanup + realistic usage_date generation
# ----------------------------
# Remove the literal "syn_" / "sdv-" markers from the id columns. Values are cast to the
# pandas string dtype first so the .str accessor works, and the markers are matched as
# plain text (regex=False), not as regular expressions.
literal_markers = {"usage_id": "syn_", "usage_pk": "sdv-"}
for col, prefix in literal_markers.items():
    if col not in synthetic_df.columns:
        continue
    as_text = synthetic_df[col].astype("string")
    synthetic_df[col] = as_text.str.replace(prefix, "", regex=False)

# Create realistic usage_date based on one anchor per subscription + random offset.
# A dedicated seeded generator makes the generated dates reproducible across re-runs
# for the same synthetic_df (the previous global np.random calls were unseeded).
date_rng = np.random.default_rng(42)

# Window from which the per-subscription anchor days are drawn.
start = pd.Timestamp("2023-01-02")
end   = pd.Timestamp("2024-12-31")

# one anchor date per subscription (normalized to date)
subs = synthetic_df[["subscription_id"]].drop_duplicates()

anchors = subs.assign(
    anchor=pd.to_datetime(
        date_rng.integers(
            start.value // 10**9,  # Timestamp.value is in nanoseconds -> epoch seconds
            end.value   // 10**9,  # upper bound is exclusive
            size=len(subs),
        ),
        unit="s",
        utc=True,  # avoids timezone surprises
    ).tz_convert(None).normalize()  # back to naive midnight
)

# Attach each subscription's anchor to all of its events.
synthetic_df = synthetic_df.merge(anchors, on="subscription_id", how="left")

# each event occurs 0–60 days after the anchor (so events can fall up to 60 days past `end`)
synthetic_df["usage_date"] = (
    synthetic_df["anchor"] + pd.to_timedelta(date_rng.integers(0, 61, size=len(synthetic_df)), unit="D")
)

# The anchor is only a helper column; drop it again.
synthetic_df = synthetic_df.drop(columns=["anchor"])

# Preview
feature_baseline.head(10)

Unnamed: 0,feature_name,subscriptions_used,median_usage_count_per_sub,median_usage_duration_secs_per_sub,pct_subscriptions_used
3,feature_12,624,10.0,3050,12.56
18,feature_26,616,10.0,2905,12.4
25,feature_32,614,10.0,3055,12.36
8,feature_17,613,10.0,2860,12.34
27,feature_34,613,10.0,2933,12.34
31,feature_38,610,10.0,2802,12.28
11,feature_2,607,10.0,2870,12.22
2,feature_11,606,10.0,3058,12.2
6,feature_15,606,10.0,2874,12.2
36,feature_6,604,10.0,2852,12.16


In [19]:
# Second pass over the id columns: strip any remaining "syn_" / "sdv-" markers.
# Both replacements are explicitly literal (regex=False) so the two lines behave the same
# way regardless of the pandas version's default for `regex`.
synthetic_df['usage_id'] = synthetic_df['usage_id'].str.replace('syn_', "", regex=False)
synthetic_df['usage_pk'] = synthetic_df['usage_pk'].str.replace('sdv-', "", regex=False)

In [20]:
# Persist the synthetic usage events next to the raw data.
# index=False: the positional row index carries no information and would otherwise be
# written as an extra unnamed first column that readers pick up as "Unnamed: 0".
synthetic_df.to_csv(data_dir / 'feature_usage.csv', index=False)

In [21]:
# Share of distinct subscriptions with at least one event for the introduced feature.
# Despite the name, the value is a 0–1 fraction, not a percentage.
# Uses the INTRODUCED_FEATURE constant instead of repeating the literal feature name, so
# this check keeps following the configured feature if that constant is changed.
used_new_feature = synthetic_df['feature_name'] == INTRODUCED_FEATURE
percent_interacted = (
    synthetic_df.loc[used_new_feature, 'subscription_id'].nunique()
    / synthetic_df['subscription_id'].nunique()
)

percent_interacted

0.05918119173084718

In [22]:
# Number of distinct (non-missing) subscriptions present in the synthetic data.
len(synthetic_df['subscription_id'].dropna().unique())

4934

About 6% of subscriptions interacted with the new feature 'feature_new_ai'. This is sufficient to be considered realistic.

In [23]:
# Load the subscriptions table. Note: this rebinds `df`, which held the usage events
# until now; from here on `df` refers to subscriptions.
subscriptions_csv = data_dir / 'ravenstack_subscriptions.csv'
df = pd.read_csv(subscriptions_csv)

In [24]:
# Row count of the subscriptions table.
df.shape[0]

5000

In [25]:
# Referential-integrity check: list synthetic subscription_ids that do not occur in the
# subscriptions table (an empty array means every id can be joined).
known_subscriptions = df['subscription_id']
orphan_mask = ~synthetic_df['subscription_id'].isin(known_subscriptions)
synthetic_df.loc[orphan_mask, 'subscription_id'].unique()

array([], dtype=object)

Every subscription_id in the synthetic data appears at least once in subscriptions.csv. Therefore, we can link any subscription_id from feature_usage.csv to subscriptions.csv.