In [32]:
# pip install duckdb

In [33]:
# pip install torch-directml

### This notebook synthesizes data that mimics the existing patterns of a dataset. For the purposes of this project, the added synthetic rows make the dataset more representative of real-world data.

This currently applies only to `ravenstack_feature_usage.csv`.

In [34]:
import duckdb 
import os
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
import uuid
from sdv.metadata import Metadata
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.sampling import Condition
import torch
import torch_directml

# DuckDB connection used by the query cell below to read the CSV.
# connect() is called without a database path here.
con = duckdb.connect()

# Project root. It defaults to the original author's local checkout so existing
# behaviour is unchanged; set the REVENUE_SUSTAINABILITY_DIR environment
# variable to run the notebook from any other machine or directory.
_DEFAULT_BASE_DIR = "C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis"
base_dir = Path(os.environ.get("REVENUE_SUSTAINABILITY_DIR", _DEFAULT_BASE_DIR))

# Folder holding the Kaggle CSV extracts, relative to the project root.
data_dir = base_dir / "raw_data" / "Kaggle"

In [35]:
# Keep DataFrame previews short: at most 10 rows are rendered per display.
pd.set_option("display.max_rows", 10)

In [36]:
# Read the whole feature-usage CSV through DuckDB and materialise it as a
# pandas DataFrame. The file path is interpolated from data_dir.
raw_usage_query = f"""
    SELECT *
    FROM '{data_dir}/ravenstack_feature_usage.csv'
                 """
query_result = con.execute(raw_usage_query)
df = query_result.df()

In [37]:
# Display the loaded feature-usage frame as the cell output.
df

Unnamed: 0,usage_id,subscription_id,usage_date,feature_name,usage_count,usage_duration_secs,error_count,is_beta_feature
0,U-1c6c24,S-0fcf7d,2023-07-27,feature_20,9,5004,0,False
1,U-f07cb8,S-c25263,2023-08-07,feature_5,9,369,0,False
2,U-096807,S-f29e7f,2023-12-07,feature_3,9,1458,0,False
3,U-6b1580,S-be655e,2024-07-28,feature_40,5,2085,0,False
4,U-720a29,S-f9b1d0,2024-12-02,feature_12,12,900,0,False
...,...,...,...,...,...,...,...,...
24995,U-134479,S-c249fb,2023-07-08,feature_16,7,4116,0,False
24996,U-2031ad,S-b83d8d,2023-03-29,feature_31,8,2240,1,False
24997,U-dd4ffc,S-ad7716,2024-10-03,feature_5,5,2745,0,False
24998,U-49d9e1,S-dbad62,2024-06-25,feature_5,7,1715,0,False


In [38]:
# Sanity-check the DirectML backend: report a missing install, otherwise
# place a small tensor on the DirectML device and print it.
if not torch_directml.is_available():
    print("DirectML not found. Check your installation.")
else:
    device = torch_directml.device()
    print(f"Success! Using device: {device}")

    # Create a tensor on your AMD GPU
    x = torch.ones(3).to(device)
    print("Tensor on AMD GPU:", x)

Success! Using device: privateuseone:0
Tensor on AMD GPU: tensor([1., 1., 1.], device='privateuseone:0')


### Add synthetic rows to feature_usage, since the original table does not have enough rows to resemble realistic data

In [39]:
import uuid
import numpy as np
import pandas as pd

from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

# ============================================================
# PURPOSE:
# - Take your original extracted table (df)
# - Clean + normalize it
# - Learn patterns from existing data (SDV)
# - Add more rows that look like the existing data
# - Specifically add EXTRA rows for feature_name == "feature_new_ai"
# - Enforce: ALL "feature_new_ai" usage_date >= 2023-06-12
# - Output ONE final table: feats
# ============================================================

# ----------------------------
# CONFIG
# ----------------------------
# Reproducibility: a single seeded generator is shared by the helpers below.
SEED = 42
rng = np.random.default_rng(SEED)

# Feature that receives the extra synthetic rows, and the earliest usage date
# those rows are allowed to carry.
TARGET_FEATURE = "feature_new_ai"
FEATURE_MIN_DATE = pd.Timestamp("2023-06-12")

# Number of rows to ADD on top of the existing table (not the final total).
ADD_GENERAL_ROWS = 15_000         # rows resembling the existing data
ADD_TARGET_ROWS  = 10_000         # additional rows forced to TARGET_FEATURE

# ----------------------------
# HELPERS
# ----------------------------
def _clean_feats(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Required columns
    required = ["usage_date", "feature_name", "subscription_id"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Dates
    df["usage_date"] = pd.to_datetime(df["usage_date"], errors="coerce")

    # Feature cleanup
    df["feature_name"] = (
        df["feature_name"]
        .astype("string")
        .str.strip()
        .replace({"None": pd.NA, "": pd.NA})
    )

    # Drop invalid essentials
    df = df.dropna(subset=["usage_date", "feature_name", "subscription_id"])

    # Normalize id + feature types
    df["subscription_id"] = df["subscription_id"].astype(str)
    df["feature_name"] = df["feature_name"].astype(str)

    # Safer boolean normalization (only if present)
    if "is_beta_feature" in df.columns:
        if df["is_beta_feature"].dtype == object:
            df["is_beta_feature"] = (
                df["is_beta_feature"]
                .astype("string")
                .str.strip()
                .str.lower()
                .map({"true": True, "false": False, "1": True, "0": False, "yes": True, "no": False})
                .fillna(False)
            )
        else:
            df["is_beta_feature"] = df["is_beta_feature"].astype(bool)

    # Numeric columns (coerce if they exist)
    for c in ["usage_count", "usage_duration_secs", "error_count"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

    # Helpful derived date
    df["usage_day"] = df["usage_date"].dt.normalize()

    # If synthetic flag exists, keep it; else create
    df["synthetic"] = df.get("synthetic", False)

    return df


def _ensure_target_exists(df: pd.DataFrame, feature: str) -> pd.DataFrame:
    """Guarantee that ``feature`` occurs in ``df["feature_name"]``.

    If the feature is already present, a copy of ``df`` is returned unchanged.
    Otherwise a handful of existing rows are sampled and relabelled as
    ``feature`` so that the value exists in the data the synthesizer is later
    fitted on. Seed rows get:

      * ``usage_date`` within 30 days starting at ``FEATURE_MIN_DATE``;
      * ``usage_day`` recomputed from that date;
      * ``is_beta_feature`` = False (when the column exists);
      * ``synthetic`` = True;
      * fresh ``usage_id`` / ``usage_pk`` values (when those columns exist),
        so a seed row never shares an identifier with the row it was copied
        from.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned feature-usage rows; must contain ``feature_name``.
    feature : str
        Feature name that must be present in the result.

    Returns
    -------
    pd.DataFrame
        A copy of ``df``, with the seed rows appended when they were needed.
    """
    df = df.copy()
    if df["feature_name"].eq(feature).any():
        return df
    if df.empty:
        # There are no rows to borrow values from, so nothing can be seeded.
        return df

    seed_n = max(10, int(0.002 * len(df)))
    # Sample with replacement only when the frame is smaller than seed_n;
    # sampling without replacement would raise in that case.
    seed = df.sample(seed_n, replace=seed_n > len(df), random_state=SEED).copy()
    seed["feature_name"] = feature
    seed["usage_date"] = pd.to_datetime(
        FEATURE_MIN_DATE + pd.to_timedelta(rng.integers(0, 30, size=len(seed)), unit="D")
    )
    seed["usage_day"] = seed["usage_date"].dt.normalize()
    if "is_beta_feature" in seed.columns:
        seed["is_beta_feature"] = False
    seed["synthetic"] = True

    # The sampled rows still carry the identifiers of their source rows;
    # replace them so the identifier columns stay unique after the concat.
    if "usage_id" in seed.columns:
        seed["usage_id"] = _make_ids(len(seed), "syn_usage_")
    if "usage_pk" in seed.columns:
        seed["usage_pk"] = _make_ids(len(seed), "syn_pk_")

    return pd.concat([df, seed], ignore_index=True)


def _make_ids(n: int, prefix: str) -> list[str]:
    return [f"{prefix}{uuid.uuid4().hex[:12]}" for _ in range(n)]


def _fit_sdv_for_templates(df: pd.DataFrame) -> tuple[GaussianCopulaSynthesizer, list[str]]:
    """
    Fit a GaussianCopulaSynthesizer on every column of df except the
    identifier columns "usage_id" and "usage_pk", which are regenerated
    separately rather than learned.

    Returns (fitted synthesizer, list of the columns it was fitted on).
    """
    id_columns = {"usage_id", "usage_pk"}

    template_cols = []
    for col in df.columns:
        if col not in id_columns:
            template_cols.append(col)

    training_frame = df[template_cols].copy()

    # Let SDV infer the column metadata from the training frame itself.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(training_frame)

    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(training_frame)

    return synthesizer, template_cols


def _enforce_target_min_date(df: pd.DataFrame, feature: str, min_date: pd.Timestamp) -> pd.DataFrame:
    df = df.copy()
    df["usage_date"] = pd.to_datetime(df["usage_date"], errors="coerce")

    mask = (df["feature_name"] == feature) & df["usage_date"].notna() & (df["usage_date"] < min_date)
    if mask.any():
        n = int(mask.sum())
        offsets = rng.integers(0, 91, size=n)  # within ~3 months after min_date
        df.loc[mask, "usage_date"] = min_date + pd.to_timedelta(offsets, unit="D")

    df["usage_day"] = df["usage_date"].dt.normalize()
    return df


# ============================================================
# BUILD feats (single output table)
# ============================================================

# Start from the original extracted df: clean it, then make sure the target
# feature occurs at least a few times before the synthesizer is fitted.
feats = _clean_feats(df)
feats = _ensure_target_exists(feats, TARGET_FEATURE)

# Fit SDV on current feats patterns (excluding IDs)
synth, template_cols = _fit_sdv_for_templates(feats)

# 1) Add general new rows that look like the existing data
gen_new = synth.sample(ADD_GENERAL_ROWS)
gen_new["synthetic"] = True

# 2) Add extra rows specifically for the target feature: sample ordinary rows
#    and relabel them as the target feature.
target_new = synth.sample(ADD_TARGET_ROWS)
target_new["feature_name"] = TARGET_FEATURE
target_new["synthetic"] = True

# Overwrite the sampled dates of the target rows with dates drawn uniformly
# from the 365 days starting at FEATURE_MIN_DATE, so none precede it.
target_offsets = rng.integers(0, 365, size=len(target_new))
target_new["usage_date"] = pd.to_datetime(FEATURE_MIN_DATE + pd.to_timedelta(target_offsets, unit="D"))

# Combine additions
to_add = pd.concat([gen_new, target_new], ignore_index=True)

# Align to the feats schema: same column order, and columns the synthesizer
# did not produce (e.g. the ID columns) are created as missing values.
to_add = to_add.reindex(columns=feats.columns)

# Generate fresh IDs if present in feats schema
if "usage_id" in feats.columns:
    to_add["usage_id"] = _make_ids(len(to_add), "syn_usage_")
if "usage_pk" in feats.columns:
    to_add["usage_pk"] = _make_ids(len(to_add), "syn_pk_")

# Append and enforce the target date constraint one last time; this also
# covers target-feature rows that appeared among the general samples, and
# recomputes usage_day for every row.
feats = pd.concat([feats, to_add], ignore_index=True)
feats = _enforce_target_min_date(feats, TARGET_FEATURE, FEATURE_MIN_DATE)

# Ensure types remain consistent after the concat
feats["usage_date"] = pd.to_datetime(feats["usage_date"], errors="coerce")
feats["subscription_id"] = feats["subscription_id"].astype(str)
feats["feature_name"] = feats["feature_name"].astype(str)

# Give synthetic usage IDs the same "U-" prefix as the original rows.
# Guarded like the ID generation above so a table without usage_id does not
# raise; regex=False makes the literal-prefix replacement explicit.
if "usage_id" in feats.columns:
    feats["usage_id"] = feats["usage_id"].str.replace("syn_usage_", "U-", regex=False)

# Post-condition: no dated target-feature row precedes FEATURE_MIN_DATE.
_target_dates = feats.loc[feats["feature_name"] == TARGET_FEATURE, "usage_date"].dropna()
assert (_target_dates >= FEATURE_MIN_DATE).all(), "target feature has usage dates before FEATURE_MIN_DATE"




In [40]:
# Display the combined (original + synthetic) feature-usage table.
feats

Unnamed: 0,usage_id,subscription_id,usage_date,feature_name,usage_count,usage_duration_secs,error_count,is_beta_feature,usage_day,synthetic
0,U-1c6c24,S-0fcf7d,2023-07-27,feature_20,9,5004,0,False,2023-07-27,False
1,U-f07cb8,S-c25263,2023-08-07,feature_5,9,369,0,False,2023-08-07,False
2,U-096807,S-f29e7f,2023-12-07,feature_3,9,1458,0,False,2023-12-07,False
3,U-6b1580,S-be655e,2024-07-28,feature_40,5,2085,0,False,2024-07-28,False
4,U-720a29,S-f9b1d0,2024-12-02,feature_12,12,900,0,False,2024-12-02,False
...,...,...,...,...,...,...,...,...,...,...
50045,U-87bd3c4bbc1f,S-d36b9f,2024-04-27,feature_new_ai,11,3655,0,False,2024-04-27,True
50046,U-005e9e111723,S-aaed3e,2023-11-29,feature_new_ai,11,2047,2,False,2023-11-29,True
50047,U-5f92faada6f5,S-5bc277,2024-01-27,feature_new_ai,8,781,2,False,2024-01-27,True
50048,U-5b7bde0d7d87,S-3473af,2023-10-07,feature_new_ai,8,589,0,False,2023-10-07,True


In [41]:
# Persist the augmented table.
# NOTE(review): this is the same file the notebook loads at the top, so the
# original extract is overwritten and a re-run builds on the augmented data.
# index=False keeps the positional index out of the file; otherwise every
# write/read round trip would add an extra unnamed column to the schema.
feats.to_csv(data_dir / 'ravenstack_feature_usage.csv', index=False)

In [42]:
# Rows belonging to the target feature only.
users = feats[feats['feature_name'] == TARGET_FEATURE]

# Count how many rows the target feature has. len() replaces
# users.count()[0], which indexed a label-indexed Series by position and
# triggered a pandas FutureWarning.
print(f"Row_Count: {len(users)}")

# Count how many unique subscriptions have used the feature 1+ times
print(f"Unique users: {users['subscription_id'].nunique()}")

# For each subscription, the number of distinct usage dates; then the number
# of subscriptions at each distinct-day count (displayed as the cell output).
distinct_days = users.groupby('subscription_id')['usage_date'].nunique().reset_index(name='distinct_usage_days')
distinct_days.groupby('distinct_usage_days')['subscription_id'].nunique().reset_index(name='num_users').sort_values('distinct_usage_days')

Row_Count: 10074
Unique users: 4088


  print(f"Row_Count: {users.count()[0]}")


Unnamed: 0,distinct_usage_days,num_users
0,1,1316
1,2,1128
2,3,787
3,4,452
4,5,237
...,...,...
6,7,35
7,8,18
8,9,5
9,10,3
