In [1]:
# pip install duckdb

In [7]:
pip install torch-directml

Collecting torch-directml
  Downloading torch_directml-0.2.5.dev240914-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting torch==2.4.1 (from torch-directml)
  Downloading torch-2.4.1-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting torchvision==0.19.1 (from torch-directml)
  Downloading torchvision-0.19.1-cp312-cp312-win_amd64.whl.metadata (6.1 kB)
Downloading torch_directml-0.2.5.dev240914-cp312-cp312-win_amd64.whl (9.0 MB)
   ---------------------------------------- 0.0/9.0 MB ? eta -:--:--
   ---------------------------------------- 9.0/9.0 MB 62.1 MB/s  0:00:00
Downloading torch-2.4.1-cp312-cp312-win_amd64.whl (199.4 MB)
   ---------------------------------------- 0.0/199.4 MB ? eta -:--:--
   --- ------------------------------------ 16.0/199.4 MB 77.1 MB/s eta 0:00:03
   ------ --------------------------------- 32.8/199.4 MB 77.0 MB/s eta 0:00:03
   --------- ------------------------------ 49.5/199.4 MB 76.9 MB/s eta 0:00:02
   ------------- -------------------------- 66.

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.9.1+cpu requires torch==2.9.1, but you have torch 2.4.1 which is incompatible.


In [2]:
import duckdb 
import os
import pandas as pd
import numpy as np
from pathlib import Path


con = duckdb.connect()

base_dir = Path("C:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis\\Dataset")
data_dir = str(base_dir).replace("\\","/")

In [3]:
data_dir

'C:/Users/henry/OneDrive/Personal Career/Personal Projects/GitHub/Revenue-Sustainability-Analysis/Dataset'

In [4]:
pd.set_option('display.max_rows', None)

In [5]:
df = con.execute(f"""
    SELECT *
    FROM '{base_dir}/feature_usage.parquet'
                 """).df()

In [8]:
import torch
import torch_directml

# Check if DirectML is available
if torch_directml.is_available():
    device = torch_directml.device()
    print(f"Success! Using device: {device}")
    
    # Create a tensor on your AMD GPU
    x = torch.ones(3).to(device)
    print("Tensor on AMD GPU:", x)
else:
    print("DirectML not found. Check your installation.")

Success! Using device: privateuseone:0
Tensor on AMD GPU: tensor([1., 1., 1.], device='privateuseone:0')


In [9]:
df.dtypes

usage_id               object
subscription_id        object
usage_date             object
feature_name           object
usage_count             Int64
usage_duration_secs     Int64
error_count             Int64
is_beta_feature          bool
dtype: object

In [23]:
import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.sampling import Condition

# ----------------------------
# CONFIG
# ----------------------------
RELEASE_TS = pd.Timestamp("2025-10-01")   # change this
INTRODUCED_FEATURE = "feature_new_ai"     # change this
TOTAL_ROWS = 45000                        # >= 40000
POST_RELEASE_LIFT = 3.0                   # 3x more common post-release

df = df.copy()

# ----------------------------
# 1) Clean + types
# ----------------------------
df["usage_date"] = pd.to_datetime(df["usage_date"], errors="coerce")

# Fix your 'None' feature issue (critical)
df["feature_name"] = df["feature_name"].replace("None", pd.NA)
df = df.dropna(subset=["feature_name"])

df["usage_id"] = df["usage_id"].astype(str)
df["subscription_id"] = df["subscription_id"].astype(str)
df["feature_name"] = df["feature_name"].astype(str)
df["is_beta_feature"] = df["is_beta_feature"].astype(bool)

for c in ["usage_count", "usage_duration_secs", "error_count"]:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

# Ensure unique PK
if df["usage_id"].duplicated().any():
    df["usage_id"] = [f"usage_{i}" for i in range(len(df))]

# ----------------------------
# 2) Add release flag (lets us condition SDV)
# ----------------------------
df["post_release"] = (df["usage_date"] >= RELEASE_TS)

# Ensure introduced feature exists at least a tiny bit in training
if INTRODUCED_FEATURE not in set(df["feature_name"].unique()):
    seed_n = max(10, int(0.002 * len(df)))  # 0.2% of rows or min 10
    seed = df.sample(seed_n, random_state=42).copy()
    seed["feature_name"] = INTRODUCED_FEATURE
    seed["post_release"] = True
    seed["is_beta_feature"] = False
    df = pd.concat([df, seed], ignore_index=True)

# ----------------------------
# 3) Metadata (correct API usage!)
# ----------------------------
TABLE = "usage_events"
metadata = Metadata.detect_from_dataframe(data=df, table_name=TABLE)

metadata.update_column(
    table_name=TABLE,
    column_name="usage_id",
    sdtype="id"
)
metadata.set_primary_key(table_name=TABLE, column_name="usage_id")

metadata.update_column(
    table_name=TABLE,
    column_name="usage_date",
    sdtype="datetime"
)

metadata.save_to_json("usage_events_metadata.json")

# ----------------------------
# 4) Fit synthesizer (no torch)
# ----------------------------
synth = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True
)
synth.fit(df)

# ----------------------------
# 5) Generate baseline synthetic (>= 40k)
# ----------------------------
syn_base = synth.sample(num_rows=TOTAL_ROWS)

# ----------------------------
# 6) Oversample introduced feature AFTER release
# ----------------------------
# Estimate base counts from training post-release slice
post_df = df[df["post_release"] == True]
post_share = len(post_df) / len(df) if len(df) else 0.5
post_n = int(TOTAL_ROWS * post_share)

base_feat_share_post = (post_df["feature_name"] == INTRODUCED_FEATURE).mean() if len(post_df) else 0.0
base_feat_n_post = int(post_n * base_feat_share_post)
target_feat_n_post = int(base_feat_n_post * POST_RELEASE_LIFT)

add_n = max(0, target_feat_n_post - base_feat_n_post)

if add_n > 0:
    cond = Condition(
        column_values={"post_release": True, "feature_name": INTRODUCED_FEATURE},
        num_rows=add_n
    )
    syn_lift = synth.sample_from_conditions([cond])
    synthetic_df = pd.concat([syn_base, syn_lift], ignore_index=True)
else:
    synthetic_df = syn_base.copy()

# Keep IDs unique
synthetic_df["usage_id"] = [f"syn_usage_{i}" for i in range(len(synthetic_df))]

# ----------------------------
# 7) Validate the shift
# ----------------------------
def feat_share(data, post, feat):
    sub = data[data["post_release"] == post]
    return (sub["feature_name"] == feat).mean() if len(sub) else 0.0

print("Rows real:", len(df), "Rows synthetic:", len(synthetic_df))
print("Release:", RELEASE_TS, "Introduced feature:", INTRODUCED_FEATURE)

print(f"REAL share pre:  {feat_share(df, False, INTRODUCED_FEATURE):.4f}")
print(f"REAL share post: {feat_share(df, True,  INTRODUCED_FEATURE):.4f}")
print(f"SYN  share pre:  {feat_share(synthetic_df, False, INTRODUCED_FEATURE):.4f}")
print(f"SYN  share post: {feat_share(synthetic_df, True,  INTRODUCED_FEATURE):.4f}")

print("\nTop features POST-release (synthetic):")
print(
    synthetic_df[synthetic_df["post_release"] == True]["feature_name"]
    .value_counts(normalize=True)
    .head(10)
)




Rows real: 22560 Rows synthetic: 45000
Release: 2025-10-01 00:00:00 Introduced feature: feature_new_ai
REAL share pre:  0.0020
REAL share post: 0.0000
SYN  share pre:  0.0021
SYN  share post: 0.0000

Top features POST-release (synthetic):
Series([], Name: proportion, dtype: float64)


In [24]:
len(synthetic_df)

45000

In [25]:
len(df)

22560