# 01_SETUP
-------------
Create the required volume, subfolders, pipeline, and synthetic data.

In [0]:
%pip install databricks-sdk --upgrade

In [0]:
# Restart the Python process after the %pip upgrade in the previous cell.
# NOTE(review): presumably required so the upgraded databricks-sdk is the
# version that gets imported below — confirm.
dbutils.library.restartPython()

In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import pipelines

# Workspace API client. Later cells use it to look up the current user,
# create directories inside the volume, and create/start the pipeline.
w = WorkspaceClient()

## Part 1: Setup

In [0]:
# Take the part of the signed-in user's login that precedes "@", then swap
# dots for underscores. `username` is used below as the schema name and as
# the prefix of the pipeline name.
user = w.current_user.me().user_name.partition("@")[0]
username = "_".join(user.split("."))

In [0]:
# Display the derived name so it can be checked before it is used as a schema.
username

In [0]:
%sql
-- Make `users` the current catalog; the unqualified schema and volume
-- statements in the following cells resolve against it.
USE CATALOG users;

In [0]:
# Switch to the per-user schema. The name is derived from an e-mail local
# part, which may contain characters (e.g. "-") that are not valid in a bare
# SQL identifier, so it is backtick-delimited; this is a no-op for plain names.
spark.sql(f"USE SCHEMA `{username}`")

In [0]:
%sql
-- Show the active schema to confirm the USE SCHEMA call in the previous cell took effect.
SELECT current_schema()

In [0]:
%sql
-- Create the volume that receives the generated CSV files; does nothing if it
-- already exists. The name is unqualified, so it is created in the current
-- catalog and schema selected above.
CREATE VOLUME IF NOT EXISTS customer_segmentation;

In [0]:
# Coordinates of the volume created above and its /Volumes path.
catalog = 'users'
schema = username
volume = 'customer_segmentation'
# Shape: /Volumes/<catalog>/<schema>/<volume>, e.g. /Volumes/main/default/my-volume
volume_path = "/".join(["/Volumes", catalog, schema, volume])

# One folder per generated dataset; the CSV writes further down target these.
for subfolder in ("customer_profile", "transactions", "engagement"):
    w.files.create_directory(f"{volume_path}/{subfolder}")

In [0]:
# NOTE(review): the processing notebook path is hard-coded to one user's
# workspace folder, and the budget policy id below is hard-coded as well —
# confirm both are reachable for every user who runs this notebook.
notebook_path = "/Users/landan.george@databricks.com/stuff/E2E/Customer_Segmentation/02_Process_Data"
pipeline_name = f"{username}_customer_segmentation"

# Make the cell safe to re-run: look for a pipeline that already carries this
# name before creating one. The server-side LIKE filter treats "_" as a
# single-character wildcard, so matches are narrowed to the exact name here.
existing = next(
    (
        candidate
        for candidate in w.pipelines.list_pipelines(filter=f"name LIKE '{pipeline_name}'")
        if candidate.name == pipeline_name
    ),
    None,
)

if existing is None:
    created = w.pipelines.create(
        continuous=False,
        name=pipeline_name,
        libraries=[pipelines.PipelineLibrary(notebook=pipelines.NotebookLibrary(path=notebook_path))],
        serverless=True,
        budget_policy_id="45e016d4-ace8-407d-870f-3f06123daee1",
        photon=True,
        catalog="users",
        schema=username,
        channel="CURRENT"
    )
    print(f"Created pipeline: {created.pipeline_id}")
else:
    # Only `created.pipeline_id` is read later (by the start_update cell), and
    # the listing entry exposes that attribute too.
    created = existing
    print(f"Reusing existing pipeline: {created.pipeline_id}")

## Part 2: Synthetic Data Generation
------
Tables:
* Customer Profile
* Raw Transactions
* Engagement

In [0]:
# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random

In [0]:
# Setup
fake = Faker()

# Seed every random source the generators below draw from. Faker keeps its
# own generator, separate from `random` and NumPy, so it must be seeded
# explicitly or the names, cities, coordinates and dates change on every run.
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Number of synthetic customers to generate.
NUM_CUSTOMERS = 1000

In [0]:
# 1. Customer Profile
def generate_customer_profile(n):
    """Build a DataFrame of ``n`` synthetic customer profiles.

    Relies on the module-level ``fake`` (Faker instance) and the ``random``
    module, so results depend on how those were seeded beforehand.

    Args:
        n: Number of customers to generate.

    Returns:
        pandas.DataFrame with one row per customer and the columns
        CustomerID ("C" + 1-based counter zero-padded to 4 digits),
        FirstName, LastName, Age (integer in 18..65 inclusive), Gender
        ('M'/'F'/'X' drawn with weights 40/40/10), City, EstLocation
        ("<lat> <lng>" of a US location) and SignupDate.
    """
    gender_codes = ['M', 'F', 'X']
    gender_weights = [40, 40, 10]

    records = []
    for seq in range(1, n + 1):
        # The draws are made in a fixed order so a seeded run keeps producing
        # the same sequence of values.
        coords = fake.local_latlng(country_code='US', coords_only=True)
        first_name = fake.first_name()
        last_name = fake.last_name()
        age = random.randint(18, 65)
        gender = random.choices(gender_codes, weights=gender_weights)[0]
        city = fake.city()
        latitude, longitude = float(coords[0]), float(coords[1])
        # NOTE(review): the offsets are relative ('-5y'..'-1y'), so presumably
        # resolved against the date the notebook runs — confirm if exact
        # reproducibility of SignupDate matters.
        signup_date = fake.date_between(start_date='-5y', end_date='-1y')

        records.append({
            "CustomerID": f"C{seq:04d}",
            "FirstName": first_name,
            "LastName": last_name,
            "Age": age,
            "Gender": gender,
            "City": city,
            "EstLocation": f"{latitude} {longitude}",
            "SignupDate": signup_date,
        })

    return pd.DataFrame(records)
  
# Generate the customer table; the transaction and engagement generators
# below reuse `profiles` for its CustomerID values.
profiles = generate_customer_profile(n=NUM_CUSTOMERS)

# Write it into the volume's customer_profile folder, without the pandas index.
profiles.to_csv(path_or_buf=f"{volume_path}/customer_profile/customers.csv", index=False)

In [0]:
# 2. Raw Transactions
def generate_detailed_transactions(profiles):
    """Generate synthetic transactions for every customer in ``profiles``.

    Each customer gets between 1 and 15 transactions. Draws come from the
    ``random`` module and ``np.random``, so results depend on how those were
    seeded beforehand.

    Args:
        profiles: DataFrame with a ``CustomerID`` column, one row per customer.

    Returns:
        pandas.DataFrame with one row per transaction and the columns
        TransactionID (unique within the returned frame), CustomerID,
        TransactionDate (15 to 364 days before 2025-07-01), Amount (normal
        draw with mean 60 and std 25, rounded to cents and floored at 5.0),
        Category, Channel and DiscountUsed (1 with probability 0.4, else 0).
        The columns are present even when ``profiles`` has no rows.
    """
    columns = [
        "TransactionID", "CustomerID", "TransactionDate",
        "Amount", "Category", "Channel", "DiscountUsed",
    ]
    categories = ['Fashion', 'Electronics', 'Beauty', 'Home Goods', 'Health & Wellness']
    category_weights = [0.25, 0.2, 0.2, 0.2, 0.15]
    channels = ['Web', 'Mobile App', 'In-Store']

    # Parsed once instead of on every loop iteration.
    reference_date = pd.to_datetime("2025-07-01")

    transactions = []
    seen_ids = set()

    for _, row in profiles.iterrows():
        # randint is inclusive on both ends, so every customer has >= 1 transaction.
        num_txns = random.randint(1, 15)

        for _ in range(num_txns):
            txn_date = reference_date - pd.to_timedelta(np.random.randint(15, 365), unit='D')
            amount = round(np.random.normal(60, 25), 2)
            category = random.choices(categories, weights=category_weights)[0]
            discount_used = random.random() < 0.4  # 40% chance

            # Independent draws can repeat; redraw until the id is unused.
            # When no collision occurs this consumes exactly one draw, so the
            # rest of the random stream is unaffected.
            txn_id = np.random.randint(100000000, 500000000)
            while txn_id in seen_ids:
                txn_id = np.random.randint(100000000, 500000000)
            seen_ids.add(txn_id)

            transactions.append({
                "TransactionID": txn_id,
                "CustomerID": row['CustomerID'],
                "TransactionDate": txn_date,
                "Amount": max(amount, 5.0),  # prevent negative/very low
                "Category": category,
                "Channel": random.choice(channels),
                "DiscountUsed": int(discount_used)
            })

    # Explicit columns keep the header even when there are no rows.
    return pd.DataFrame(transactions, columns=columns)
  
# Generate transactions for the customers created above.
detailed_txns = generate_detailed_transactions(profiles=profiles)

# Write them into the volume's transactions folder, without the pandas index.
detailed_txns.to_csv(path_or_buf=f"{volume_path}/transactions/raw_transactions.csv", index=False)

In [0]:
# 3. Engagement Metrics
def generate_engagement_data(profiles):
    """Generate one row of synthetic engagement metrics per customer.

    All draws come from ``np.random``, so results depend on how it was
    seeded beforehand.

    Args:
        profiles: DataFrame with a ``CustomerID`` column, one row per customer.

    Returns:
        pandas.DataFrame with the columns CustomerID, EmailOpens (Poisson,
        mean 10), WebVisits (Poisson, mean 6), MobileAppSessions (Poisson,
        mean 15), LoyaltyScore (integer in 20..99), Returns (Binomial with
        n=2, p=0.2) and DiscountUsageRate (Beta(2, 5) draw capped at 1.0 and
        rounded to 2 decimals).
    """
    rows = []
    for _, customer in profiles.iterrows():
        # Draw order matters for seeded reproducibility: loyalty, returns and
        # discount rate first, then the three Poisson counts.
        loyalty_score = np.random.randint(20, 100)
        return_count = np.random.binomial(2, 0.2)
        discount_share = round(min(1.0, np.random.beta(2, 5)), 2)
        customer_id = customer['CustomerID']
        email_opens = np.random.poisson(10)
        web_visits = np.random.poisson(6)
        app_sessions = np.random.poisson(15)

        rows.append(dict(
            CustomerID=customer_id,
            EmailOpens=email_opens,
            WebVisits=web_visits,
            MobileAppSessions=app_sessions,
            LoyaltyScore=loyalty_score,
            Returns=return_count,
            DiscountUsageRate=discount_share,
        ))

    return pd.DataFrame(rows)

# Generate engagement metrics for the customers created above.
engagement = generate_engagement_data(profiles=profiles)

# Write them into the volume's engagement folder, without the pandas index.
engagement.to_csv(path_or_buf=f"{volume_path}/engagement/metrics.csv", index=False)

In [0]:
# Start an update of the pipeline referenced by `created` (set in the
# pipeline-creation cell above), requesting a full refresh. Left as the last
# expression so the response is displayed.
w.pipelines.start_update(pipeline_id=created.pipeline_id, full_refresh=True)