# Setup and Configuration for Streaming Feature Engineering Pipeline

This notebook handles the initial setup and configuration for the streaming feature engineering pipeline with Lakebase. It installs required packages, configures the environment, and validates the setup.

## Setup Tasks
1. **Package Installation**: Install required Python packages
2. **Environment Configuration**: Set up Spark configurations for streaming and Delta Lake
3. **Database Setup**: Connect to Lakebase PostgreSQL and create the `transaction_features` feature table
4. **Validation**: Test all components and connections
5. **Sample Data**: Create initial sample datasets

## Prerequisites
- Databricks Runtime 13.0+ with ML
- Access to Delta Lake / Lakebase storage
- Optional: Kafka/Event Hub for streaming sources

## Post-Setup
After running this notebook, you can proceed with:
- `01_fraud_detection_streaming.ipynb` - Streaming feature engineering pipeline
- `02_feature_engineering.ipynb` - Feature engineering examples
- `04_data_generator.ipynb` - Data generation for testing


In [None]:
# MAGIC %md
# MAGIC ## Package Installation
# MAGIC
# MAGIC **Pre-installed in Databricks Runtime 13.0+:**
# MAGIC - ✅ Apache Spark
# MAGIC - ✅ Delta Lake
# MAGIC - ✅ MLflow
# MAGIC - ✅ Pandas, NumPy, Scikit-learn
# MAGIC - ✅ Matplotlib, Seaborn, Plotly
# MAGIC
# MAGIC **Install only if needed:**

# Install optional packages for your data source
# Uncomment the line for your data source:

# For Kafka:
# %pip install kafka-python

# For Azure Event Hub:
# %pip install azure-eventhub

# For configuration management:
# %pip install pyyaml

# Restart Python to ensure packages are loaded (only if you installed something)
# dbutils.library.restartPython()


In [None]:
# MAGIC %md
# MAGIC ## Environment Configuration

# Import required libraries
# Note: 'spark' session is already available in Databricks
from pyspark.sql import Window
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
import logging
from datetime import datetime

# Spark settings for the streaming pipeline, applied in the order listed.
# NOTE(review): the last entry turns off Spark's stateful-operator correctness
# check — presumably intentional for this pipeline; confirm before production use.
STREAMING_SPARK_CONF = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.databricks.delta.optimizeWrite.enabled": "true",
    "spark.databricks.delta.autoCompact.enabled": "true",
    "spark.sql.streaming.stateStore.maintenanceInterval": "600s",
    "spark.sql.streaming.statefulOperator.checkCorrectness.enabled": "false",
}
for conf_key, conf_value in STREAMING_SPARK_CONF.items():
    spark.conf.set(conf_key, conf_value)

# Root logging at INFO, plus a logger named after this notebook module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Report the effective environment back to the reader
runtime_version = spark.conf.get('spark.databricks.clusterUsageTags.sparkVersion', 'Unknown')
optimize_write_flag = spark.conf.get('spark.databricks.delta.optimizeWrite.enabled')
for summary_line in (
    "🔧 Environment Configuration:",
    f"  Spark Version: {spark.version}",
    f"  Databricks Runtime: {runtime_version}",
    f"  Delta Lake Enabled: {optimize_write_flag}",
    "  ✅ Streaming optimizations applied",
    "  ✅ Delta Lake optimizations enabled",
    "  ✅ Stateful streaming configured",
):
    print(summary_line)


In [None]:
# MAGIC %md
# MAGIC ## Lakebase PostgreSQL Setup

# Install psycopg2 for PostgreSQL connection
%pip install psycopg2-binary

# Import Lakebase client
from lakebase_client import LakebaseClient

# Get OAuth token for authentication
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# OR use secrets (recommended for production)
# token = dbutils.secrets.get(scope="lakebase", key="token")
# host = dbutils.secrets.get(scope="lakebase", key="host")

# Lakebase connection configuration
# TODO: Update with your actual Lakebase host
LAKEBASE_CONFIG = {
    "host": "your-workspace.cloud.databricks.com",
    "port": 5432,
    "database": "feature_store",
    "user": "token",
    "password": token
}

print("📊 Connecting to Lakebase PostgreSQL...\n")

# Initialize Lakebase client
lakebase = LakebaseClient(**LAKEBASE_CONFIG)

# Test connection
print("🔍 Testing Lakebase connection...")
if lakebase.test_connection():
    print("✅ Successfully connected to Lakebase PostgreSQL!")
    print(f"   Host: {LAKEBASE_CONFIG['host']}")
    print(f"   Port: {LAKEBASE_CONFIG['port']}")
    print(f"   Database: {LAKEBASE_CONFIG['database']}")
else:
    print("❌ Failed to connect to Lakebase")
    print("   Please check:")
    print("   1. Lakebase instance is provisioned")
    print("   2. Host is correct")
    print("   3. OAuth token is valid")
    raise Exception("Lakebase connection failed")

# Create feature table
print("\n📋 Creating transaction_features table in Lakebase...")
lakebase.create_feature_table("transaction_features")
print("✅ Table created successfully!")

# Verify table exists
print("\n📊 Verifying table...")
try:
    stats = lakebase.get_table_stats("transaction_features")
    print(f"   Total rows: {stats['total_rows']:,}")
    print(f"   Unique users: {stats['unique_users']:,}")
    print(f"   Unique merchants: {stats['unique_merchants']:,}")
except Exception as e:
    print("   Table exists but is empty (just created)")

print("\n" + "="*60)
print("✅ LAKEBASE POSTGRESQL SETUP COMPLETE")
print("="*60)
print("\n💡 Lakebase Info:")
print("   Type: PostgreSQL-compatible OLTP database")
print("   Port: 5432")
print("   Features: ACID transactions, <10ms query latency")
print("   Use case: Real-time feature serving")
print("\n📝 Table Schema:")
print("   transaction_features:")
print("     - transaction_id (PRIMARY KEY)")
print("     - timestamp")
print("     - user_id, merchant_id, amount")
print("     - Time-based features (hour, day_of_week, etc.)")
print("     - Amount features (amount_log, amount_sqrt)")
print("     - Velocity features (user_txn_count_1h, etc.)")


In [None]:
# MAGIC %md
# MAGIC ## Setup Validation and Testing

# Test basic functionality
print("🧪 Validating setup...\n")

# Names of checks that raised but were allowed to continue; used at the end so
# the final banner does not claim success when a check actually failed.
validation_warnings = []

# 1. Test Spark functionality
test_df = spark.range(10).toDF("id")
test_count = test_df.count()
print(f"✅ Spark test: Created DataFrame with {test_count} records")

# 2. Test Delta Lake functionality (round-trip through a scratch Delta path)
test_df.write.format("delta").mode("overwrite").save("/tmp/delta_test")
delta_test_df = spark.read.format("delta").load("/tmp/delta_test")
print(f"✅ Delta Lake test: Read {delta_test_df.count()} records from Delta table")

# 3. Test streaming capability (only builds the streaming DataFrame; no query is started)
try:
    spark.readStream.format("rate").load()
    print("✅ Streaming test: Successfully created streaming DataFrame")
except Exception as e:
    validation_warnings.append("streaming")
    print(f"⚠️  Streaming test warning: {e}")

# 4. Test feature engineering functions
test_data = [(1, 100.0, "2025-10-03 14:30:00")]
test_schema = StructType([
    StructField("id", IntegerType()),
    StructField("amount", DoubleType()),
    StructField("timestamp", StringType())
])
test_feature_df = spark.createDataFrame(test_data, test_schema)
test_feature_df = test_feature_df.withColumn("timestamp", to_timestamp("timestamp"))
test_feature_df = test_feature_df \
    .withColumn("amount_log", log1p("amount")) \
    .withColumn("hour", hour("timestamp")) \
    .withColumn("day_of_week", dayofweek("timestamp")) \
    .withColumn("is_weekend", when(dayofweek("timestamp").isin([1, 7]), 1).otherwise(0))
print("✅ Feature engineering test: Successfully applied transformations")
print(f"   Generated features: {', '.join([c for c in test_feature_df.columns if c not in ['id', 'amount', 'timestamp']])}")

# 5. Test window functions (for velocity features)
try:
    # Range window over the timestamp cast to long, spanning 3600 units back to the current row
    window_spec = Window.partitionBy("id").orderBy(col("timestamp").cast("long")).rangeBetween(-3600, 0)
    test_window_df = test_feature_df.withColumn("count_1h", count("*").over(window_spec))
    # Spark is lazy: collect the single test row so the windowed plan (and the
    # feature transformations above it) is actually executed, not just built.
    test_window_df.collect()
    print("✅ Window functions test: Successfully applied windowed aggregations")
except Exception as e:
    validation_warnings.append("window functions")
    print(f"⚠️  Window functions test warning: {e}")

print("\n" + "="*60)
if validation_warnings:
    # At least one best-effort check failed; do not report an all-clear
    print("⚠️  SETUP VALIDATION COMPLETED WITH WARNINGS")
    print("="*60)
    print(f"⚠️  Checks with warnings: {', '.join(validation_warnings)}")
    print("⚠️  Review the warnings above before running the streaming pipeline")
else:
    print("🎉 SETUP VALIDATION COMPLETE")
    print("="*60)
    print("✅ All components validated successfully!")
    print("✅ Ready to run streaming feature engineering pipeline!")
print("\n📝 Next steps:")
print("1. Lakebase PostgreSQL is ready at port 5432")
print("2. Generate streaming data: data_generator.py")
print("3. Run streaming demo: 01_streaming_lakebase_demo.ipynb")
print("4. Features will be written to Lakebase PostgreSQL (<100ms latency)")
print("\n💡 Tip: Check the console output in the streaming notebook for real-time feature monitoring")
