In [0]:
# Reset the gold schema: drop every persisted table/view so the layer can be rebuilt.
# NOTE(review): the schema name is hard-coded here, while the builders below take it
# from `config` -- keep the two in sync.
tables = spark.catalog.listTables("eldenringcatalog.gold")

for t in tables:
    # listTables also reports session-scoped temporary views; they do not live in
    # the schema, so there is nothing to drop for them.
    if t.isTemporary:
        continue

    # Back-tick quote the object name so unusual characters cannot break the statement.
    quoted_name = t.name.replace("`", "``")
    table_name = f"eldenringcatalog.gold.`{quoted_name}`"

    # Views must be removed with DROP VIEW; DROP TABLE is rejected for them.
    object_kind = "VIEW" if t.tableType == "VIEW" else "TABLE"

    print(f"Dropping {object_kind.lower()}: {table_name}")
    spark.sql(f"DROP {object_kind} IF EXISTS {table_name}")

In [0]:
from pyspark.sql.functions import (
    col, lit, current_timestamp, when, coalesce, explode, split,
    from_json, regexp_extract, regexp_replace, trim, upper, lower,
    udf, struct, array, size, concat_ws, substring, expr,
    sum as spark_sum, count as spark_count, avg, max, min,
    monotonically_increasing_id, row_number, rank, dense_rank,
    round as spark_round, abs as spark_abs, sqrt, pow,
    first, last, collect_list, collect_set,
    year, month, dayofmonth, date_format, to_date, datediff
)
from pyspark.sql import DataFrame
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, 
    DoubleType, BooleanType, ArrayType, MapType, DateType
)
from pyspark.sql.window import Window
import logging
from datetime import datetime, timedelta

# Notebook-wide logging: INFO level, "timestamp - level - message" layout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger shared by every builder function in this notebook.
logger = logging.getLogger(__name__)

## **DIMENSION TABLES**

In [0]:
def create_dim_weapons(config: dict) -> DataFrame:
    """
    Build the weapon dimension with its base attributes.

    Source: silver.weapons
    Grain: One row per weapon
    Type: SCD Type 1

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated weapon_key, the descriptive weapon columns,
        integer stat requirements, the passive effect and load timestamps.
    """
    logger.info("📊 Creating dim_weapons...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.weapons"
    weapons = spark.table(source_table)

    # Silver requirement column -> Gold column name (values are cast to int).
    requirement_names = {
        "required_str": "required_strength",
        "required_dex": "required_dexterity",
        "required_int": "required_intelligence",
        "required_fai": "required_faith",
        "required_arc": "required_arcane",
    }
    requirement_cols = [
        col(silver_name).cast("int").alias(gold_name)
        for silver_name, gold_name in requirement_names.items()
    ]

    # Columns copied through unchanged, in output order.
    descriptive_cols = [
        col(name)
        for name in ("weapon_id", "weapon_name", "category", "damage_type", "weight", "is_dlc")
    ]

    load_ts = current_timestamp()

    return weapons.select(
        monotonically_increasing_id().alias("weapon_key"),
        *descriptive_cols,
        *requirement_cols,
        col("passive_effect"),
        load_ts.alias("created_at"),
        load_ts.alias("updated_at"),
    )

In [0]:
def create_dim_shields(config: dict) -> DataFrame:
    """
    Build the shield dimension.

    Source: silver.shields
    Grain: One row per shield

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated shield_key, descriptive shield columns,
        integer stat requirements and load timestamps.
    """
    logger.info("📊 Creating dim_shields...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.shields"
    shields = spark.table(source_table)

    # (Silver column, Gold column) pairs for the stat requirements, cast to int.
    requirement_pairs = (
        ("required_str", "required_strength"),
        ("required_dex", "required_dexterity"),
        ("required_int", "required_intelligence"),
        ("required_fai", "required_faith"),
    )

    selected = [monotonically_increasing_id().alias("shield_key")]
    selected += [
        col(name)
        for name in ("shield_id", "shield_name", "category", "weight", "is_dlc")
    ]
    selected += [
        col(silver_name).cast("int").alias(gold_name)
        for silver_name, gold_name in requirement_pairs
    ]

    load_ts = current_timestamp()
    selected += [load_ts.alias("created_at"), load_ts.alias("updated_at")]

    return shields.select(*selected)


In [0]:
def create_dim_armors(config: dict) -> DataFrame:
    """
    Build the armor dimension.

    Source: silver.armors
    Grain: One row per armor piece

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated armor_key, the armor identifiers, the Silver
        `category` exposed as `armor_type`, weight, DLC flag and load timestamps.
    """
    logger.info("📊 Creating dim_armors...")

    armors = spark.table(
        "{catalog}.{schema}.armors".format(
            catalog=config['catalog'],
            schema=config['silver_schema'],
        )
    )

    surrogate_key = monotonically_increasing_id().alias("armor_key")
    # Head, Chest, Gauntlets, Legs
    armor_type = col("category").alias("armor_type")
    load_ts = current_timestamp()

    return armors.select(
        surrogate_key,
        col("armor_id"),
        col("armor_name"),
        armor_type,
        col("weight"),
        col("is_dlc"),
        load_ts.alias("created_at"),
        load_ts.alias("updated_at"),
    )

In [0]:
def create_dim_items(config: dict) -> DataFrame:
    """
    Build the unified item dimension.

    Source: silver.items_unified (consolidated from 13 item types)
    Grain: One row per item

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated item_key, the item identifiers, its type,
        description, effect and load timestamps.
    """
    logger.info("📊 Creating dim_items...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.items_unified"
    items = spark.table(source_table)

    # Pass-through columns; item_type holds values such as ammo, bell,
    # consumable, cookbook, etc.
    passthrough = ("item_id", "item_name", "item_type", "description", "effect")

    load_ts = current_timestamp()

    output_columns = [monotonically_increasing_id().alias("item_key")]
    output_columns.extend(col(name) for name in passthrough)
    output_columns.append(load_ts.alias("created_at"))
    output_columns.append(load_ts.alias("updated_at"))

    return items.select(*output_columns)


In [0]:
def create_dim_locations(config: dict) -> DataFrame:
    """
    Build the location dimension.

    Source: silver.locations
    Grain: One row per location

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated location_key, the location identifiers,
        region, description, DLC flag and load timestamps.
    """
    logger.info("📊 Creating dim_locations...")

    silver_namespace = f"{config['catalog']}.{config['silver_schema']}"
    locations = spark.table(f"{silver_namespace}.locations")

    load_ts = current_timestamp()
    attribute_cols = [
        col(name)
        for name in ("location_id", "location_name", "region", "description", "is_dlc")
    ]

    return locations.select(
        monotonically_increasing_id().alias("location_key"),
        *attribute_cols,
        load_ts.alias("created_at"),
        load_ts.alias("updated_at"),
    )

In [0]:
def create_dim_npcs(config: dict) -> DataFrame:
    """
    Build the NPC dimension.

    Source: silver.npcs
    Grain: One row per NPC

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated npc_key, the NPC identifiers, role, the
        Silver `voiced_by` column exposed as `voice_actor`, DLC flag and load
        timestamps.
    """
    logger.info("📊 Creating dim_npcs...")

    silver_namespace = f"{config['catalog']}.{config['silver_schema']}"
    npcs = spark.table(f"{silver_namespace}.npcs")

    load_ts = current_timestamp()

    output_columns = [
        monotonically_increasing_id().alias("npc_key"),
        col("npc_id"),
        col("npc_name"),
        col("role"),
        col("voiced_by").alias("voice_actor"),
        col("is_dlc"),
        load_ts.alias("created_at"),
        load_ts.alias("updated_at"),
    ]

    return npcs.select(*output_columns)

In [0]:
def create_dim_bosses(config: dict) -> DataFrame:
    """
    Build the boss dimension.

    Source: silver.bosses
    Grain: One row per boss

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated boss_key, boss_id, boss_name, DLC flag and
        load timestamps.
    """
    logger.info("📊 Creating dim_bosses...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.bosses"
    bosses = spark.table(source_table)

    load_ts = current_timestamp()

    return bosses.select(
        monotonically_increasing_id().alias("boss_key"),
        *[col(name) for name in ("boss_id", "boss_name", "is_dlc")],
        load_ts.alias("created_at"),
        load_ts.alias("updated_at"),
    )

In [0]:
def create_dim_date(config: dict) -> DataFrame:
    """
    Create date dimension for time-series analysis

    Grain: One row per date
    Range: 2022-02-25 (Elden Ring release) to 2026-12-31, both inclusive

    The calendar is generated inside Spark as DateType values. Building it from
    naive Python datetimes would create timestamps in the driver's local time
    zone and truncate them in the session time zone, which can shift every date
    by one day when the two zones differ.

    Args:
        config: Pipeline configuration; accepted for signature parity with the
            other builders and not read here.

    Returns:
        DataFrame with date_key, year, month, day, day_of_week (abbreviated
        name), month_name, quarter, season and created_at.
    """
    logger.info("📊 Creating dim_date...")

    start_date = "2022-02-25"  # Elden Ring release date
    end_date = "2026-12-31"

    # sequence() includes both end points; explode() turns the array into rows.
    df_dates = spark.range(1).select(
        explode(
            expr(
                f"sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)"
            )
        ).alias("date")
    )

    calendar_day = col("date")
    month_number = month(calendar_day)

    df_dim = df_dates.select(
        to_date(calendar_day).alias("date_key"),
        year(calendar_day).alias("year"),
        month_number.alias("month"),
        dayofmonth(calendar_day).alias("day"),
        date_format(calendar_day, "E").alias("day_of_week"),
        date_format(calendar_day, "MMMM").alias("month_name"),
        date_format(calendar_day, "Q").cast("int").alias("quarter"),
        # Meteorological seasons, northern-hemisphere convention.
        when(month_number.isin([12, 1, 2]), "Winter")
            .when(month_number.isin([3, 4, 5]), "Spring")
            .when(month_number.isin([6, 7, 8]), "Summer")
            .otherwise("Fall").alias("season"),

        current_timestamp().alias("created_at")
    )

    return df_dim

In [0]:
def create_fact_weapon_stats(config: dict) -> DataFrame:
    """
    Build the weapon performance fact table.

    Source: silver.weapons_upgrades (60K rows), left-joined to silver.weapons
            on weapon_name for category, weight and stat requirements
    Grain: One row per weapon per upgrade level
    Metrics: Attack power, scaling, status buildup

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with the per-upgrade attack, scaling, guard and passive
        columns, the base weapon weight, plus the derived total_attack_power,
        damage_per_stat_point and damage_per_weight. The requirement columns
        are used for the derived metrics and then dropped.
    """
    logger.info("📈 Creating fact_weapon_stats...")

    silver_namespace = f"{config['catalog']}.{config['silver_schema']}"
    upgrades = spark.table(f"{silver_namespace}.weapons_upgrades")

    # Base weapon attributes needed for the lookup.
    requirement_names = [
        "required_str", "required_dex", "required_int", "required_fai", "required_arc"
    ]
    base_weapons = spark.table(f"{silver_namespace}.weapons").select(
        *[col(name) for name in ["weapon_name", "category", "weight"] + requirement_names]
    )

    joined = upgrades.join(
        base_weapons,
        upgrades["weapon_name"] == base_weapons["weapon_name"],
        "left"
    )

    def or_default(column_name, default):
        """Column value with NULL replaced by `default`."""
        return coalesce(col(column_name), lit(default))

    # Damage components that add up to the total attack power.
    damage_names = [
        "attack_physical", "attack_magic", "attack_fire",
        "attack_lightning", "attack_holy",
    ]
    total_attack = or_default(damage_names[0], 0.0)
    for damage_name in damage_names[1:]:
        total_attack = total_attack + or_default(damage_name, 0.0)

    # Attack stats (parsed in Silver)
    attack_cols = [
        col(name).cast("double")
        for name in damage_names + ["stamina_cost", "critical_damage"]
    ]

    # Scaling stats: Silver column -> Gold column
    scaling_names = {
        "scaling_str": "scaling_strength",
        "scaling_dex": "scaling_dexterity",
        "scaling_int": "scaling_intelligence",
        "scaling_fai": "scaling_faith",
        "scaling_arc": "scaling_arcane",
    }
    scaling_cols = [col(src).alias(dst) for src, dst in scaling_names.items()]

    # Guard stats
    guard_cols = [
        col(name).cast("double")
        for name in (
            "guard_physical", "guard_magic", "guard_fire", "guard_lightning",
            "guard_holy", "guard_boost", "guard_resistance",
        )
    ]

    # Passive effects (dual buildup support)
    passive_cols = [
        col("passive_effect_type"),
        col("passive_buildup_primary").cast("int"),
        col("passive_buildup_secondary").cast("int"),
        col("has_dual_buildup").cast("boolean"),
        col("has_passive_effect").cast("boolean"),
    ]

    # Weight and requirements come from the base weapon side of the join; they
    # must be selected so the derived columns below can reference them.
    lookup_cols = [base_weapons[name] for name in ["weight"] + requirement_names]

    selected = joined.select(
        monotonically_increasing_id().alias("weapon_stat_key"),
        # Qualified references avoid ambiguity between the two join sides.
        upgrades["weapon_name"],
        base_weapons["category"],
        col("upgrade_level"),
        *attack_cols,
        total_attack.alias("total_attack_power"),
        *scaling_cols,
        *guard_cols,
        *passive_cols,
        *lookup_cols,
        current_timestamp().alias("created_at"),
    )

    physical_attack = or_default("attack_physical", 0.0)

    # Physical attack per point of required STR + DEX; 0.0 when nothing is required.
    required_total = or_default("required_str", 0) + or_default("required_dex", 0)
    damage_per_stat_point = when(
        required_total > 0,
        physical_attack / required_total
    ).otherwise(lit(0.0))

    # Physical attack per unit of weight; 0.0 when weight is missing or not positive.
    damage_per_weight = when(
        or_default("weight", 0.0) > 0,
        physical_attack / or_default("weight", 1.0)
    ).otherwise(lit(0.0))

    return (
        selected
        .withColumn("damage_per_stat_point", damage_per_stat_point)
        .withColumn("damage_per_weight", damage_per_weight)
        # Requirement columns are only needed for the calculation above.
        .drop(*requirement_names)
    )

In [0]:
def create_fact_shield_stats(config: dict) -> DataFrame:
    """
    Build the shield effectiveness fact table.

    Source: silver.shields_upgrades (28K rows), left-joined to silver.shields
            on shield name for the base weight
    Grain: One row per shield per upgrade level

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with attack, damage-negation, scaling and passive columns,
        the base shield weight, and the derived total_negation and
        protection_per_weight.
    """
    logger.info("📈 Creating fact_shield_stats...")

    silver_namespace = f"{config['catalog']}.{config['silver_schema']}"
    upgrades = spark.table(f"{silver_namespace}.shields_upgrades")

    # The name column is renamed on the lookup side so the join key is unambiguous.
    base_shields = spark.table(f"{silver_namespace}.shields").select(
        col("shield_name").alias("base_shield_name"),
        col("weight")
    )

    joined = upgrades.join(
        base_shields,
        upgrades["shield_name"] == base_shields["base_shield_name"],
        "left"
    )

    # Guard stats (damage negation): Silver column -> Gold column
    guard_renames = {
        "guard_physical": "negation_physical",
        "guard_magic": "negation_magic",
        "guard_fire": "negation_fire",
        "guard_lightning": "negation_lightning",
        "guard_holy": "negation_holy",
        "guard_boost": "boost",
    }
    guard_cols = [
        col(src).cast("double").alias(dst) for src, dst in guard_renames.items()
    ]

    selected = joined.select(
        monotonically_increasing_id().alias("shield_stat_key"),
        upgrades["shield_name"],
        col("upgrade_level"),
        # Attack stats (shield bash damage)
        col("attack_physical").cast("double"),
        col("stamina_cost").cast("double"),
        *guard_cols,
        col("guard_resistance").cast("double"),
        # Scaling
        col("scaling_str").alias("scaling_strength"),
        col("scaling_dex").alias("scaling_dexterity"),
        # Passive effects
        col("passive_effect_type"),
        col("passive_buildup_primary").cast("int"),
        col("passive_buildup_secondary").cast("int"),
        col("has_dual_buildup").cast("boolean"),
        col("has_passive_effect").cast("boolean"),
        # Weight from the base shields table
        base_shields["weight"],
        current_timestamp().alias("created_at"),
    )

    def or_default(column_name, default):
        """Column value with NULL replaced by `default`."""
        return coalesce(col(column_name), lit(default))

    # Sum of the five negation values, NULLs counted as 0.
    negation_names = [
        "negation_physical", "negation_magic", "negation_fire",
        "negation_lightning", "negation_holy",
    ]
    total_negation = or_default(negation_names[0], 0.0)
    for negation_name in negation_names[1:]:
        total_negation = total_negation + or_default(negation_name, 0.0)

    # Physical negation per unit of weight; 0.0 when weight is missing or not positive.
    protection_per_weight = when(
        or_default("weight", 0.0) > 0,
        or_default("negation_physical", 0.0) / or_default("weight", 1.0)
    ).otherwise(lit(0.0))

    return (
        selected
        .withColumn("total_negation", total_negation)
        .withColumn("protection_per_weight", protection_per_weight)
    )

In [0]:
def create_fact_armor_stats(config: dict) -> DataFrame:
    """
    Create armor protection fact table

    Source: silver.armors (723 rows)
    Grain: One row per armor piece

    The per-weight metrics are guarded the same way as in the weapon and shield
    fact tables: they are 0.0 when the weight is NULL or not positive. Dividing
    unguarded yields NULL for zero weight (or an error when ANSI mode is on) and
    silently treats an unknown weight as 1.

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with armor_name, armor_type, weight, the damage-negation and
        resistance columns as doubles, protection_per_weight, poise_per_weight
        and created_at.
    """
    logger.info("📈 Creating fact_armor_stats...")

    df = spark.table(f"{config['catalog']}.{config['silver_schema']}.armors")

    weight = col("weight").cast("double")

    def per_weight(source_column: str):
        """Ratio of `source_column` to weight, 0.0 when weight is unusable."""
        return when(
            coalesce(weight, lit(0.0)) > 0,
            coalesce(col(source_column).cast("double"), lit(0.0)) / weight
        ).otherwise(lit(0.0))

    df_fact = df.select(
        monotonically_increasing_id().alias("armor_stat_key"),

        col("armor_name"),
        col("category").alias("armor_type"),
        col("weight"),

        # Damage negation (parsed in Silver)
        col("dmg_negation_physical").cast("double").alias("negation_physical"),
        col("dmg_negation_vs_strike").cast("double").alias("negation_strike"),
        col("dmg_negation_vs_slash").cast("double").alias("negation_slash"),
        col("dmg_negation_vs_pierce").cast("double").alias("negation_pierce"),
        col("dmg_negation_magic").cast("double").alias("negation_magic"),
        col("dmg_negation_fire").cast("double").alias("negation_fire"),
        col("dmg_negation_lightning").cast("double").alias("negation_lightning"),
        col("dmg_negation_holy").cast("double").alias("negation_holy"),

        # Resistance stats
        col("resistance_immunity").cast("double").alias("resistance_immunity"),
        col("resistance_robustness").cast("double").alias("resistance_robustness"),
        col("resistance_focus").cast("double").alias("resistance_focus"),
        col("resistance_vitality").cast("double").alias("resistance_vitality"),
        col("resistance_poise").cast("double").alias("resistance_poise"),

        # Calculated metrics (safe division)
        per_weight("dmg_negation_physical").alias("protection_per_weight"),
        per_weight("resistance_poise").alias("poise_per_weight"),

        current_timestamp().alias("created_at")
    )

    return df_fact

In [0]:
def create_fact_boss_encounters(config: dict) -> DataFrame:
    """
    Build the boss encounter fact table.

    Source: silver.bosses (155 rows)
    Grain: One row per boss
    Includes: HP ranges, phases, classifications

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with boss_name, integer HP bounds, hp_type, phase_count,
        boss_classification, a difficulty_score derived from the
        classification, is_dlc as int and created_at.
    """
    logger.info("📈 Creating fact_boss_encounters...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.bosses"
    bosses = spark.table(source_table)

    # Simplified difficulty score per classification, checked in this order;
    # any other classification scores 20.
    score_by_classification = [
        ("God-Tier", 100),
        ("Multi-Phase", 80),
        ("High-HP", 60),
        ("Standard", 40),
    ]
    difficulty = None
    for classification, score in score_by_classification:
        matches = col("boss_classification") == classification
        if difficulty is None:
            difficulty = when(matches, lit(score))
        else:
            difficulty = difficulty.when(matches, lit(score))
    difficulty = difficulty.otherwise(lit(20))

    output_columns = [
        monotonically_increasing_id().alias("boss_encounter_key"),
        col("boss_name"),
        # HP metrics (parsed in Silver with parse_boss_hp UDF)
        col("hp_min").cast("int"),
        col("hp_max").cast("int"),
        col("hp_type"),  # single, phased, approximate, tbd
        col("phase_count").cast("int"),
        col("boss_classification"),  # Single-Phase, Multi-Phase, God-Tier, etc.
        difficulty.alias("difficulty_score"),
        col("is_dlc").cast("int"),
        current_timestamp().alias("created_at"),
    ]

    return bosses.select(*output_columns)

In [0]:
def create_fact_spell_usage(config: dict) -> DataFrame:
    """
    Create spell usage fact table

    Sources: silver.sorceries (86), silver.incantations (131)
    Grain: One row per spell
    Metrics: FP costs (min/max/per_use structure), requirements

    Both sources are projected through one shared helper so their column
    order cannot drift apart before the positional union.

    `fp_efficiency` is the mean of the minimum and maximum FP cost. When only
    one of the two bounds is populated it is used for both, so the mean equals
    that bound instead of half of it; when both are NULL the result is 0.

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with spell_usage_key, the unified spell columns,
        fp_efficiency, total_stat_requirement and created_at.
    """
    logger.info("📈 Creating fact_spell_usage...")

    silver_namespace = f"{config['catalog']}.{config['silver_schema']}"

    def spell_rows(table_name: str, spell_type: str, name_column: str) -> DataFrame:
        """Project one Silver spell table onto the common spell schema."""
        return spark.table(f"{silver_namespace}.{table_name}").select(
            lit(spell_type).alias("spell_type"),
            col(name_column).alias("spell_name"),
            col("required_int").cast("int").alias("required_intelligence"),
            col("required_fai").cast("int").alias("required_faith"),
            col("required_arc").cast("int").alias("required_arcane"),
            col("memory_slots").cast("int").alias("slots_required"),
            col("stamina_cost").cast("double"),
            col("fp_cost_min").cast("int"),
            col("fp_cost_max").cast("int"),
            col("fp_cost_per_use").cast("int"),
            col("fp_cost_type"),
            col("bonus"),
            col("is_dlc").cast("int").alias("is_dlc")
        )

    df_sorc = spell_rows("sorceries", "Sorcery", "sorcery_name")
    df_incan = spell_rows("incantations", "Incantation", "incantation_name")

    # Each bound falls back to the other one before defaulting to 0.
    fp_low = coalesce(col("fp_cost_min"), col("fp_cost_max"), lit(0))
    fp_high = coalesce(col("fp_cost_max"), col("fp_cost_min"), lit(0))

    df_fact = df_sorc.union(df_incan).select(
        monotonically_increasing_id().alias("spell_usage_key"),
        col("*"),
        ((fp_low + fp_high) / 2.0).alias("fp_efficiency"),
        (coalesce(col("required_intelligence"), lit(0)) +
         coalesce(col("required_faith"), lit(0)) +
         coalesce(col("required_arcane"), lit(0))).alias("total_stat_requirement"),
        current_timestamp().alias("created_at")
    )

    return df_fact

In [0]:
def create_fact_status_effects(config: dict) -> DataFrame:
    """
    Build the status effects fact table.

    Source: silver.weapons_upgrades (passive effects with dual buildup)
    Grain: One row per weapon per upgrade level with status effect
    Focus: Buildup efficiency analysis

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame limited to rows whose passive_effect_type is not NULL, with
        the effect type, primary/secondary buildup, their NULL-safe sum as
        total_buildup, and created_at.
    """
    logger.info("📈 Creating fact_status_effects...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.weapons_upgrades"
    upgrades = spark.table(source_table)

    # Only weapons that carry a passive effect are relevant here.
    with_effect = upgrades.filter(col("passive_effect_type").isNotNull())

    primary = col("passive_buildup_primary")
    secondary = col("passive_buildup_secondary")
    total_buildup = coalesce(primary, lit(0)) + coalesce(secondary, lit(0))

    return with_effect.select(
        monotonically_increasing_id().alias("status_effect_key"),
        col("weapon_name"),
        col("upgrade_level"),
        col("passive_effect_type").alias("effect_type"),  # poison, frostbite, bleed, etc.
        primary.cast("int").alias("buildup_primary"),
        secondary.cast("int").alias("buildup_secondary"),
        col("has_dual_buildup").cast("boolean").alias("has_dual_buildup"),
        total_buildup.alias("total_buildup"),
        current_timestamp().alias("created_at"),
    )

In [0]:
def create_bridge_location_items(config: dict) -> DataFrame:
    """
    Bridge: Locations ↔ Items
    Source: silver.location_items (2,937 relationships)

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated location_item_key, location_id, item_name
        and created_at.
    """
    logger.info("🔗 Creating bridge_location_items...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.location_items"
    relationships = spark.table(source_table)

    output_columns = [
        monotonically_increasing_id().alias("location_item_key"),
        col("location_id"),
        col("item_name"),
        current_timestamp().alias("created_at"),
    ]
    return relationships.select(*output_columns)

def create_bridge_location_npcs(config: dict) -> DataFrame:
    """
    Bridge: Locations ↔ NPCs
    Source: silver.location_npcs (215 relationships)

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated location_npc_key, location_id, npc_name
        and created_at.
    """
    logger.info("🔗 Creating bridge_location_npcs...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.location_npcs"
    relationships = spark.table(source_table)

    output_columns = [
        monotonically_increasing_id().alias("location_npc_key"),
        col("location_id"),
        col("npc_name"),
        current_timestamp().alias("created_at"),
    ]
    return relationships.select(*output_columns)

def create_bridge_location_creatures(config: dict) -> DataFrame:
    """
    Bridge: Locations ↔ Creatures
    Source: silver.location_creatures (789 relationships)

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated location_creature_key, location_id,
        creature_name and created_at.
    """
    logger.info("🔗 Creating bridge_location_creatures...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.location_creatures"
    relationships = spark.table(source_table)

    output_columns = [
        monotonically_increasing_id().alias("location_creature_key"),
        col("location_id"),
        col("creature_name"),
        current_timestamp().alias("created_at"),
    ]
    return relationships.select(*output_columns)

def create_bridge_location_bosses(config: dict) -> DataFrame:
    """
    Bridge: Locations ↔ Bosses
    Source: silver.location_bosses (235 relationships)

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated location_boss_key, location_id, boss_name
        and created_at.
    """
    logger.info("🔗 Creating bridge_location_bosses...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.location_bosses"
    relationships = spark.table(source_table)

    output_columns = [
        monotonically_increasing_id().alias("location_boss_key"),
        col("location_id"),
        col("boss_name"),
        current_timestamp().alias("created_at"),
    ]
    return relationships.select(*output_columns)


def create_bridge_boss_drops(config: dict) -> DataFrame:
    """
    Bridge: Bosses → Drop Items (with classification)

    Source: silver.boss_drops (with drop_type classification)
    Grain: One row per boss per location per drop item

    Drop types: runes, remembrance, great_rune, weapon, armor,
                material, talisman, ash_of_war, item

    Args:
        config: Mapping providing the 'catalog' and 'silver_schema' names.

    Returns:
        DataFrame with a generated boss_drop_key, the boss and location
        identifiers, drop_item, drop_type, drop_order as int and created_at.
    """
    logger.info("🔗 Creating bridge_boss_drops...")

    source_table = f"{config['catalog']}.{config['silver_schema']}.boss_drops"
    drops = spark.table(source_table)

    # Columns copied through unchanged; drop_type carries the 9 drop classifications.
    passthrough = [
        col(name)
        for name in ("boss_id", "boss_name", "location_name", "drop_item", "drop_type")
    ]

    return drops.select(
        monotonically_increasing_id().alias("boss_drop_key"),
        *passthrough,
        col("drop_order").cast("int"),
        current_timestamp().alias("created_at"),
    )


In [0]:
def create_agg_weapon_rankings(config: dict) -> DataFrame:
    """
    Pre-aggregated weapon rankings by damage efficiency.

    Metrics:
    - Damage per stat point (attack / (STR + DEX))
    - Damage per weight
    - Best weapons by category

    Reads gold.fact_weapon_stats, so that table must be written first.

    Args:
        config: Mapping providing the 'catalog' and 'gold_schema' names.

    Returns:
        DataFrame with one row per weapon (its highest upgrade_level row),
        ranked within its category by damage_per_stat_point descending.
    """
    logger.info("📊 Creating agg_weapon_rankings...")

    fact_table = f"{config['catalog']}.{config['gold_schema']}.fact_weapon_stats"
    weapon_stats = spark.table(fact_table)

    # Keep only the row with the highest upgrade level of each weapon.
    highest_upgrade_first = (
        Window.partitionBy("weapon_name").orderBy(col("upgrade_level").desc())
    )
    numbered = weapon_stats.withColumn("rn", row_number().over(highest_upgrade_first))
    top_upgrades = numbered.filter(col("rn") == 1).drop("rn")

    # Rank by damage efficiency inside each category.
    by_efficiency = (
        Window.partitionBy("category").orderBy(col("damage_per_stat_point").desc())
    )

    metric_cols = [
        col(name)
        for name in (
            "weapon_name", "category", "upgrade_level",
            "total_attack_power", "damage_per_stat_point", "damage_per_weight",
        )
    ]

    return top_upgrades.select(
        *metric_cols,
        rank().over(by_efficiency).alias("rank_in_category"),
        current_timestamp().alias("created_at"),
    )

In [0]:
def create_agg_armor_efficiency(config: dict) -> DataFrame:
    """
    Pre-aggregated armor efficiency metrics.

    Metrics:
    - Protection per weight
    - Poise per weight
    - Best armor pieces by type

    Reads gold.fact_armor_stats, so that table must be written first.

    Args:
        config: Mapping providing the 'catalog' and 'gold_schema' names.

    Returns:
        DataFrame with the armor efficiency columns and rank_in_type, the rank
        within armor_type by protection_per_weight descending.
    """
    logger.info("📊 Creating agg_armor_efficiency...")

    fact_table = f"{config['catalog']}.{config['gold_schema']}.fact_armor_stats"
    armor_stats = spark.table(fact_table)

    by_protection = (
        Window.partitionBy("armor_type").orderBy(col("protection_per_weight").desc())
    )

    metric_cols = [
        col(name)
        for name in (
            "armor_name", "armor_type", "weight",
            "negation_physical", "protection_per_weight", "poise_per_weight",
        )
    ]

    return armor_stats.select(
        *metric_cols,
        rank().over(by_protection).alias("rank_in_type"),
        current_timestamp().alias("created_at"),
    )

In [0]:
def create_agg_status_buildup(config: dict) -> DataFrame:
    """
    Pre-aggregated status effect buildup analysis.

    Focus: Best weapons for inflicting status effects

    Reads gold.fact_status_effects, so that table must be written first.

    Args:
        config: Mapping providing the 'catalog' and 'gold_schema' names.

    Returns:
        DataFrame with one row per weapon/effect pair (its highest
        upgrade_level row), ranked within effect_type by total_buildup
        descending.
    """
    logger.info("📊 Creating agg_status_buildup...")

    fact_table = f"{config['catalog']}.{config['gold_schema']}.fact_status_effects"
    status_effects = spark.table(fact_table)

    # Keep the highest upgrade of each weapon-effect combination.
    highest_upgrade_first = (
        Window.partitionBy("weapon_name", "effect_type")
        .orderBy(col("upgrade_level").desc())
    )
    numbered = status_effects.withColumn("rn", row_number().over(highest_upgrade_first))
    top_upgrades = numbered.filter(col("rn") == 1).drop("rn")

    # Rank by total buildup inside each effect type.
    by_buildup = Window.partitionBy("effect_type").orderBy(col("total_buildup").desc())

    metric_cols = [
        col(name)
        for name in (
            "weapon_name", "effect_type", "upgrade_level", "buildup_primary",
            "buildup_secondary", "total_buildup", "has_dual_buildup",
        )
    ]

    return top_upgrades.select(
        *metric_cols,
        rank().over(by_buildup).alias("rank_in_effect"),
        current_timestamp().alias("created_at"),
    )

In [0]:
def create_agg_boss_difficulty(config: dict) -> DataFrame:
    """
    Pre-aggregated boss difficulty metrics

    Combines: HP, phases, drop value to estimate difficulty

    Reads gold.fact_boss_encounters and gold.bridge_boss_drops, so both tables
    must be written first. Drops are counted per boss_name and left-joined to
    the encounters; bosses without drops get 0 for every count.

    Args:
        config: Mapping providing the 'catalog' and 'gold_schema' names.

    Returns:
        DataFrame with the boss HP/phase/classification columns,
        difficulty_score, total_drops, remembrance_count, great_rune_count,
        is_dlc, difficulty_rank (rank by difficulty_score descending) and
        created_at.
    """
    logger.info("📊 Creating agg_boss_difficulty...")

    df_bosses = spark.table(f"{config['catalog']}.{config['gold_schema']}.fact_boss_encounters")
    df_drops = spark.table(f"{config['catalog']}.{config['gold_schema']}.bridge_boss_drops")

    # Count drops by type; count() ignores the NULLs produced by a non-matching when().
    df_drop_counts = df_drops.groupBy("boss_name").agg(
        spark_count("*").alias("total_drops"),
        spark_count(when(col("drop_type") == "remembrance", 1)).alias("remembrance_count"),
        spark_count(when(col("drop_type") == "great_rune", 1)).alias("great_rune_count")
    )

    # NOTE(review): the join key is boss_name; if several bosses share a name their
    # drops are pooled -- confirm against the Silver data.
    df_agg = df_bosses.join(df_drop_counts, "boss_name", "left").select(
        col("boss_name"),
        col("hp_min"),
        col("hp_max"),
        col("phase_count"),
        col("boss_classification"),
        col("difficulty_score"),
        coalesce(col("total_drops"), lit(0)).alias("total_drops"),
        coalesce(col("remembrance_count"), lit(0)).alias("remembrance_count"),
        # Aggregated above; expose it so it is not silently discarded.
        coalesce(col("great_rune_count"), lit(0)).alias("great_rune_count"),
        col("is_dlc"),

        # Rank by difficulty (global window: no partitioning)
        rank().over(Window.orderBy(col("difficulty_score").desc())).alias("difficulty_rank"),

        current_timestamp().alias("created_at")
    )

    return df_agg

In [0]:
def _materialize_table_group(config: dict, table_group: list, stats: dict) -> None:
    """
    Build and persist every table in one group, recording the outcome in stats.

    Each (table_name, transform_func) pair is processed independently: a
    failure is logged with its traceback and recorded in
    stats["tables_failed"], and the remaining tables are still attempted.

    Args:
        config: Must provide 'catalog' and 'gold_schema'; passed through to
                each transform function.
        table_group: List of (table_name, transform_func) tuples.
        stats: Mutated in place; fully qualified names are appended to
               "tables_created", bare table names to "tables_failed".
    """
    for table_name, transform_func in table_group:
        try:
            df = transform_func(config)
            full_name = f"{config['catalog']}.{config['gold_schema']}.{table_name}"

            df.write \
                .format("delta") \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .saveAsTable(full_name)

            # Count the persisted table rather than calling df.count():
            # df.count() would run the whole transformation a second time
            # just to produce a log line.
            row_count = spark.table(full_name).count()
            stats["tables_created"].append(full_name)
            logger.info(f"  ✅ {table_name}: {row_count:,} rows")
        except Exception as e:
            # Best-effort by design: record the failure and keep going.
            stats["tables_failed"].append(table_name)
            # exc_info keeps the traceback, which the message alone loses.
            logger.error(f"  ❌ {table_name}: {e}", exc_info=True)


def run_gold_transformations(config: dict) -> dict:
    """
    Execute all Gold layer transformations

    Creates:
    - 8 dimension tables
    - 6 fact tables
    - 5 bridge tables
    - 4 aggregate tables

    Total: 23 Gold tables

    Groups run in the order above; aggregates run last because they read
    Gold fact/bridge tables written earlier in the same run. A table that
    fails does not stop the run.

    Args:
        config: Must provide 'catalog' and 'gold_schema'; also forwarded to
                every transform function.

    Returns:
        dict with keys:
        - "start_time" / "end_time": datetime of run start and finish
        - "duration": elapsed seconds (float)
        - "tables_created": fully qualified names of tables written
        - "tables_failed": bare names of tables whose build raised
    """
    stats = {
        "start_time": datetime.now(),
        "tables_created": [],
        "tables_failed": []
    }

    logger.info("=" * 80)
    logger.info("🏆 STARTING GOLD LAYER TRANSFORMATIONS")
    logger.info("=" * 80)

    # ========================================================================
    # DIMENSION TABLES (8)
    # ========================================================================
    logger.info("\n📊 CREATING DIMENSION TABLES...")

    dimensions = [
        ("dim_weapons", create_dim_weapons),
        ("dim_shields", create_dim_shields),
        ("dim_armors", create_dim_armors),
        ("dim_items", create_dim_items),
        ("dim_locations", create_dim_locations),
        ("dim_npcs", create_dim_npcs),
        ("dim_bosses", create_dim_bosses),
        ("dim_date", create_dim_date)
    ]
    _materialize_table_group(config, dimensions, stats)

    # ========================================================================
    # FACT TABLES (6)
    # ========================================================================
    logger.info("\n📈 CREATING FACT TABLES...")

    facts = [
        ("fact_weapon_stats", create_fact_weapon_stats),
        ("fact_shield_stats", create_fact_shield_stats),
        ("fact_armor_stats", create_fact_armor_stats),
        ("fact_boss_encounters", create_fact_boss_encounters),
        ("fact_spell_usage", create_fact_spell_usage),
        ("fact_status_effects", create_fact_status_effects)
    ]
    _materialize_table_group(config, facts, stats)

    # ========================================================================
    # BRIDGE TABLES (5)
    # ========================================================================
    logger.info("\n🔗 CREATING BRIDGE TABLES...")

    bridges = [
        ("bridge_location_items", create_bridge_location_items),
        ("bridge_location_npcs", create_bridge_location_npcs),
        ("bridge_location_creatures", create_bridge_location_creatures),
        ("bridge_location_bosses", create_bridge_location_bosses),
        ("bridge_boss_drops", create_bridge_boss_drops)
    ]
    _materialize_table_group(config, bridges, stats)

    # ========================================================================
    # AGGREGATE TABLES (4) - Must run AFTER facts/bridges
    # ========================================================================
    logger.info("\n📊 CREATING AGGREGATE TABLES...")

    aggregates = [
        ("agg_weapon_rankings", create_agg_weapon_rankings),
        ("agg_armor_efficiency", create_agg_armor_efficiency),
        ("agg_status_buildup", create_agg_status_buildup),
        ("agg_boss_difficulty", create_agg_boss_difficulty)
    ]
    _materialize_table_group(config, aggregates, stats)

    stats["end_time"] = datetime.now()
    stats["duration"] = (stats["end_time"] - stats["start_time"]).total_seconds()

    logger.info("\n" + "=" * 80)
    logger.info("🏆 GOLD LAYER COMPLETE")
    logger.info(f"✅ Tables created: {len(stats['tables_created'])}")
    logger.info(f"❌ Tables failed: {len(stats['tables_failed'])}")
    logger.info(f"⏱️  Duration: {stats['duration']:.2f}s")
    logger.info("=" * 80)

    return stats

In [0]:
if __name__ == "__main__":
    # Entry point: build the run configuration, execute the Gold layer,
    # then log which tables were written and which failed.
    batch_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    CONFIG = {
        "catalog": "eldenringcatalog",
        "bronze_schema": "bronze",
        "silver_schema": "silver",
        "gold_schema": "gold",
        "batch_id": batch_stamp,
    }

    try:
        stats = run_gold_transformations(CONFIG)

        created_tables = stats['tables_created']
        logger.info("\n📊 GOLD LAYER SUMMARY")
        logger.info(f"Total tables: {len(created_tables)}")
        logger.info("\n✅ CREATED TABLES:")
        for created_name in created_tables:
            logger.info(f"  • {created_name}")

        # Per-table failures are reported as warnings only; they do not
        # raise from this block.
        failed_tables = stats['tables_failed']
        if failed_tables:
            logger.warning("\n❌ FAILED TABLES:")
            for failed_name in failed_tables:
                logger.warning(f"  • {failed_name}")

    except Exception as err:
        # Log, print the traceback, then re-raise so the caller sees the error.
        logger.error(f"\n❌ Gold layer failed: {str(err)}")
        import traceback
        traceback.print_exc()
        raise

In [0]:
%sql
-- Inspect the contents of the Gold dim_locations table
SELECT *
FROM eldenringcatalog.gold.dim_locations