# Silver Layer - Data Cleaning and Transformation
This notebook reads from the bronze layer and creates a cleaned, validated silver layer.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, trim, upper, lower, when, coalesce, 
    to_date, datediff, current_date, year, month,
    regexp_replace, round as spark_round
)
from pyspark.sql.types import DecimalType
import yaml

In [0]:
# --- Spark session ---------------------------------------------------------
# getOrCreate() hands back the already-active session when there is one and
# only builds a new session otherwise.
spark = SparkSession.builder.getOrCreate()

# --- Pipeline configuration ------------------------------------------------
# Tables are addressed downstream as <catalog>.<schema>.<table>.
CATALOG_NAME = "orderbooks_main"
BRONZE_SCHEMA, SILVER_SCHEMA = "bronze", "silver"
BRONZE_TABLE, SILVER_TABLE = "orderbook", "orderbook_clean"

# Echo the source and target so the run log shows what this notebook touched.
print("Building Silver Layer")
print("Source: " + ".".join([CATALOG_NAME, BRONZE_SCHEMA, BRONZE_TABLE]))
print("Target: " + ".".join([CATALOG_NAME, SILVER_SCHEMA, SILVER_TABLE]))

In [0]:
# Make sure the silver schema exists inside the configured catalog.
# The cell is safe to re-run: USE CATALOG only switches the session context
# and CREATE SCHEMA is guarded by IF NOT EXISTS.
for statement in (
    f"USE CATALOG {CATALOG_NAME}",
    f"CREATE SCHEMA IF NOT EXISTS {SILVER_SCHEMA}",
):
    spark.sql(statement)

print(f"✓ Schema {SILVER_SCHEMA} ready")

In [0]:
# Load the bronze table through its fully-qualified <catalog>.<schema>.<table> name
bronze_table_path = ".".join([CATALOG_NAME, BRONZE_SCHEMA, BRONZE_TABLE])
df_bronze = spark.table(bronze_table_path)

# count() is an action, so this line runs a Spark job over the bronze table
bronze_row_count = df_bronze.count()
print(f"✓ Loaded {bronze_row_count:,} rows from bronze layer")

print("\nBronze Schema:")
df_bronze.printSchema()

In [0]:
# Render the raw bronze rows for a quick visual inspection
# (function form of display, matching the silver preview cell further down)
display(df_bronze)

In [0]:
# Show the 10 rows with the earliest anticipated end dates.
# The row cap is applied with limit(): display() has no documented row-count
# argument, so passing 10 to it does not express "top 10".
# NOTE(review): Spark's default ascending sort places NULLs first, so rows with a
# missing anticipated_end_date will appear at the top of this preview.
df_bronze.orderBy("anticipated_end_date").limit(10).display()

In [0]:
# Bronze rows that have no anticipated end date
null_anticipated_end_dates = df_bronze.where(col("anticipated_end_date").isNull())

# How many of those rows each office contributes, largest group first
(
    null_anticipated_end_dates
    .groupBy("Office")
    .count()
    .orderBy(col("count").desc())
    .display()
)

## Data Quality Checks

In [0]:
# Duplicate detection: compare the full row count with the number of
# distinct row_hash values. Any gap means repeated hashes.
total_rows = df_bronze.count()
unique_rows = df_bronze.select("row_hash").dropDuplicates().count()
duplicates = total_rows - unique_rows

print("Data Quality Summary:")
print(f"  Total rows: {total_rows:,}")
print(f"  Unique rows: {unique_rows:,}")
print(f"  Duplicates: {duplicates:,}")

# Missing-value audit for the columns every record is expected to carry.
# Each column is counted with its own Spark job.
critical_columns = ['JobNumber', 'ProjectTitle', 'Client', 'Office']
print("\nNull counts in critical columns:")
for col_name in critical_columns:
    null_count = df_bronze.where(col(col_name).isNull()).count()
    print(f"  {col_name}: {null_count:,}")

## Data Transformations

In [0]:
# Transformation pipeline, part 1: de-duplicate, tidy text, normalise categoricals.

# 1. Keep a single row per row_hash.
#    dropDuplicates does not promise which of the duplicate rows survives.
df_silver = df_bronze.dropDuplicates(['row_hash'])
remaining_rows = df_silver.count()
print(f"✓ Removed duplicates: {remaining_rows:,} rows remaining")

# 2. Strip leading/trailing whitespace from the text and code columns
string_columns = [
    'JobNumber', 'Office', 'office_div', 'ProjectTitle', 'Client',
    'location_country', 'Currency', 'Status', 'ProjectType',
]
for col_name in string_columns:
    df_silver = df_silver.withColumn(col_name, trim(col(col_name)))

print("✓ Cleaned string columns")

# 3. Upper-case Status so differently-cased spellings collapse into one value
status_normalised = upper(trim(col('Status')))
df_silver = df_silver.withColumn('Status', status_normalised)

# 4. Default a missing NewProject to 0; existing values pass through unchanged
new_project_filled = when(col('NewProject').isNull(), 0).otherwise(col('NewProject'))
df_silver = df_silver.withColumn('NewProject', new_project_filled)

print("✓ Standardized categorical values")

In [0]:
# 5. Derived fields
#
# Each expression is built as a named Column first and attached afterwards.
# A when(...) without an otherwise() yields NULL for rows that do not match,
# which is how "not computable" is represented for the numeric fields below.

start_date = col('StartDate')
end_date = col('anticipated_end_date')
gross_fee = col('gross_fee_usd')
earned_fee = col('fee_earned_usd')

# Planned length of the project in days; NULL unless both dates are present
duration_expr = when(
    start_date.isNotNull() & end_date.isNotNull(),
    datediff(end_date, start_date),
)
df_silver = df_silver.withColumn('project_duration_days', duration_expr)

# Days left until the anticipated end, measured from today
# (negative once the end date has passed); NULL without an end date
days_left_expr = when(
    end_date.isNotNull(),
    datediff(end_date, current_date()),
)
df_silver = df_silver.withColumn('days_until_end', days_left_expr)

# Share of the gross fee already earned, as a percentage with 2 decimals.
# Only computed for a positive gross fee, which also avoids dividing by zero.
completion_expr = when(
    gross_fee.isNotNull() & (gross_fee > 0),
    spark_round((earned_fee / gross_fee) * 100, 2),
)
df_silver = df_silver.withColumn('fee_completion_pct', completion_expr)

# Lifecycle bucket derived from the end date and the earned fee.
# Branches are evaluated top to bottom; rows matching none become 'UNKNOWN'
# (for example when days_until_end is NULL).
days_left = col('days_until_end')
not_yet_ended = days_left >= 0
nothing_earned = earned_fee.isNull() | (earned_fee == 0)

status_expr = (
    when(days_left < 0, 'COMPLETED')
    .when(not_yet_ended & (earned_fee > 0), 'ACTIVE')
    .when(not_yet_ended & nothing_earned, 'UPCOMING')
    .otherwise('UNKNOWN')
)
df_silver = df_silver.withColumn('project_status_derived', status_expr)

print("✓ Calculated derived fields")

In [0]:
# 6. Add data quality flags


def _has_value(column_name):
    """Return a boolean Column that is true when `column_name` is neither NULL nor blank.

    Blank means empty or whitespace-only. The string-cleaning step trims values
    but does not convert '' to NULL, so an isNotNull() check on its own would
    treat an empty identifier as present.
    """
    field = col(column_name)
    return field.isNotNull() & (trim(field) != '')


# Number of decimal places (of USD) at which fee components must reconcile.
# Differences that vanish when rounded to this precision are treated as
# arithmetic noise rather than a data problem.
FEE_RECONCILIATION_DECIMALS = 2

# A record is valid only when every identifying field carries a real value.
df_silver = df_silver.withColumn(
    'is_valid_record',
    when(
        _has_value('JobNumber')
        & _has_value('ProjectTitle')
        & _has_value('Client')
        & _has_value('Office'),
        True
    ).otherwise(False)
)

# Flag records whose gross fee does not equal earned + yet-to-be-earned.
# The comparison is made on the rounded difference instead of exact inequality.
# NOTE(review): the column dtypes are not visible in this notebook; if these are
# floating-point amounts, exact != would flag sub-cent rounding noise - confirm.
# Rows with any of the three amounts missing are not flagged (same as before).
fee_gap = col('gross_fee_usd') - (
    col('fee_earned_usd') + col('gross_fee_yet_to_be_earned_usd')
)
df_silver = df_silver.withColumn(
    'has_financial_issues',
    when(
        col('gross_fee_usd').isNotNull()
        & col('fee_earned_usd').isNotNull()
        & col('gross_fee_yet_to_be_earned_usd').isNotNull()
        & (spark_round(fee_gap, FEE_RECONCILIATION_DECIMALS) != 0),
        True
    ).otherwise(False)
)

print(f"✓ Added data quality flags")

## Quality Checks on Silver Data

In [0]:
# Headline counts for the transformed frame.
# Every count() below is a separate Spark action.
silver_total = df_silver.count()
silver_valid = df_silver.where(col('is_valid_record')).count()
silver_flagged = df_silver.where(col('has_financial_issues')).count()

print("\nSilver Layer Summary:")
print(f"  Total rows: {silver_total:,}")
print(f"  Valid records: {silver_valid:,}")
print(f"  Records with financial issues: {silver_flagged:,}")

# Row counts per derived lifecycle status, most common first
print("\nProject Status Distribution:")
(
    df_silver
    .groupBy('project_status_derived')
    .count()
    .orderBy(col('count').desc())
    .show()
)

# The ten offices with the most rows
print("\nOffice Distribution:")
(
    df_silver
    .groupBy('Office')
    .count()
    .orderBy(col('count').desc())
    .show(10)
)

In [0]:
# Spot-check ten transformed rows: identifiers first, then the fee figures,
# the derived status fields and finally the two quality flags.
preview_columns = [
    'JobNumber', 'ProjectTitle', 'Client', 'Office',
    'gross_fee_usd', 'fee_earned_usd', 'fee_completion_pct',
    'project_status_derived', 'days_until_end',
    'is_valid_record', 'has_financial_issues',
]
silver_preview = df_silver.select(*preview_columns).limit(10)
display(silver_preview)

## Write to Silver Layer

In [0]:
# Persist the silver frame, replacing whatever the target table held before
silver_table_path = ".".join([CATALOG_NAME, SILVER_SCHEMA, SILVER_TABLE])

try:
    silver_writer = df_silver.write.mode("overwrite")
    silver_writer.saveAsTable(silver_table_path)

    # Read the table back so the reported count reflects what was stored
    final_count = spark.table(silver_table_path).count()

    print("\n✓ SUCCESS!")
    print(f"✓ Data written to: {silver_table_path}")
    print(f"✓ Total records in silver table: {final_count:,}")

except Exception as e:
    # Log the failure with its traceback, then re-raise so the run still fails
    print(f"❌ Error writing to silver layer: {e}")
    import traceback
    traceback.print_exc()
    raise

In [0]:
# Read the stored table back and print its schema to confirm what was persisted
print("\nSilver Table Schema:")
silver_written = spark.table(silver_table_path)
silver_written.printSchema()