In [0]:
import re

def clean_column_names(df):
    """Return *df* with sanitized, lower-cased, unique column names.

    Every character other than ASCII letters, digits and underscore is
    removed from each column name and the result is lower-cased. Names
    that end up identical are disambiguated with a numeric suffix
    (``name``, ``name_1``, ``name_2``, ...) in column order.

    Args:
        df: An object exposing ``columns`` (an iterable of column-name
            strings) and ``toDF(*names)``, as the Spark DataFrames used
            in this file do.

    Returns:
        Whatever ``df.toDF`` returns for the cleaned names.
    """
    sanitized = [re.sub(r'[^0-9a-zA-Z_]', '', c).lower() for c in df.columns]

    taken = set()        # every name already emitted
    last_suffix = {}     # base name -> highest suffix tried so far
    clean_cols = []
    for position, base in enumerate(sanitized):
        if not base:
            # The original name consisted solely of stripped characters;
            # fall back to a positional placeholder instead of "".
            base = f"col_{position}"

        candidate = base
        suffix = last_suffix.get(base, 0)
        # Keep bumping the suffix until the name is really free: a generated
        # "x_1" must not clash with a column that is already called "x_1".
        while candidate in taken:
            suffix += 1
            candidate = f"{base}_{suffix}"

        last_suffix[base] = suffix
        taken.add(candidate)
        clean_cols.append(candidate)

    return df.toDF(*clean_cols)

# Load the Bronze Delta data and normalize its column names
bronze_df = spark.read.format("delta").load("dbfs:/nyc-taxi/bronze/")
bronze_df = clean_column_names(bronze_df)

# Keep only rows that have a passenger count and a trip distance above zero
silver_df = bronze_df.filter(
    (bronze_df.passenger_count.isNotNull()) &
    (bronze_df.trip_distance > 0)
)

# Write to Silver path, replacing any data already stored there
# NOTE(review): if the cleaned schema differs from data already at this path,
# Delta may reject the overwrite unless "overwriteSchema" is enabled — confirm.
silver_path = "dbfs:/nyc-taxi/silver/"
silver_df.write.format("delta").mode("overwrite").save(silver_path)

# Register the cleaned Silver table. IF NOT EXISTS keeps this cell re-runnable:
# the data write above is an overwrite, so the registration must not fail
# just because the table was already created by a previous run.
spark.sql(
    f"CREATE TABLE IF NOT EXISTS nyc_taxi_silver USING DELTA LOCATION '{silver_path}'"
)

print("Silver layer rebuilt with clean schema and metadata.")


Silver layer rebuilt with clean schema and metadata.
