### Custom schema evolution in plain Spark, which should work on GCP, unlike Databricks (`cloudFiles`)


In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_json, struct
from pyspark.sql.types import StructType

# Build the Spark session (getOrCreate reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("SchemaEvolutionGCS")
    .getOrCreate()
)

# Input location on GCS
gcs_path = "gs://your-bucket/path/*.csv"  # or *.json

# Columns known up front; no fields here means the schema is taken
# from the first batch that is read
expected_schema = StructType([])

def rescue_unexpected_fields(df, expected_cols):
    """Move columns that are not in *expected_cols* into ``_rescued_data``.

    Every unexpected column is packed into a single JSON string column
    named ``_rescued_data`` and then dropped from the DataFrame.

    Args:
        df: Input DataFrame.
        expected_cols: Iterable of column names that belong to the
            expected schema (a set, list or tuple all work).

    Returns:
        A DataFrame that always has a ``_rescued_data`` column: a JSON
        string of the unexpected columns, or a null string when there
        were none.
    """
    rescued_col = "_rescued_data"
    expected = set(expected_cols)

    # Walk df.columns (not a set) so the JSON key order is deterministic.
    extra_cols = [c for c in df.columns if c not in expected]

    if extra_cols:
        df = df.withColumn(rescued_col, to_json(struct(*[col(c) for c in extra_cols])))
        df = df.drop(*extra_cols)
    elif rescued_col not in df.columns:
        # Callers select "_rescued_data" unconditionally, so the column must
        # exist even when nothing was rescued. Cast to string so it is not a
        # NullType column, which file sinks such as Parquet cannot store.
        df = df.withColumn(rescued_col, lit(None).cast("string"))
    return df

# Read files and infer schema dynamically
df = spark.read.option("header", "true").csv(gcs_path)  # For JSON use .json(gcs_path)

# If no expected schema was supplied, adopt the one from this batch
if not expected_schema.fields:
    expected_schema = df.schema

# Align columns: add missing columns with nulls
expected_cols = set(expected_schema.fieldNames())
actual_cols = set(df.columns)

missing_cols = expected_cols - actual_cols
for col_name in missing_cols:
    # Cast the null to the expected type: a bare lit(None) yields a NullType
    # column, which the Parquet writer below rejects.
    df = df.withColumn(col_name, lit(None).cast(expected_schema[col_name].dataType))

# Rescue unexpected fields
df = rescue_unexpected_fields(df, expected_cols)

# When the batch has no unexpected columns there may be no rescued column
# yet; add an empty one so the select below does not fail.
if "_rescued_data" not in df.columns:
    df = df.withColumn("_rescued_data", lit(None).cast("string"))

# Reorder columns to match expected schema
df = df.select(*expected_schema.fieldNames(), "_rescued_data")

# Write cleaned data back to GCS or Delta
df.write.mode("overwrite").parquet("gs://your-bucket/cleaned-data/")
