In [1]:
# ==============================================================================
# SPARK PERFORMANCE TUNING & OPTIMIZATION
# ==============================================================================

# Session-level Spark SQL settings. They are applied in declaration order by
# the loop that follows the table.
_spark_sql_settings = {
    # Shuffle Partitions: number of partitions used when shuffling data for
    # joins or aggregations. 200 is the default; declared for consistency.
    "spark.sql.shuffle.partitions": "200",

    # Adaptive Query Execution (AQE): lets Spark re-optimize query plans at
    # runtime based on the actual statistics of the data being processed.
    "spark.sql.adaptive.enabled": "true",

    # Dynamic Partition Coalescing: lets AQE merge small partitions together
    # to reduce overhead when data volume is lower than expected.
    "spark.sql.adaptive.coalescePartitions.enabled": "true",

    # Advisory partition size: asks AQE to target 64MB partitions. Smaller
    # partitions reduce per-task memory pressure ('Out of Memory' errors).
    "spark.sql.adaptive.advisoryPartitionSizeInBytes": "64mb",

    # Skew Join Optimization: handles 'data skew', where one partition is much
    # larger than the others and a single task would bottleneck the pipeline.
    "spark.sql.adaptive.skewJoin.enabled": "true",
}

for _setting_key, _setting_value in _spark_sql_settings.items():
    spark.conf.set(_setting_key, _setting_value)

# Checkpointing: directory used for checkpoint data, which allows long
# lineage chains to be truncated.
spark.sparkContext.setCheckpointDir("Files/checkpoints")

print("Spark configurations applied!")

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 3, Finished, Available, Finished)

Spark configurations applied!


In [2]:
# ==============================================================================
# STANDARD LIBRARIES & EXTERNAL DATA TOOLS
# ==============================================================================

# mssparkutils: Microsoft Fabric utility toolset for file system 
# management and notebook orchestration.
from notebookutils import mssparkutils

# Advanced String Functions:
# - regexp_replace: Used for complex pattern-based cleaning (e.g., removing special characters).
# - lower / initcap: Standardizes casing for names and categories.
# - translate: Efficiently replaces specific characters (useful for encoding or accent removal).
# - when: Essential for conditional 'if-then-else' logic within Spark columns.
from pyspark.sql.functions import col, trim, regexp_replace, lower, initcap, translate, when 

# F: Standard alias for the full PySpark SQL functions suite.
from pyspark.sql import functions as F

# Data Science Ecosystem:
# - pandas (pd): Used for smaller, local data manipulations or converting to/from Spark.
# - numpy (np): Provides support for large, multi-dimensional arrays and high-level math.
import pandas as pd
import numpy as np

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 4, Finished, Available, Finished)

In [3]:
# Function to interface with the Spark catalog and retrieve data
def extract_table(table_name: str):
    """
    Load a table from the Spark catalog as a DataFrame.

    Thin wrapper around ``spark.read.table`` using the notebook's ambient
    ``spark`` session.

    Args:
        table_name (str): The name of the table to be extracted.
                          Example: 'BronzeLakeHouse.dbo.customers'

    Returns:
        pyspark.sql.dataframe.DataFrame: A Spark DataFrame backed by the table.
    """
    return spark.read.table(table_name)

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 5, Finished, Available, Finished)

In [4]:
# Function to automate whitespace removal across all string-type data
def trim_all_string_cols(df):
    """
    Strip leading and trailing whitespace from every string-typed column.

    Args:
        df (pyspark.sql.DataFrame): The input DataFrame.

    Returns:
        pyspark.sql.DataFrame: The DataFrame returned by ``withColumns`` with
        each string column replaced by its trimmed version; columns of other
        types are not included in the replacement map.
    """
    # df.dtypes yields (column_name, type_name) pairs. Only columns whose type
    # name is exactly 'string' get a trim expression.
    trim_exprs = {}
    for column_name, type_name in df.dtypes:
        if type_name == 'string':
            trim_exprs[column_name] = trim(col(column_name))

    # A single withColumns call replaces all selected columns at once.
    return df.withColumns(trim_exprs)

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 6, Finished, Available, Finished)

In [5]:
# Function to standardize city names by removing accents and special characters
def clean_geolocation_city(df_geo):
    """
    Standardize the ``geolocation_city`` column.

    The value is lower-cased, accented characters are mapped to their plain
    equivalents, every character other than a-z, 0-9 and whitespace is
    removed, and finally whitespace is collapsed to single spaces and trimmed.

    Whitespace normalization runs LAST on purpose: stripping symbols can leave
    behind leading, trailing or doubled spaces (e.g. "* cidade" or "a - b"),
    so collapsing/trimming before that step would not produce a clean value.

    Args:
        df_geo (pyspark.sql.DataFrame): DataFrame with a ``geolocation_city``
            column.

    Returns:
        pyspark.sql.DataFrame: The DataFrame with ``geolocation_city`` replaced
        by its cleaned version. The cleaned column is positioned as the last
        column (it is added, the original dropped, then renamed).
    """
    print("In clean_geolocation_city ...")

    # 1. Standardize casing first so the accent map only needs lowercase keys.
    normalized = lower(F.col("geolocation_city"))

    # 2. Map accented characters (e.g. 'ã') to plain equivalents ('a').
    normalized = translate(
        normalized,
        "áàâãäéèêëíìîïóòôõöúùûüçñ",
        "aaaaaeeeeiiiiooooouuuucn"
    )

    # 3. Remove punctuation/symbols (anything that is not a-z, 0-9 or whitespace).
    normalized = regexp_replace(normalized, "[^a-z0-9\\s]", "")

    # 4. Collapse whitespace runs into one space, then drop outer spaces.
    cleaned_city = trim(regexp_replace(normalized, "\\s+", " "))

    # Apply the combined expression and replace the original column.
    df_cleaned = df_geo.withColumn("geolocation_city_clean", cleaned_city)
    df_cleaned = df_cleaned.drop("geolocation_city").withColumnRenamed("geolocation_city_clean", "geolocation_city")

    return df_cleaned

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 7, Finished, Available, Finished)

In [6]:
# Function to aggregate spatial coordinates and remove redundancy
def group_consolidate_data(df):
    """
    Consolidates geolocation data by grouping on postal and administrative levels.
    Calculates the average latitude and longitude for each unique Zip/State/City
    combination, yielding exactly one row per combination.

    Args:
        df (pyspark.sql.DataFrame): The cleaned geolocation DataFrame.

    Returns:
        pyspark.sql.DataFrame: A summarized DataFrame with one set of coordinates per location.
    """

    print("In group_consolidate_data ...")

    # Spatial Aggregation: group by the location hierarchy and take the mean of
    # the coordinates of all entries that share the same key.
    # The aggregation already emits a single row per (zip, state, city) key, so
    # no additional de-duplication pass is required on the result.
    group_df = df.groupBy(['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city']) \
        .agg(
            F.mean('geolocation_lat').alias('geolocation_lat'),
            F.mean('geolocation_lng').alias('geolocation_lng')
        )

    return group_df

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 8, Finished, Available, Finished)

In [7]:
# Function to persist the consolidated geolocation data into a Delta Table
def load_df_to_delta(df, table_name: str, mode: str = "overwrite"):
    """
    Save a DataFrame as a Delta table.

    The write is best-effort: any exception raised by the writer is caught and
    printed rather than propagated. The outcome is reported through the return
    value so that callers can detect a failed load instead of silently
    continuing.

    Args:
        df (spark DataFrame): DataFrame to be written.
        table_name (str): Name of the destination table.
        mode (str): Save mode passed to the writer; defaults to 'overwrite',
            which replaces the table.

    Returns:
        bool: True when the table was written, False when the write raised.
    """
    print("In load_df_to_delta ...")

    try:
        # Save as a Delta Table. "overwriteSchema" is always set on the writer.
        df.write.format("delta") \
            .mode(mode) \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_name)
    except Exception as e:
        # Report the failure (e.g. permissions or path issues) without raising.
        print(f"Error processing table {table_name}: {str(e)}")
        return False

    print(f"Table '{table_name}' loaded successfully !")
    return True


StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 9, Finished, Available, Finished)

In [8]:
# Function to perform high-performance uniqueness validation on composite keys
def check_records_count(df):
    """
    Audits the DataFrame for duplicates based on the composite key of
    Zip Code, State, and City, printing the total, distinct and duplicate
    counts and, when duplicates exist, a sample of the offending keys.

    The input is repartitioned on the key columns and cached so the several
    count actions reuse the same data. The cache is always released, even if
    one of the actions raises.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to be audited.
    """

    print("In check_records_count ...")

    # 1. Define the composite key columns: uniqueness is defined by this triplet.
    key_cols = ['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city']

    # 2. Repartition on the key columns and cache the result so the actions
    #    below do not recompute the input.
    df_cached = df.repartition(200, *key_cols).cache()

    try:
        # 3. Execution: calculate total vs. distinct counts.
        total_count = df_cached.count()
        print(f"Total count: {total_count}")

        distinct_count = df_cached.select(key_cols).distinct().count()
        print(f"Distinct count (composite key): {distinct_count}")

        duplicate_count = total_count - distinct_count
        print(f"Duplicate records: {duplicate_count}")

        # 4. Investigation: list the keys that occur more than once.
        if duplicate_count > 0:
            print("\nRecords to investigate (duplicates based on composite key):")
            duplicates = df_cached.groupBy(key_cols) \
                .count() \
                .filter(F.col("count") > 1) \
                .orderBy(F.desc("count")) \
                .limit(50)  # Limit results

            duplicates.show(20, truncate=False)
        else:
            print("\nNo duplicate records found!")
    finally:
        # 5. Resource Management: release the cache whether or not the audit
        #    succeeded, so a failed action does not leave the data pinned.
        df_cached.unpersist(blocking=True)

    print("Record count check complete!")

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 10, Finished, Available, Finished)

In [9]:
# Function to remove specific known anomalies from the geolocation dataset
def clean_dirty_data(df):
    """
    Drops a fixed list of (zip code prefix, state) combinations that are
    treated as 'dirty data' in the geolocation dataset.

    Args:
        df (pyspark.sql.DataFrame): The input geolocation DataFrame.

    Returns:
        pyspark.sql.DataFrame: A filtered DataFrame with the listed combinations removed.

    NOTE(review): rows with a NULL zip code prefix or state may also be
    excluded when the negated predicate evaluates to NULL - confirm whether
    that is intended.
    """

    print("In clean_dirty_data ...")

    # Known-bad (zip code prefix, state) pairs.
    dirty_pairs = (
        ("80630", "SC"),
        ("72915", "DF"),
        ("78557", "RO"),
        ("02116", "RN"),
        ("23056", "AC"),
        ("04011", "AC"),
        ("79750", "RS"),
        ("21550", "AC"),
    )

    zip_col = df.geolocation_zip_code_prefix
    state_col = df.geolocation_state

    # OR together one "zip AND state" match per pair, in list order.
    is_dirty = None
    for bad_zip, bad_state in dirty_pairs:
        pair_match = (zip_col == bad_zip) & (state_col == bad_state)
        is_dirty = pair_match if is_dirty is None else (is_dirty | pair_match)

    # The tilde (~) negates the predicate: keep only rows that match no pair.
    cleaned = df.filter(~is_dirty)

    print("Cleaning script executed ! ")

    return cleaned


StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 11, Finished, Available, Finished)

In [10]:
# Function to identify and display records missing critical geographic identifiers
def check_null_columns(df):
    """
    Reports rows whose zip code prefix or state is missing.

    Prints the number of rows where ``geolocation_zip_code_prefix`` or
    ``geolocation_state`` is null and, if there are any, shows a sample of
    those rows.

    Args:
        df (pyspark.sql.DataFrame): The input geolocation DataFrame.
    """

    print("In check_null_columns ...")

    # A row is incomplete when either key column is null.
    zip_missing = F.col("geolocation_zip_code_prefix").isNull()
    state_missing = F.col("geolocation_state").isNull()
    null_df = df.filter(zip_missing | state_missing)

    # Volume of incomplete data.
    null_count = null_df.count()
    print(f"Records with null columns: {null_count}")

    # Nothing to inspect: skip the extra .show() action.
    if null_count <= 0:
        return

    null_df.show()

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 12, Finished, Available, Finished)

In [11]:
# Function to align zip code prefixes with official municipality names
def apply_zipcode_city_standardization(df):
    """
    Standardizes city names for specific zip code prefixes where the raw data
    contains district names or variations instead of the official municipality.

    Every zip code prefix listed in the correction table has its
    ``geolocation_city`` replaced by the mapped name; rows whose prefix is not
    listed (or is null) keep their original city.

    The corrections are compiled into ONE flat CASE WHEN expression
    (``when(...).when(...)...otherwise(original)``). Nesting a separate
    ``when(...).otherwise(previous)`` per entry would produce an expression
    tree hundreds of levels deep, which is expensive for Spark to plan.
    Because each branch tests equality against a distinct prefix, the
    branches are mutually exclusive and their order does not affect the result.

    Args:
        df (pyspark.sql.DataFrame): Input DataFrame containing geolocation keys
            (``geolocation_zip_code_prefix`` and ``geolocation_city``).

    Returns:
        pyspark.sql.DataFrame: DataFrame with corrected city names.
    """

    print("In apply_zipcode_city_standardization ...")

    # Dictionary of specific Zip-to-City corrections
    # Key: Zip Code Prefix | Value: Official Municipality Name
    zipcode_corrections = {
        '45936': 'mucuri',
        '65935': 'senador la rocque',
        '09780': 'sao bernardo do campo',
        '13318': 'cabreuva',  # Jacaré is a district of Cabreúva
        '13855': 'mogi guacu',
        '17970': 'sao joao do pau dalho',
        '25936': 'mage',  # Guia de Pacobaíba and Inhomirim are districts of Magé
        '27163': 'barra do pirai',
        '28950': 'armacao dos buzios',
        '28993': 'saquarema',
        '35315': 'caratinga',  # Santa Luzia de Caratinga is a district
        '36206': 'mantiqueira',
        '38295': 'limeira do oeste',
        '38749': 'patrocinio',
        '42820': 'camacari',  # Monte Gordo is a district
        '42841': 'camacari',  # Abrantes is a district
        '45816': 'porto seguro',  # Arraial d'Ajuda is a district
        '72270': 'brasilia',  # Ceilândia is an administrative region
        '85139': 'guarapuava',
        '87365': 'quarto centenario',
        '01307': 'sao paulo',
        '03203': 'sao paulo',
        '04004': 'sao paulo',
        '04132': 'sao paulo',
        '04346': 'sao paulo',
        '05026': 'sao paulo',
        '05372': 'sao paulo',
        '05854': 'sao paulo',
        '06050': 'osasco',
        '06803': 'embu das artes',
        '06804': 'embu das artes',
        '06805': 'embu das artes',
        '06806': 'embu das artes',
        '06807': 'embu das artes',
        '06810': 'embu das artes',
        '06813': 'embu das artes',
        '06814': 'embu das artes',
        '06815': 'embu das artes',
        '06816': 'embu das artes',
        '06817': 'embu das artes',
        '06820': 'embu das artes',
        '06823': 'embu das artes',
        '06824': 'embu das artes',
        '06825': 'embu das artes',
        '06826': 'embu das artes',
        '06833': 'embu das artes',
        '06835': 'embu das artes',
        '06840': 'embu das artes',
        '06843': 'embu das artes',
        '06844': 'embu das artes',
        '06845': 'embu das artes',
        '06846': 'embu das artes',
        '06900': 'embu guacu',
        '07097': 'guarulhos',
        '07141': 'guarulhos',
        '07174': 'guarulhos',
        '07770': 'cajamar',
        '07776': 'cajamar',
        '07786': 'cajamar',
        '07787': 'cajamar',
        '07790': 'cajamar',
        '07792': 'cajamar',
        '07793': 'cajamar',
        '08120': 'sao paulo',
        '08543': 'ferraz de vasconcelos',
        '08730': 'mogi das cruzes',
        '08940': 'biritiba mirim',
        '09540': 'sao caetano do sul',
        '11610': 'sao sebastiao',
        '11612': 'sao sebastiao',
        '11619': 'sao sebastiao',
        '11621': 'sao sebastiao',
        '11623': 'sao sebastiao',
        '11626': 'sao sebastiao',
        '11628': 'sao sebastiao',
        '12140': 'sao luis do paraitinga',
        '12249': 'sao jose dos campos',
        '13380': 'nova odessa',
        '13450': 'santa barbara doeste',
        '13451': 'santa barbara doeste',
        '13453': 'santa barbara doeste',
        '13454': 'santa barbara doeste',
        '13455': 'santa barbara doeste',
        '13456': 'santa barbara doeste',
        '13457': 'santa barbara doeste',
        '13458': 'santa barbara doeste',
        '13800': 'mogi mirim',
        '13801': 'mogi mirim',
        '13802': 'mogi mirim',
        '13803': 'mogi mirim',
        '13805': 'mogi mirim',
        '13806': 'mogi mirim',
        '13807': 'mogi mirim',
        '13808': 'mogi mirim',
        '13820': 'jaguariuna',
        '13840': 'mogi guacu',
        '13841': 'mogi guacu',
        '13842': 'mogi guacu',
        '13843': 'mogi guacu',
        '13844': 'mogi guacu',
        '13845': 'mogi guacu',
        '13846': 'mogi guacu',
        '13847': 'mogi guacu',
        '13848': 'mogi guacu',
        '13849': 'mogi guacu',
        '13910': 'monte alegre do sul',
        '14110': 'ribeirao preto',
        '14407': 'franca',
        '14760': 'pitangueiras',
        '15650': 'estrela doeste',
        '15720': 'palmeira doeste',
        '15735': 'aparecida doeste',
        '15780': 'santa rita doeste',
        '15785': 'santa clara doeste',
        '15953': 'santa adelia',
        '17123': 'agudos',
        '17220': 'jau',
        '17405': 'garca',
        '18240': 'angatuba',
        '18271': 'tatui',
        '18618': 'botucatu',
        '18725': 'paranapanema',
        '18980': 'chavantes',
        '19120': 'presidente prudente',
        '19274': 'rosana',
        '19845': 'maracai',
        '19870': 'florinia',
        '19882': 'candido mota',
        '21032': 'rio de janeiro',
        '21341': 'rio de janeiro',
        '22260': 'rio de janeiro',
        '23073': 'rio de janeiro',
        '23870': 'mangaratiba',
        '23880': 'mangaratiba',
        '23885': 'mangaratiba',
        '23895': 'seropedica',
        '23968': 'angra dos reis',
        '23970': 'paraty',
        '24900': 'marica',
        '24913': 'marica',
        '24914': 'marica',
        '25860': 'paraiba do sul',
        '25882': 'sapucaia',
        '25887': 'sapucaia',
        '25912': 'mage',
        '25920': 'mage',
        '25930': 'mage',
        '25931': 'mage',
        '25935': 'mage',
        '26520': 'nilopolis',
        '26660': 'engenheiro paulo de frontin',
        '26910': 'miguel pereira',
        '26980': 'paty do alferes',
        '27155': 'barra do pirai',
        '27165': 'barra do pirai',
        '27475': 'rio claro',
        '27555': 'resende',
        '27598': 'itatiaia',
        '27650': 'valenca',
        '27655': 'valenca',
        '27657': 'valenca',
        '27770': 'vassouras',
        '27987': 'macae',
        '29132': 'viana',
        '29187': 'fundao',
        '35348': 'pingodagua',
        '37200': 'lavras',
        '37530': 'brazopolis',
        '42850': 'dias davila',
        '45195': 'planalto',
        '45470': 'jiquirica',
        '45625': 'barro preto',
        '46835': 'nova redencao',
        '47115': 'muquem de sao francisco',
        '47220': 'campo alegre de lourdes',
        '47310': 'casa nova',
        '48355': 'itamira',
        '48610': 'gloria',
        '54400': 'jaboatao dos guararapes',
        '54590': 'cabo de santo agostinho',
        '54749': 'sao lourenco da mata',
        '55473': 'panelas',
        '55485': 'jurema',
        '55735': 'bom jardim',
        '56440': 'belem do sao francisco',
        '56820': 'caraiba',
        '56828': 'quixaba',
        '57246': 'sao miguel dos campos',
        '57258': 'campo alegre',
        '57319': 'pau darco',
        '59730': 'olhodagua do borges',
        '61962': 'maranguape',
        '62502': 'itapipoca',
        '62597': 'cruz',
        '62800': 'aracati',
        '62852': 'cascavel',
        '63765': 'sucesso',
        '65706': 'olho dagua das cunhas',
        '65760': 'presidente dutra',
        '65943': 'grajau',
        '68617': 'cachoeira de piria',
        '69555': 'tefe',
        '73330': 'brasilia',
        '73360': 'brasilia',
        '73370': 'brasilia',
        '73750': 'planaltina de goias',
        '73752': 'planaltina de goias',
        '73760': 'sao joao dalianca',
        '75260': 'senador canedo',
        '75390': 'trindade',
        '75570': 'bom jesus de goias',
        '75893': 'sao simao',
        '75914': 'riverlandia',
        '76310': 'rianapolis',
        '76385': 'goianesia',
        '76840': 'porto velho',
        '76842': 'porto velho',
        '76846': 'porto velho',
        '76847': 'porto velho',
        '76850': 'guajara mirim',
        '76868': 'machadinho doeste',
        '76930': 'alvorada doeste',
        '76952': 'alto alegre dos parecis',
        '76954': 'alta floresta do oeste',
        '76958': 'nova brasilandia doeste',
        '76963': 'jiparana',
        '76970': 'pimenta bueno',
        '76990': 'nova brasilandia doeste',
        '78245': 'vila bela da santissima trindade',
        '78278': 'lambari doeste',
        '78280': 'mirassol doeste',
        '78290': 'figueiropolis doeste',
        '78402': 'diamantino',
        '78678': 'ribeirao cascalheira',
        '78816': 'juscimeira',
        '79760': 'bataipora',
        '81470': 'curitiba',
        '83404': 'colombo',
        '83810': 'mandirituba',
        '85575': 'sao jorge doeste',
        '85580': 'itapejara doeste',
        '85794': 'capitao leonidas marques',
        '85896': 'diamante doeste',
        '86818': 'apucarana',
        '86819': 'apucarana',
        '87145': 'paicandu',
        '87214': 'cianorte',
        '87395': 'rancho alegre doeste',
        '87895': 'terra rica',
        '88058': 'florianopolis',
        '88061': 'florianopolis',
        '88165': 'biguacu',
        '88330': 'balneario camboriu',
        '88370': 'navegantes',
        '88371': 'navegantes',
        '88380': 'balneario picarras',
        '88509': 'lages',
        '88835': 'morro da fumaca',
        '88915': 'maracaja',
        '89115': 'luiz alves',
        '89143': 'ibirama',
        '89294': 'campo alegre',
        '89610': 'herval doeste',
        '91130': 'porto alegre',
        '95181': 'farroupilha',
        '95272': 'flores da cunha',
        '96222': 'rio grande',
        '96859': 'santa cruz do sul',
        '97538': 'barra do quarai',
        '28026': 'campos dos goytacazes',
        '28110': 'campos dos goytacazes',
        '28140': 'campos dos goytacazes',
        '28145': 'campos dos goytacazes',
        '28175': 'campos dos goytacazes',
        '28333': 'itaperuna',
        '28348': 'itaperuna',
        '28450': 'cambuci',
        '28557': 'sao sebastiao do alto',
        '28595': 'itaocara',
        '28610': 'nova friburgo',
        '28685': 'cachoeiras de macacu',
        '28695': 'cachoeiras de macacu',
        '28750': 'trajano de moraes',
        '28880': 'casimiro de abreu',
        '28927': 'cabo frio',
        '28980': 'araruama',
        '28994': 'saquarema',
        '28997': 'saquarema',
        '29273': 'domingos martins',
        '29278': 'domingos martins',
        '29321': 'cachoeiro de itapemirim',
        '29338': 'itapemirim',
        '29375': 'venda nova do imigrante',
        '29660': 'santa teresa',
        '29665': 'sao roque do canaa',
        '29755': 'pancas',
        '29800': 'barra de sao francisco',
        '29901': 'linhares',
        '29967': 'conceicao da barra',
        '31580': 'belo horizonte',
        '31610': 'belo horizonte',
        '32260': 'contagem',
        '34740': 'sabara',
        '34950': 'caete',
        '35108': 'governador valadares',
        '35147': 'naquenanuque',
        '35196': 'belo oriente',
        '35222': 'itueta',
        '35243': 'conselheiro pena',
        '35245': 'ferruginha',
        '35280': 'itabirinha',
        '35362': 'sao pedro dos ferros',
        '35365': 'abre campo',
        '35367': 'matipo',
        '35409': 'ouro preto',
        '35410': 'ouro preto',
        '35411': 'ouro preto',
        '35412': 'ouro preto',
        '35413': 'ouro preto',
        '35418': 'ouro preto',
        '35435': 'rosario do pontal',
        '35464': 'brumadinho',
        '35541': 'oliveira',
        '35672': 'mateus leme',
        '35698': 'antunes',
        '35905': 'itabira',
        '35975': 'barao de cocais',
        '36207': 'pinheiro grosso',
        '36264': 'vitorinos',
        '36315': 'sao joao del rei',
        '36490': 'piranga',
        '36574': 'vicosa',
        '36576': 'vicosa',
        '36846': 'tombos',
        '36905': 'manhuacu',
        '36908': 'manhuacu',
        '37136': 'alfenas',
        '37556': 'pouso alegre',
        '37557': 'pouso alegre',
        '37558': 'pouso alegre',
        '37559': 'pouso alegre',
        '37560': 'pouso alegre',
        '37653': 'camanducaia',
        '37723': 'botelhos',
        '37925': 'piumhi',
        '38106': 'uberaba',
        '38439': 'martinesia',
        '38738': 'brejo bonito',
        '38773': 'joao pinheiro',
        '38775': 'joao pinheiro',
        '38845': 'carmo do paranaiba',
        '39120': 'gouveia',
        '39398': 'olhos dagua',
        '39414': 'montes claros',
        '39445': 'janauba',
        '39526': 'catuti',
        '39602': 'aracuai',
        '39809': 'mucuri',
        '39862': 'nanuque',
        '42835': 'camacari',
        '42840': 'camacari',
        '43843': 'candeias',
        '44590': 'santa terezinha',
        '45818': 'porto seguro',
        '45928': 'nova vicosa',
        '45990': 'teixeira de freitas',
        '53900': 'itamaraca',
        '54420': 'jaboatao dos guararapes',
        '57010': 'maceio',
        '57442': 'olho dagua das flores',
        '58100': 'campina grande',
        '58101': 'campina grande',
        '58102': 'campina grande',
        '58103': 'campina grande',
        '58108': 'campina grande',
        '58304': 'santa rita',
        '58428': 'campina grande',
        '58429': 'campina grande',
        '58430': 'campina grande',
        '58432': 'campina grande',
        '58433': 'campina grande',
        '58434': 'campina grande',
        '58441': 'campina grande',
        '58463': 'santa cecilia',
        '59179': 'tibau do sul',
        '59570': 'ceara mirim',
        '59585': 'sao miguel de touros',
        '62600': 'itapaje',
        '65370': 'pindare mirim',
        '65485': 'itapecuru mirim',
        '68129': 'mojui dos campos',
        '68275': 'oriximina',
        '68447': 'barcarena',
        '68448': 'barcarena',
        '68516': 'parauapebas',
        '68545': 'pau darco',
        '68682': 'tomeacu',
        '68695': 'tailandia',
        '68792': 'santa isabel do para',
        '68945': 'pedra branca do amapari',
        '69919': 'rio branco',
        '69921': 'rio branco',
        '70645': 'brasilia',
        '70648': 'brasilia',
        '70650': 'brasilia',
        '70655': 'brasilia',
        '70658': 'brasilia',
        '70660': 'brasilia',
        '70670': 'brasilia',
        '70673': 'brasilia',
        '70675': 'brasilia',
        '71010': 'brasilia',
        '71015': 'brasilia',
        '71020': 'brasilia',
        '71050': 'brasilia',
        '71065': 'brasilia',
        '71070': 'brasilia',
        '71200': 'brasilia',
        '71503': 'brasilia',
        '71505': 'brasilia',
        '71510': 'brasilia',
        '71515': 'brasilia',
        '71535': 'brasilia',
        '71571': 'brasilia',
        '71596': 'brasilia',
        '71600': 'brasilia',
        '71605': 'brasilia',
        '71615': 'brasilia',
        '71620': 'brasilia',
        '71625': 'brasilia',
        '71635': 'brasilia',
        '71640': 'brasilia',
        '71645': 'brasilia',
        '71660': 'brasilia',
        '71665': 'brasilia',
        '71670': 'brasilia',
        '71675': 'brasilia',
        '71680': 'brasilia',
        '71692': 'brasilia',
        '71693': 'brasilia',
        '71705': 'brasilia',
        '71720': 'brasilia',
        '71725': 'brasilia',
        '71727': 'brasilia',
        '71736': 'brasilia',
        '71741': 'brasilia',
        '71745': 'brasilia',
        '71805': 'brasilia',
        '71825': 'brasilia',
        '71880': 'brasilia',
        '71901': 'brasilia',
        '71907': 'brasilia',
        '71908': 'brasilia',
        '71909': 'brasilia',
        '71915': 'brasilia',
        '71917': 'brasilia',
        '71925': 'brasilia',
        '71926': 'brasilia',
        '71927': 'brasilia',
        '71937': 'brasilia',
        '71938': 'brasilia',
        '71939': 'brasilia',
        '71950': 'brasilia',
        '71955': 'brasilia',
        '72007': 'brasilia',
        '72010': 'brasilia',
        '72015': 'brasilia',
        '72025': 'brasilia',
        '72110': 'brasilia',
        '72115': 'brasilia',
        '72120': 'brasilia',
        '72125': 'brasilia',
        '72130': 'brasilia',
        '72140': 'brasilia',
        '72146': 'brasilia',
        '72150': 'brasilia',
        '72152': 'brasilia',
        '72155': 'brasilia',
        '72210': 'brasilia',
        '72215': 'brasilia',
        '72220': 'brasilia',
        '72225': 'brasilia',
        '72231': 'brasilia',
        '72233': 'brasilia',
        '72235': 'brasilia',
        '72240': 'brasilia',
        '72241': 'brasilia',
        '72251': 'brasilia',
        '72301': 'brasilia',
        '72302': 'brasilia',
        '72304': 'brasilia',
        '72306': 'brasilia',
        '72308': 'brasilia',
        '72312': 'brasilia',
        '72318': 'brasilia',
        '72319': 'brasilia',
        '72320': 'brasilia',
        '72327': 'brasilia',
        '72329': 'brasilia',
        '72331': 'brasilia',
        '72335': 'brasilia',
        '72410': 'brasilia',
        '72415': 'brasilia',
        '72420': 'brasilia',
        '72445': 'brasilia',
        '72501': 'brasilia',
        '72504': 'brasilia',
        '72506': 'brasilia',
        '72507': 'brasilia',
        '72542': 'brasilia',
        '72543': 'brasilia',
        '72546': 'brasilia',
        '72547': 'brasilia',
        '72592': 'brasilia',
        '72593': 'brasilia',
        '72600': 'brasilia',
        '72601': 'brasilia',
        '72620': 'brasilia',
        '72710': 'brasilia',
        '72715': 'brasilia',
        '72725': 'brasilia',
        '72726': 'brasilia',
        '73005': 'brasilia',
        '73010': 'brasilia',
        '73015': 'brasilia',
        '73025': 'brasilia',
        '73030': 'brasilia',
        '73035': 'brasilia',
        '73040': 'brasilia',
        '73045': 'brasilia',
        '73050': 'brasilia',
        '73060': 'brasilia',
        '73080': 'brasilia',
        '73092': 'brasilia',
        '73100': 'brasilia',
        '73105': 'brasilia',
        '73252': 'brasilia'
    }

    zip_col = col("geolocation_zip_code_prefix")
    original_city = col("geolocation_city")

    # Build a single flat CASE WHEN: the first entry opens the expression with
    # when(...), every following entry appends another branch via Column.when.
    case_expr = None
    for zipcode, correct_city in zipcode_corrections.items():
        matches_zip = zip_col == zipcode
        if case_expr is None:
            case_expr = when(matches_zip, correct_city)
        else:
            case_expr = case_expr.when(matches_zip, correct_city)

    # Rows matching no listed prefix keep their original city name.
    if case_expr is None:
        city_col = original_city
    else:
        city_col = case_expr.otherwise(original_city)

    df_corrected = df.withColumn("geolocation_city", city_col)

    return df_corrected

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 13, Finished, Available, Finished)

In [12]:
# Function to identify data quality issues where one Zip Code maps to multiple City names
def check_city_name_variances(df):
    """
    Audits geographic naming consistency. Finds zip code prefixes that are linked
    to more than one distinct city name, prints how many such prefixes exist, and
    displays the conflicting name variations for the worst offenders.

    Args:
        df (pyspark.sql.DataFrame): The geolocation DataFrame to audit. It must
            contain the columns 'geolocation_zip_code_prefix' and
            'geolocation_city'.

    Returns:
        None. Results are printed / shown only; no DataFrame is returned.
    """

    print("In check_city_name_variances ...")

    # 1. Performance Optimization:
    # Repartitioning by zip prefix co-locates all rows of the same prefix in one
    # partition. The result is cached because it is scanned twice below
    # (once for the aggregation, once for the join).
    df_repartitioned = df.repartition(200, "geolocation_zip_code_prefix").cache()

    # Declared up-front so the 'finally' block can tell whether it was ever created.
    conflicting_zips = None

    try:
        # 2. Step 1: Identification
        # Keeps only the zip prefixes whose 'variation_count' is greater than 1.
        conflicting_zips = df_repartitioned.groupBy("geolocation_zip_code_prefix") \
            .agg(F.countDistinct("geolocation_city").alias("variation_count")) \
            .filter(F.col("variation_count") > 1) \
            .cache()  # Cache this small result for the join

        # count() is an action, so it also materializes the cache.
        conflict_count = conflicting_zips.count()
        print(f"Found {conflict_count} zip codes with city name conflicts")

        # 3. Step 2: Detail Retrieval
        # Inner join restricts the main data to the prefixes identified in Step 1.
        # collect_set() gathers the unique city names; array_sort() makes the
        # listing order stable. At most 50 prefixes are kept, worst first.
        zip_conflicts = df_repartitioned.join(
            conflicting_zips,
            "geolocation_zip_code_prefix",
            "inner"
        ) \
        .groupBy("geolocation_zip_code_prefix", "variation_count") \
        .agg(F.array_sort(F.collect_set("geolocation_city")).alias("city_variations")) \
        .orderBy(F.desc("variation_count")) \
        .limit(50)

        # 4. Result Visualization (only the first 20 of those rows are printed)
        print("Zip codes with multiple city name variations:")
        zip_conflicts.show(20, truncate=False)
    finally:
        # 5. Memory Management:
        # Always release the cached data, even when one of the actions above
        # fails, so a failed audit does not keep memory pinned for the
        # load/write steps that follow in the same session.
        if conflicting_zips is not None:
            conflicting_zips.unpersist(blocking=True)
        df_repartitioned.unpersist(blocking=True)

    print("City variance check complete!")

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 14, Finished, Available, Finished)

In [13]:
# ==============================================================================
# GEOLOCATION SILVER PIPELINE EXECUTION
# ==============================================================================
from_table_name = 'BronzeLakeHouse.dbo.geolocation_bronze'
to_table_name = 'geolocation_silver'

# Step 1 - Extraction from the source table.
df = extract_table(from_table_name)

# Step 2 - Null audit on the extracted rows, then manual filtering of dirty data.
# The filtered result is checkpointed right away: this writes the current state
# to the configured checkpoint directory and truncates the lineage, keeping the
# Spark query plan small for the steps that follow.
check_null_columns(df)
df = clean_dirty_data(df).checkpoint()

# Step 3 - Text and geographic standardization (city clean-up first, then the
# zip-code based correction), followed by a second lineage break ahead of the
# heavy aggregation logic.
df = apply_zipcode_city_standardization(clean_geolocation_city(df)).checkpoint()

# Step 4 - Post-cleaning audit: confirms whether the standardization merged the
# city name variations.
check_city_name_variances(df)

# Step 5 - Consolidation (aggregation): finds the mean Lat/Lng and drops duplicates.
df = group_consolidate_data(df)

# Step 6 - Final integrity audit: verifies the key set is clean and unique
# before anything is written to Silver.
check_records_count(df)

# Step 7 - Load into the target table.
load_df_to_delta(df, to_table_name)

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 15, Finished, Available, Finished)

In check_null_columns ...
Records with null columns: 0
In clean_dirty_data ...
Cleaning script executed ! 
In clean_geolocation_city ...
In apply_zipcode_city_standardization ...
In check_city_name_variances ...
Found 0 zip codes with city name conflicts
Zip codes with multiple city name variations:
+---------------------------+---------------+---------------+
|geolocation_zip_code_prefix|variation_count|city_variations|
+---------------------------+---------------+---------------+
+---------------------------+---------------+---------------+

City variance check complete!
In group_consolidate_data ...
In check_records_count ...
Total count: 19015
Distinct count (composite key): 19015
Duplicate records: 0

No duplicate records found!
Record count check complete!
In load_df_to_delta ...
Table 'geolocation_silver' loaded successfully !


In [14]:
# ==============================================================================
# RESOURCE CLEANUP & SESSION TERMINATION
# ==============================================================================

# Folder holding the temporary checkpoint files to purge at the end of the run.
checkpoint_dir = "Files/checkpoints/"

# Free the final DataFrame first, then drop whatever else is still held in the
# Spark cache.
df.unpersist()
spark.catalog.clearCache()

# Delete the checkpoint folder to reclaim storage; passing True removes the
# folder together with all of its contents.
mssparkutils.fs.rm(checkpoint_dir, True)

# Shut the Spark session down so its compute resources are released.
mssparkutils.session.stop()

StatementMeta(, 9b8a09ed-c18f-4da2-b607-66d6efb5db58, 16, Finished, Available, Finished)