# **Resolve Typographical Errors in Customer Data**
- **Purpose:** Perform controlled corrections for city, email, and name values, preserving flags for ambiguous or low‑confidence matches.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, lower, upper, when, regexp_replace, split, concat_ws, initcap
)

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 3, Finished, Available, Finished)

In [2]:
# -------------------------------------------------------------------
# Paths (adapt if yours differ)
# -------------------------------------------------------------------
SRC_PATH = "Files/csv/customers/CustomersLocation.csv"  # full dataset you want to clean
DETECTED_PATH = "Files/csv/customers/typos/CustomersLocation_typos.csv"  # your lookup
OUT_PATH = "Files/csv/customers/clean/CustomersLocation_clean.csv"       # final output

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 4, Finished, Available, Finished)

In [3]:
# -------------------------------------------------------------------
# 1) Read datasets
# -------------------------------------------------------------------
df_full = spark.read.format("csv").option("header", "true").load(SRC_PATH)
df_detected = spark.read.format("csv").option("header", "true").load(DETECTED_PATH)

# Robust join on id (ensure same type)
df_full = df_full.withColumn("id", col("id").cast("string"))
df_detected = df_detected.withColumn("id", col("id").cast("string"))

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 5, Finished, Available, Finished)

In [4]:
# -------------------------------------------------------------------
# 2) Select only the columns we need from detected-typos
#    (what your file already has per screenshot)
# -------------------------------------------------------------------
det_cols = [c for c in df_detected.columns]
use_cols = ["id"]
# Pick available controls; only those present will be used
for c in ["invalid_email", "invalid_state", "city_suspect", "best_city"]:
    if c in det_cols:
        use_cols.append(c)

lookup = df_detected.select(*use_cols)

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 6, Finished, Available, Finished)

In [5]:
# -------------------------------------------------------------------
# 3) Join full dataset with lookup
# -------------------------------------------------------------------
df = df_full.join(lookup, on="id", how="left")

# Helper: mark rows that are present in lookup (we will only touch these)
lookup_present = (
    (col("invalid_email").isNotNull()) |
    (col("invalid_state").isNotNull()) |
    (col("city_suspect").isNotNull()) |
    (col("best_city").isNotNull())
)

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 7, Finished, Available, Finished)

In [6]:
# -------------------------------------------------------------------
# 4) City fix — ONLY when lookup says city_suspect == true
# -------------------------------------------------------------------
city_fixed = when(
    lookup_present & (col("city_suspect") == lit(True)) & col("best_city").isNotNull(),
    col("best_city")
).otherwise(col("city"))

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 8, Finished, Available, Finished)

In [7]:
# -------------------------------------------------------------------
# 5) Email fixes — ONLY when lookup says invalid_email == true
#    Keep scope minimal (no rebuild unless you want it)
# -------------------------------------------------------------------
email_col = col("email")
email_fixed = when(
    lookup_present & (col("invalid_email") == lit(True)) & email_col.isNotNull(),
    regexp_replace(email_col, "@exampl\\.com$", "@example.com")  # fix @exampl.com
).otherwise(email_col)

email_fixed = when(
    lookup_present & (col("invalid_email") == lit(True)) & email_fixed.rlike(r"@example$"),
    regexp_replace(email_fixed, r"@example$", "@example.com")    # add .com if missing
).otherwise(email_fixed)

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 9, Finished, Available, Finished)

In [8]:
# -------------------------------------------------------------------
# 6) Targeted name fixes — ONLY for rows present in lookup
#    (Small deterministic map; extend as needed)
# -------------------------------------------------------------------
from pyspark.sql.functions import create_map, lit, when, col, element_at

first_name_map = {"Jhon": "John", "Srah": "Sarah"}

kv = []
for k, v in first_name_map.items():
    kv += [lit(k), lit(v)]
map_expr = create_map(*kv)

mapped_value = element_at(map_expr, col("first_name"))
first_name_fixed = when(lookup_present & mapped_value.isNotNull(), mapped_value).otherwise(col("first_name"))
last_name_fixed = when(lookup_present & mapped_value.isNotNull(), mapped_value).otherwise(col("last_name"))



StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 10, Finished, Available, Finished)

In [9]:
# -------------------------------------------------------------------
# 7) State — only upper-case when lookup flags invalid_state == true (optional)
# -------------------------------------------------------------------
state_fixed = when(
    lookup_present & (col("invalid_state") == lit(True)) & col("state").isNotNull(),
    upper(col("state"))
).otherwise(col("state"))

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 11, Finished, Available, Finished)

In [10]:
# -------------------------------------------------------------------
# 8) Build final cleaned DataFrame (return full dataset)
# -------------------------------------------------------------------
df_clean = df.select(
    col("id"),
    first_name_fixed.alias("first_name"),
    last_name_fixed.alias("last_name"),
    email_fixed.alias("email"),
    city_fixed.alias("city"),
    state_fixed.alias("state")
    # include any other columns from df_full here if your dataset has more
)

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 12, Finished, Available, Finished)

In [11]:

from pyspark.sql.functions import (
    col, lower, trim, regexp_replace, split, concat, lit, coalesce, when
)

# 1) Build the expected local-part from first/last name (lowercase, trimmed, spaces removed)
first_clean = lower(regexp_replace(trim(col("first_name")), r"\s+", ""))  # e.g., "John " → "john"
last_clean  = lower(regexp_replace(trim(col("last_name")),  r"\s+", ""))  # e.g., " Walker" → "walker"
expected_local = concat(first_clean, lit("."), last_clean)

# 2) Extract & normalize the domain from the current email
email_original = col("email")
domain_raw     = split(email_original, "@").getItem(1)

# Default domain when missing; fix common typos
domain_fixed = coalesce(domain_raw, lit("example.com"))
domain_fixed = regexp_replace(domain_fixed, r"(?i)^exampl\.com$", "example.com")  # exampl.com → example.com
domain_fixed = regexp_replace(domain_fixed, r"(?i)^example$", "example.com")      # example → example.com

# 3) Current local-part
actual_local = lower(split(email_original, "@").getItem(0))

# 4) Fix ONLY rows where local-part does NOT equal the expected "first.last"
email_fixed = when(
    actual_local.isNull() | (actual_local != expected_local),
    concat(expected_local, lit("@"), domain_fixed)
).otherwise(email_original)

# 5) Return the entire dataset with corrected emails (other columns untouched)
df_clean_fixed = df_clean.select(
    col("id"),
    col("first_name"),
    col("last_name"),
    email_fixed.alias("email"),
    col("city"),
    col("state")
    # include any additional columns your df_clean has
)


StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 13, Finished, Available, Finished)

In [12]:
df_clean_fixed.filter(col("id").isin(21, 22, 24)).show(truncate=True)

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 14, Finished, Available, Finished)

+---+----------+---------+--------------------+--------+-----+
| id|first_name|last_name|               email|    city|state|
+---+----------+---------+--------------------+--------+-----+
| 21|      John|     John|john.john@example...|New York|   NY|
| 22|     Sarah|    Sarah|sarah.sarah@examp...|   Miami|   FL|
| 24|     Nancy|    Young|nancy.young@examp...| Detroit|   MI|
+---+----------+---------+--------------------+--------+-----+



In [13]:
df_clean_fixed.write.format("delta").mode("overwrite").saveAsTable("customerlocation")

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 15, Finished, Available, Finished)

In [14]:
spark.table("customerlocation").show(50)

StatementMeta(, 764a758d-419d-4076-8a23-01071924c09a, 16, Finished, Available, Finished)

+---+-----------+---------+--------------------+--------------+-----+
| id| first_name|last_name|               email|          city|state|
+---+-----------+---------+--------------------+--------------+-----+
|  1|       John|    Smith|john.smith@exampl...|      New York|   NY|
|  2|      Sarah|   Connor|sarah.connor@exam...|   Los Angeles|   CA|
|  3|    Michael|    Brown|michael.brown@exa...|       Chicago|   IL|
|  4|      Emily|    Davis|emily.davis@examp...|       Houston|   TX|
|  5|      David|   Wilson|david.wilson@exam...|       Phoenix|   AZ|
|  6|      Linda|    Moore|linda.moore@examp...|  Philadelphia|   PA|
|  7|     Robert|   Taylor|robert.taylor@exa...|   San Antonio|   TX|
|  8|   Patricia| Anderson|patricia.anderson...|     San Diego|   CA|
|  9|    Charles|   Thomas|charles.thomas@ex...|        Dallas|   TX|
| 10|    Barbara|  Jackson|barbara.jackson@e...|      San Jose|   CA|
| 11|     Daniel|    White|daniel.white@exam...|        Austin|   TX|
| 12|   Jennifer|   