In [None]:
# Cell 1: Setup + Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, regexp_replace, col, coalesce, lit, udf
from pyspark.sql.types import StringType
from unidecode import unidecode
from utils.io_helpers import list_files_with_aliases, load_file_with_alias

import os

# Obtain the Spark session for this notebook under a descriptive app name.
spark = (
    SparkSession.builder
    .appName("Exercise 1 - Cleaning")
    .getOrCreate()
)


In [None]:
# Cell 2: Select format and folder
# Choose between "json" or "parquet"
data_type = "json"
folder = f"../data/input/practice/{data_type}"

# Mapping of alias -> file path for every file of the chosen type in `folder`.
aliases = list_files_with_aliases(folder, ext=data_type)
print("Available files:")
# The loop targets are deliberately NOT called `alias`: in a notebook every
# loop variable stays bound as a global after the loop, and `alias` is the
# name the load and save cells use for the selected file. Re-running this
# cell must not silently change which file the later cells refer to.
for file_alias, file_path in aliases.items():
    print(f"{file_alias}: {file_path}")


In [None]:
# Cell 3: Load data using alias
# Pick which listed file to work on (file1, file2, file3, ...).
alias = "file1"
df = load_file_with_alias(
    spark,
    folder,
    alias=alias,
    ext=data_type,
)
# Preview the loaded rows without truncating long cell values.
df.show(truncate=False)


In [None]:
# Cell 4: Cleaning logic
# Runs unidecode on the value; None and empty strings both come back as None.
remove_accents_udf = udf(lambda s: unidecode(s) if s else None, StringType())

# "nume": strip surrounding whitespace first, then remove accents.
df = df.withColumn("nume", trim(col("nume")))
df = df.withColumn("nume", remove_accents_udf("nume"))

# "ocupatie": collapse every whitespace run to a single space, then trim.
# The pattern is a raw string so that `\s` reaches the regex engine as-is
# instead of being treated as an (invalid) Python string escape, which
# triggers a DeprecationWarning/SyntaxWarning on current interpreters.
df = df.withColumn("ocupatie", trim(regexp_replace(col("ocupatie"), r"\s+", " ")))

def remove_duplicates(val):
    """Collapse a comma-separated string to its unique, sorted tokens.

    Each token is stripped of surrounding whitespace. Empty tokens (produced
    by doubled, leading or trailing commas) are dropped so the result never
    starts with a stray separator.

    Args:
        val: Comma-separated string, or None.

    Returns:
        The distinct non-empty tokens joined with ", " in ascending order, or
        None when ``val`` is None/empty or contains no non-empty token.
    """
    if not val:
        return None
    # The set de-duplicates; blank tokens are discarded because they would
    # otherwise sort first and surface as a leading ", " in the output.
    tokens = {part.strip() for part in val.split(',')}
    tokens.discard('')
    if not tokens:
        return None
    return ', '.join(sorted(tokens))

# Expose the Python de-duplication helper to Spark as a string-returning UDF.
remove_dupes_udf = udf(remove_duplicates, StringType())

df = (
    df
    # "extra": de-duplicate the comma-separated values.
    .withColumn("extra", remove_dupes_udf("extra"))
    # "inactiv": treat a missing flag as False.
    .withColumn("inactiv", coalesce(col("inactiv"), lit(False)))
)

# Preview the cleaned rows without truncating long cell values.
df.show(truncate=False)


In [None]:
# Cell 5: Save cleaned output
# Write in the same format as the input; anything other than parquet is
# written as JSON.
output_ext = "parquet" if data_type == "parquet" else "json"
output_path = f"../data/output/cleaned_{alias}.{output_ext}"

# Any previous output at the same path is replaced.
writer = df.write.mode("overwrite")
if output_ext != "json":
    writer.parquet(output_path)
else:
    writer.json(output_path)

print(f"✅ Saved cleaned output to: {output_path}")
