In [1]:
# Cell 1: Setup
# Puts the parent of the current working directory on the import path, imports the
# Spark entry point and the project I/O helpers, and starts a Spark session.
import sys, os
# os.path.join("..") is just "..", so this appends the absolute path of the parent
# directory. It must run before the `utils` import below -- presumably that is where
# the `utils` package lives (TODO confirm against the repo layout).
sys.path.append(os.path.abspath(os.path.join("..")))

from pyspark.sql import SparkSession
from utils.io_helpers import list_files_with_aliases, load_file_with_alias

# getOrCreate() reuses an already-running session if there is one, so re-running
# this cell does not start a second session.
spark = SparkSession.builder.appName("Exercise 1 - Cleaning").getOrCreate()
# Only log ERROR and above from here on to keep cell output readable.
spark.sparkContext.setLogLevel("ERROR")


/usr/local/lib/python3.9/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/25 14:15:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Cell 2: Select format and folder
# Choose between "json" or "parquet"
data_type = "json"
folder = f"../data/input/practice/{data_type}"

# Build the alias -> path mapping for the files of the chosen type in `folder`.
aliases = list_files_with_aliases(folder, ext=data_type)
print("Available files:")
# The loop names intentionally differ from the notebook-level `alias` that the load
# and save cells assign and read: a top-level `for alias, ...` would overwrite it,
# so re-running this cell later would silently change which name the save cell uses.
for file_alias, file_path in aliases.items():
    print(f"{file_alias}: {file_path}")


Available files:
file1: ../data/input/practice/json/part-00000-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
file2: ../data/input/practice/json/part-00001-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
file3: ../data/input/practice/json/part-00002-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json


In [4]:
# Cell 3: Load data using alias
# `alias` selects one entry from the listing printed by the previous cell; the save
# cell also reads it to build the output file name.
alias = "file1"  # Change to file2, file3, etc.
# Load the selected file through the project helper; the result is used as a Spark
# DataFrame by the cells below.
df = load_file_with_alias(spark, folder, alias=alias, ext=data_type)
# Print the leading rows without truncating long cell values.
df.show(truncate=False)


📥 Loading file: file1 → ../data/input/practice/json/part-00000-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
+----------------------------+-------+-----------+--------------------+------+-------+
|extra                       |inactiv|nume       |ocupatie            |varsta|vechime|
+----------------------------+-------+-----------+--------------------+------+-------+
|[PV, EV]                    |NULL   |Andrei     |Specialist marketing|38    |13     |
|[3D Printer, WII]           |NULL   |Alexandru  |Specialist HR       |34    |8      |
|[AC, EV, 5G Router]         |NULL   |Adrian     |Inginer civil       |45    |23     |
|[XBOX]                      |NULL   | Alin      |Vânzător  retail    |26    |2      |
|[5G Router, 3D Printer]     |NULL   |Anton      |Manager proiect     |40    |15     |
|[EV, AC]                    |NULL   |Ana        |Muncitor alimentar  |35    |9      |
|[PC, 3D Printer, AC]        |false  |Bogdan     |Farmacist           |50    |32     |
|NULL               

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def remove_duplicates(val):
    """Normalize a multi-valued field into a sorted, de-duplicated string.

    Args:
        val: A list of items, a comma-separated string, or any other scalar.
            Falsy input (None, "", [], 0, ...) is treated as missing.

    Returns:
        For a list or comma-separated string: the distinct, whitespace-stripped,
        non-empty items sorted and joined with ", ". Returns None when the input
        is falsy or when no non-empty item remains after cleaning.
        For any other truthy value: ``str(val)``.
    """
    if not val:
        return None

    if isinstance(val, list):
        raw_items = val
    elif isinstance(val, str):
        raw_items = val.split(',')
    else:
        return str(val)

    unique_items = set()
    for item in raw_items:
        # Arrays may contain null elements; calling .strip() on them would raise.
        if item is None:
            continue
        # str() lets non-string elements (e.g. numbers) go through the same path.
        text = str(item).strip()
        # Drop blanks such as the empty token produced by "a,,b" or a " " entry.
        if text:
            unique_items.add(text)

    if not unique_items:
        return None
    return ', '.join(sorted(unique_items))

# Expose the Python cleaner to Spark as a UDF that yields a string column,
# then replace the `extra` column with its cleaned form.
remove_dupes_udf = udf(remove_duplicates, returnType=StringType())
df = df.withColumn("extra", remove_dupes_udf("extra"))


In [6]:
# Cell 5: Save cleaned output
# Any input type other than parquet is written back out as JSON.
output_ext = "json" if data_type != "parquet" else "parquet"
output_path = f"../data/output/cleaned_{alias}.{output_ext}"

# `output_ext` is exactly "json" or "parquet", which are also the names of the
# matching writer methods, so look the method up by name. Existing output at the
# target path is overwritten.
getattr(df.write.mode("overwrite"), output_ext)(output_path)

print(f"✅ Saved cleaned output to: {output_path}")


                                                                                

✅ Saved cleaned output to: ../data/output/cleaned_file1.json
