Run this in a terminal to start JupyterLab: jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root

In [None]:
# ✅ Cell 1: Setup & Load Input Data (with dynamic alias) 

import sys, os
sys.path.append("/app")  # 🔧 Mount base path directly in container

from utils.notebook_setup import enable_project_imports
enable_project_imports()

from utils.io_helpers import list_files_with_aliases, load_file_with_alias
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, lit, concat_ws, size, desc

# Spark session for this notebook; only ERROR-level log lines are kept.
spark = (
    SparkSession.builder
    .appName("Pyspark - Clean & Transform")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Input selection: the format name is used both as sub-folder and as extension.
data_type = "json"  # or "parquet"
input_folder = "/app/data/input/practice/" + data_type  # absolute path inside the container

# Discover the input files and list each alias next to its path.
aliases = list_files_with_aliases(input_folder, ext=data_type)
print("📄 Available input files:")
for alias_name, file_path in aliases.items():
    print(f"{alias_name}: {file_path}")

# Pick one alias (file1, file2, file3, ...) and resolve it to its path.
alias = "file1"
input_path = aliases[alias]

# Load the chosen file and preview the first 5 rows without truncating values.
df = load_file_with_alias(spark, input_folder, alias, ext=data_type)
print(f"✅ Loaded file: {alias} → {input_path}")
df.show(n=5, truncate=False)


✅ [notebook_setup] Project root already in sys.path.


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/29 08:44:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


📄 Available input files:
file1: /app/data/input/practice/json/part-00000-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
file2: /app/data/input/practice/json/part-00001-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
file3: /app/data/input/practice/json/part-00002-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
📥 Loading file: file1 → /app/data/input/practice/json/part-00000-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
✅ Loaded file: file1 → /app/data/input/practice/json/part-00000-46e2d8a4-96d4-4b0e-a4a9-992dbceb1653-c000.json
+-----------------------+-------+---------+--------------------+------+-------+
|extra                  |inactiv|nume     |ocupatie            |varsta|vechime|
+-----------------------+-------+---------+--------------------+------+-------+
|[PV, EV]               |NULL   |Andrei   |Specialist marketing|38    |13     |
|[3D Printer, WII]      |NULL   |Alexandru|Specialist HR       |34    |8      |
|[AC, EV, 5G Router]    |NULL   |Adrian   |Inginer civil       |45 

In [None]:
# ✅ Cell 2: Select format and folder

# Choose between "json" or "parquet"
data_type = "json"
folder = f"/app/data/input/practice/{data_type}"

aliases = list_files_with_aliases(folder, ext=data_type)
print("📂 Available files:")
# Use dedicated loop names: at notebook scope a loop variable called `alias`
# would overwrite the `alias` chosen for loading with the last listed file.
for file_alias, file_path in aliases.items():
    print(f"{file_alias}: {file_path}")


In [None]:
# Cell 3: read the file behind the chosen alias and preview it
alias = "file1"  # switch to file2, file3, ... to load a different file
df = load_file_with_alias(
    spark,
    folder,
    alias=alias,
    ext=data_type,
)
df.show(n=20, truncate=False)  # 20 rows is show()'s default


In [None]:
# %% 
# Cell 4: Clean the DataFrame

from pyspark.sql.functions import col, trim, regexp_replace, coalesce, lit, udf
from pyspark.sql.types import StringType
import unicodedata

# --------------------------------------------
# ✅ 1. Clean 'nume': trim + remove diacritics
# --------------------------------------------

# UDF: Remove accents using unicodedata
def remove_accents(s):
    """Strip diacritical marks from a string.

    The text is decomposed (NFD) so that each accent becomes a separate
    combining mark (Unicode category ``Mn``), those marks are dropped, and the
    remainder is recomposed (NFC). The final recomposition matters for
    characters that decompose under NFD without carrying any accent (for
    example Hangul syllables): without it they would be returned in their
    decomposed form even though nothing was removed from them.

    Args:
        s: Text to clean, or ``None``.

    Returns:
        The text without combining marks, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return None
    decomposed = unicodedata.normalize('NFD', s)
    stripped = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    return unicodedata.normalize('NFC', stripped)

# Spark wrapper around remove_accents; the resulting column is a string.
remove_accents_udf = udf(remove_accents, returnType=StringType())

# 'nume': trim first, then strip the accents from the trimmed value.
df = df.withColumn("nume", remove_accents_udf(trim(col("nume"))))

# --------------------------------------------
# ✅ 2. Clean 'ocupatie': collapse whitespace runs, trim, remove accents
# --------------------------------------------
df = df.withColumn(
    "ocupatie",
    remove_accents_udf(trim(regexp_replace(col("ocupatie"), r"\s+", " "))),
)

# --------------------------------------------
# ✅ 3. Clean 'extra': deduplicate comma-separated entries
# --------------------------------------------

def remove_duplicates(val):
    """Collapse an ``extra`` value into a sorted, de-duplicated string.

    Args:
        val: A list of entries, a comma-separated string, or an empty /
            ``None`` value.

    Returns:
        The distinct entries, stripped of surrounding whitespace, sorted and
        joined with ``", "``. ``None`` when ``val`` is empty or contains only
        blank entries. A value of any other type is reported with ``print``
        and returned as its stripped string form.
    """
    if not val:
        return None

    if isinstance(val, list):
        # Only missing items are skipped here; blank ones are removed below.
        candidates = (str(x) for x in val if x is not None)
    elif isinstance(val, str):
        candidates = val.split(',')
    else:
        print(f"⚠️ Unexpected value in `extra`: {val} ({type(val)})")
        return str(val).strip()

    # Strip BEFORE testing for emptiness, so whitespace-only entries
    # (e.g. "a, ,b") cannot survive as empty tokens in the result.
    cleaned = {entry.strip() for entry in candidates}
    cleaned.discard('')

    if not cleaned:
        return None
    return ', '.join(sorted(cleaned))

# Wrap the helper as a Spark UDF and rewrite 'extra' as its de-duplicated string form.
remove_dupes_udf = udf(remove_duplicates, returnType=StringType())
df = df.withColumn("extra", remove_dupes_udf(col("extra")))

# Quick visual check of the first 10 cleaned values.
df.select("extra").show(n=10, truncate=False)

# --------------------------------------------
# ✅ 4. Set missing 'inactiv' to False
# --------------------------------------------
# Cast before coalescing so the column is always boolean. If the loader inferred
# another type for it (presumably possible when every value in a file is null —
# TODO confirm against the loader), mixing it with a boolean literal would not
# resolve to a boolean column. The cast is a no-op when it is already boolean.
df = df.withColumn("inactiv", coalesce(col("inactiv").cast("boolean"), lit(False)))

# --------------------------------------------
# ✅ 5. Pandas copy for validation & display
# --------------------------------------------
# Mark the cleaned frame as cached and keep a handle on it under its own name.
df_with_vechime = df.cache()

# Collect to a pandas DataFrame and preview the first 10 rows.
display_pdf = df_with_vechime.toPandas()
display_pdf.head(n=10)




In [None]:
import unicodedata

# --------------------------------------------
# ✅ Helper: Check for diacritics (non-ASCII)
# --------------------------------------------
def has_diacritics(text):
    """Report whether ``text`` contains a combining mark after NFD decomposition.

    Non-string inputs (including ``None``) are reported as ``False``.
    """
    if not isinstance(text, str):
        return False
    for ch in unicodedata.normalize("NFD", text):
        if unicodedata.category(ch) == "Mn":
            return True
    return False

# --------------------------------------------
# ✅ Test 1: No extra whitespace in `nume` or `ocupatie`
# --------------------------------------------
def is_trimmed(s):
    """Return True only for strings without leading or trailing whitespace.

    Anything that is not a ``str`` (e.g. ``None``) yields False.
    """
    if not isinstance(s, str):
        return False
    return s.strip() == s

# Per-row flags: True where the value is a string equal to its stripped form.
trimmed_nume = display_pdf["nume"].apply(is_trimmed)
trimmed_ocupatie = display_pdf["ocupatie"].apply(is_trimmed)

assert trimmed_nume.all(), (
    "❌ Some 'nume' values have leading/trailing spaces."
)
assert trimmed_ocupatie.all(), (
    "❌ Some 'ocupatie' values have leading/trailing spaces."
)

# --------------------------------------------
# ✅ Test 2: Diacritics removed from `nume`
# --------------------------------------------
# Per-row flags: True where no combining mark is left in the name.
diacritic_check = display_pdf["nume"].apply(
    lambda name: not has_diacritics(name)
)
assert diacritic_check.all(), (
    "❌ Some 'nume' values still contain diacritics."
)

# --------------------------------------------
# ✅ Test 3: `extra` column exists and is valid
# --------------------------------------------
# The column must be part of the pandas frame at all.
assert "extra" in display_pdf.columns, (
    "❌ Column 'extra' is missing."
)

# At least one row must carry a non-null value.
non_null_extra = display_pdf["extra"].notna().sum()
assert not (non_null_extra <= 0), (
    "❌ Column 'extra' is entirely null."
)

print(f"ℹ️ 'extra' column has {non_null_extra} non-null values.")

# ✅ Check for duplicates in valid entries
def has_duplicates(entry):
    """Tell whether a comma-separated string repeats any (stripped) item.

    Empty values and non-string values are reported as having no duplicates.
    """
    if not (entry and isinstance(entry, str)):
        return False
    seen = set()
    for piece in entry.split(','):
        token = piece.strip()
        if token in seen:
            return True
        seen.add(token)
    return False

# Only non-null entries are inspected; each flag is True when no item repeats.
extra_dup_check = (
    display_pdf["extra"]
    .dropna()
    .apply(lambda value: not has_duplicates(value))
)
assert extra_dup_check.all(), (
    "❌ Some 'extra' values contain duplicates."
)

print("✅ 'extra' column is present, non-null, and cleaned of duplicates.")


# --------------------------------------------
# ✅ Test 4: `inactiv` should not have nulls
# --------------------------------------------
# A value that is not a bool (a null included) fails the check.
inactiv_check = display_pdf["inactiv"].apply(
    lambda value: isinstance(value, bool)
)
assert inactiv_check.all(), (
    "❌ Some 'inactiv' values are missing or not boolean."
)

print("✅ All data cleaning validations passed! 🎉")


In [None]:
# Cell 5: Add 'varsta_la_contractare' and drop 'vechime'

# Derive from df_with_vechime (the cached frame that still has 'vechime') rather
# than from df itself: df is reassigned here without 'vechime', so building on df
# made a second run of this cell fail because the subtracted column was gone.
df = (
    df_with_vechime
    .withColumn("varsta_la_contractare", col("varsta") - col("vechime"))
    .drop("vechime")
)
df.show(5)

In [None]:
# Cell 6: Add 'text_descriptiv' column

# Pieces shared by both sentence variants, in output order.
sentence_parts = [
    col("nume"),
    lit("în vârstă de"),
    col("varsta"),
    lit("ani"),
    lit("este"),
    col("ocupatie"),
]
# Long variant: the same pieces followed by the list of owned items.
sentence_with_extra_parts = sentence_parts + [lit("și deține:"), col("extra")]

# Rows whose 'extra' is null or empty get the short sentence.
extra_missing = col("extra").isNull() | (col("extra") == "")

df = df.withColumn(
    "text_descriptiv",
    when(extra_missing, concat_ws(" ", *sentence_parts))
    .otherwise(concat_ws(" ", *sentence_with_extra_parts)),
)
df.select("nume", "text_descriptiv").show(5, truncate=False)

In [None]:
# ✅ Cell 7: Clients with contract duration between 2 and 5 years (inclusive)

# Uses the frame that still carries 'vechime'; between() includes both bounds.
(
    df_with_vechime
    .where(col("vechime").between(2, 5))
    .select("nume", "ocupatie", "vechime")
    .orderBy("vechime")
    .show(truncate=False)
)

In [None]:
# Cell 8: Top 10 by age
# Display exactly 10 rows so the output matches the "Top 10" in the title
# (the previous limit of 50 listed up to 50 rows).
df.orderBy(col("varsta").desc()).show(10, truncate=False)


In [None]:
# Cell 9: Top 5 oldest at contract time (active only)
# Keep rows whose 'inactiv' equals False, sort by age at contract (descending), show 5.
(
    df
    .where(col("inactiv") == lit(False))
    .orderBy(col("varsta_la_contractare").desc())
    .show(n=5, truncate=False)
)