### Handling Missing Values

In [0]:
from pyspark.sql import SparkSession, Window
# NOTE: this wildcard import shadows Python builtins such as sum, min, max,
# abs and round with their Spark column-function counterparts.
from pyspark.sql.functions import *

# --- Session and sample data -------------------------------------------------
# getOrCreate() returns the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("MissingData_Cleansing")
    .getOrCreate()
)

# Small employee sample with deliberately messy records: a padded name,
# missing names/departments/salaries, a repeated employee and a salary far
# above the rest.
columns = ["id", "name", "dept", "salary"]
data = [
    (1, " Alice ", "HR", 5000),
    (2, "Bob", None, 6000),
    (3, None, "IT", None),
    (4, "David", "Finance", 4500),
    (5, "Eva", None, None),
    (6, "Bob", "Finance", 6000),   # duplicate
    (7, "John", "Finance", 12000)  # outlier
]
df = spark.createDataFrame(data, schema=columns)

print("===== ORIGINAL DATA =====")
df.show()
df.printSchema()

# --- Inspect: number of nulls per column -------------------------------------
# when() yields a non-null literal only for null cells, and count() skips
# nulls, so each aggregate is the null count of its column.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(*null_counts).show()

# --- Dropping rows -----------------------------------------------------------
# Rows containing at least one null.
df.dropna().show()

# Only rows in which every column is null.
df.dropna(how="all").show()

# Rows with a null in either of the listed columns.
df.dropna(subset=["dept", "salary"]).show()

# --- Filling with constants --------------------------------------------------
# Per-column replacement values, keyed by column name.
df_fill = df.fillna({"dept": "Unknown", "salary": 0, "name": "Unknown"})
df_fill.show()

# --- Filling with the column mean --------------------------------------------
# The aggregate returns a single row; its first field is the mean salary.
mean_salary = df.agg(mean("salary")).first()[0]
print(f"Average salary: {mean_salary}")
df_mean = df.fillna({"salary": mean_salary})
df_mean.show()

# --- Replacing a specific value ----------------------------------------------
# Only the "dept" column is searched for the value to swap.
df_replace = df.replace("Finance", "Accounts", subset=["dept"])
df_replace.show()


### Data Cleansing

In [0]:
# Cleansing pipeline: each stage produces a new, stage-named DataFrame so the
# cell can be re-run safely and every displayed frame matches its variable.
# The final result is `df_clean`.

# Inclusive bounds of the salary range that is kept; values outside it are
# treated as outliers and removed.
SALARY_MIN = 3000
SALARY_MAX = 10000

# 1. Strip leading/trailing whitespace from names (" Alice " -> "Alice").
df_trimmed = df.withColumn("name", trim(col("name")))
df_trimmed.show()

# 2. Normalise department labels to upper case (nulls stay null).
df_upper = df_trimmed.withColumn("dept", upper(col("dept")))
df_upper.show()

# 3. Replace missing salaries with 0.
df_filled = df_upper.withColumn("salary", coalesce(col("salary"), lit(0)))
df_filled.show()

# 4. Remove repeated employees.
#    Deduplicating on (name, dept) removes nothing for this dataset: the two
#    "Bob" records differ only in that one has no department, so that key pair
#    is unique for every row. Records are instead treated as duplicates when
#    name and salary match, and the survivor is picked deterministically:
#    a row with a department wins over one without, then the lowest id wins.
#    NOTE(review): (name, salary) as the duplicate key is inferred from the
#    "# duplicate" annotation in the sample data - confirm it matches intent.
dedup_window = Window.partitionBy("name", "salary").orderBy(
    col("dept").isNull(),  # False sorts before True: rows with a dept first
    col("id"),             # tie-break so the kept row is reproducible
)
df_dedup = (
    df_filled
    .withColumn("_row_rank", row_number().over(dedup_window))
    .filter(col("_row_rank") == 1)
    .drop("_row_rank")
)
df_dedup.show()

# 5. Drop salary outliers. Rows whose missing salary was filled with 0 in
#    step 3 fall below SALARY_MIN and are removed here as well.
df_in_range = df_dedup.filter(col("salary").between(SALARY_MIN, SALARY_MAX))
df_in_range.show()

# 6. Store salary as a 32-bit integer column.
df_clean = df_in_range.withColumn("salary", col("salary").cast("int"))
df_clean.printSchema()

df_clean.show()