# Example 5: Data Quality Checks

Perform data quality checks and cleaning operations.

This demonstrates:
- Detecting nulls and duplicates
- Data validation
- Data profiling
- Cleaning operations

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session used by every cell below (created if none exists).
spark = (
    SparkSession.builder
    .appName("DataQuality")
    .getOrCreate()
)

In [None]:
# Build a deliberately messy sample dataset.
# Each tuple is (id, name, email, age, salary); the trailing notes flag the
# data-quality problem that row is meant to illustrate.
data = [
    (1, "John Doe", "john@email.com", 28, 75000),
    (2, "Jane Smith", None, 35, 85000),
    (3, "Bob Johnson", "bob@email.com", None, 65000),
    (1, "John Doe", "john@email.com", 28, 75000),  # exact copy of the first row
    (4, "Alice Brown", "alice@invalid", -5, 95000),  # malformed email, negative age
    (5, None, "charlie@email.com", 42, None),  # name and salary are missing
    (6, "  Emma Davis  ", "emma@email.com", 30, 70000),  # padded with spaces
    (7, "Michael Wilson", "MICHAEL@EMAIL.COM", 45, 80000),  # email in upper case
]
column_names = ["id", "name", "email", "age", "salary"]

df = spark.createDataFrame(data, schema=column_names)

print("Original Messy Data:")
df.show(truncate=False)

In [None]:
# Report the shape of the dataset: row count, column count and column names.
all_columns = df.columns
print(f"\nTotal rows: {df.count()}")
print(f"Total columns: {len(all_columns)}")
print(f"Columns: {all_columns}")

In [None]:
# Count missing values per column.
# Each expression turns "is null" into 1/0 and sums it, keeping the column name.
print("\nNull Value Counts:")
null_flag_sums = [
    F.sum(F.col(c).isNull().cast("int")).alias(c)
    for c in df.columns
]
null_counts = df.select(*null_flag_sums)
null_counts.show()

In [None]:
# Calculate null percentages
#
# All per-column null counts are gathered in one aggregation instead of
# running a separate filter/count job for every column.
total_rows = df.count()
print("\nNull Percentages:")
if total_rows == 0:
    # A percentage of zero rows is undefined; report that instead of dividing by zero.
    print("(no rows - null percentages are undefined)")
else:
    # One result row holding, for each column, the number of null entries.
    null_totals = df.select(
        [F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]
    ).first()
    for col_name in df.columns:
        null_count = null_totals[col_name]
        null_pct = (null_count / total_rows) * 100
        print(f"{col_name}: {null_pct:.2f}%")

In [None]:
# Duplicate detection: compare the row count before and after de-duplication.
rows_before = df.count()
rows_after = df.dropDuplicates().count()
duplicate_count = rows_before - rows_after
print(f"\nDuplicate rows: {duplicate_count}")

# Group on every column so identical rows fall into one group, then keep the
# groups that contain more than one row.
grouped_counts = df.groupBy(*df.columns).count()
duplicates = grouped_counts.filter(F.col("count") > 1)
print("\nDuplicate records:")
duplicates.show(truncate=False)

In [None]:
# Data profiling: number of distinct values found in each column.
print("\nUnique Value Counts:")
for col_name in df.columns:
    distinct_values = df.select(col_name).distinct()
    unique_count = distinct_values.count()
    print(f"{col_name}: {unique_count} unique values")

In [None]:
# Show the summary statistics table produced by DataFrame.describe().
print("\nStatistical Summary:")
summary = df.describe()
summary.show()

In [None]:
# Data cleaning - Step by step
#
# Normalization (steps 1-2) runs before de-duplication (step 3) so that rows
# differing only by surrounding whitespace in the name or by email letter case
# collapse into a single record. De-duplicating the raw rows first would let
# such near-identical rows survive.

# 1. Trim whitespace from name
df_cleaned = df.withColumn("name", F.trim(F.col("name")))

# 2. Standardize email to lowercase
df_cleaned = df_cleaned.withColumn("email", F.lower(F.col("email")))

# 3. Remove exact duplicates (compared on the normalized values)
df_cleaned = df_cleaned.dropDuplicates()
print(f"\nAfter removing duplicates: {df_cleaned.count()} rows")

# 4. Validate email format (simple check)
# A null email does not match the pattern, so it falls through to otherwise()
# and is flagged False rather than null.
df_cleaned = df_cleaned.withColumn(
    "valid_email",
    F.when(F.col("email").rlike(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"), True)
     .otherwise(False)
)

# 5. Validate age (must be between 18 and 100)
# A null age fails the comparison and is likewise flagged False.
df_cleaned = df_cleaned.withColumn(
    "valid_age",
    F.when((F.col("age") >= 18) & (F.col("age") <= 100), True)
     .otherwise(False)
)

print("\nCleaned data with validation flags:")
df_cleaned.show(truncate=False)

In [None]:
# Keep only rows where every required field is present and both validation
# flags are set, then discard the helper flag columns.
required_present = (
    F.col("name").isNotNull()
    & F.col("email").isNotNull()
    & F.col("age").isNotNull()
    & F.col("salary").isNotNull()
)
checks_passed = F.col("valid_email") & F.col("valid_age")

df_valid = (
    df_cleaned
    .filter(required_present)
    .filter(checks_passed)
    .drop("valid_email", "valid_age")
)

print("\nFinal valid records:")
df_valid.show(truncate=False)
print(f"Valid records: {df_valid.count()} out of {df.count()} original rows")

In [None]:
# Alternative to dropping rows: substitute a placeholder for each missing value.
# The mapping gives the replacement used for nulls in each listed column.
fill_values = {
    "name": "Unknown",
    "email": "no-email@example.com",
    "age": 0,
    "salary": 0,
}
df_filled = df_cleaned.fillna(fill_values)

print("\nData with filled null values:")
filled_view = df_filled.drop("valid_email", "valid_age")
filled_view.show(truncate=False)

In [None]:
# Stop the Spark session that was created at the start of the notebook.
spark.stop()