In [0]:
# First look at the raw CRM customer table in the bronze layer.
bronze_table_name = 'workspace.bronze.crm_cust_info'
df = spark.table(bronze_table_name)
display(df)

In [0]:
%sql
-- Exact-duplicate check: group on every column and keep only the groups that
-- occur more than once. duplicate_count is the number of identical copies.
SELECT *
FROM (
  SELECT cst_id,
         cst_key,
         cst_firstname,
         cst_lastname,
         cst_marital_status,
         cst_gndr,
         cst_create_date,
         COUNT(*) AS duplicate_count
  FROM workspace.bronze.crm_cust_info
  GROUP BY cst_id,
           cst_key,
           cst_firstname,
           cst_lastname,
           cst_marital_status,
           cst_gndr,
           cst_create_date
) AS grouped_rows
WHERE duplicate_count > 1

In [0]:
%sql
-- Rows where at least one text column differs from its TRIMmed form.
-- Each *_flag is 'extra_spaces' when the raw value and TRIM(value) differ
-- (by value or by length) and '' otherwise; a NULL value is never flagged.
-- The flags are computed once in the CTE and reused for filtering.
WITH flagged AS (
  SELECT cst_id,
         cst_key,
         cst_firstname,
         cst_lastname,
         cst_marital_status,
         cst_gndr,
         cst_create_date,
         CASE
           WHEN cst_firstname != TRIM(cst_firstname)
             OR LENGTH(cst_firstname) != LENGTH(TRIM(cst_firstname))
           THEN 'extra_spaces' ELSE ''
         END AS firstname_flag,
         CASE
           WHEN cst_lastname != TRIM(cst_lastname)
             OR LENGTH(cst_lastname) != LENGTH(TRIM(cst_lastname))
           THEN 'extra_spaces' ELSE ''
         END AS lastname_flag,
         CASE
           WHEN cst_marital_status != TRIM(cst_marital_status)
             OR LENGTH(cst_marital_status) != LENGTH(TRIM(cst_marital_status))
           THEN 'extra_spaces' ELSE ''
         END AS marital_flag,
         CASE
           WHEN cst_gndr != TRIM(cst_gndr)
             OR LENGTH(cst_gndr) != LENGTH(TRIM(cst_gndr))
           THEN 'extra_spaces' ELSE ''
         END AS gndr_flag
  FROM workspace.bronze.crm_cust_info
)
SELECT *
FROM flagged
WHERE firstname_flag = 'extra_spaces'
   OR lastname_flag = 'extra_spaces'
   OR marital_flag = 'extra_spaces'
   OR gndr_flag = 'extra_spaces'

In [0]:
%sql
-- Profile cst_create_date with ONE statement.
-- A notebook %sql cell only renders the result of its last statement, so running
-- the three checks as separate statements would hide the first two results.
--
-- Output: one row per distinct cst_create_date value.
--   date_check : 'missing'    -> the value is NULL
--                'bad_format' -> the string form is not yyyy-MM-dd
--                                (should not occur if the column is type DATE)
--                'ok'         -> everything else
--   row_count  : number of rows carrying that value. The 'missing' row holds the
--                NULL count; if there is no 'missing' row, nothing is missing.
-- Problem rows sort first ('bad_format' < 'missing' < 'ok'), newest dates first
-- within each group.
WITH checked AS (
  SELECT cst_create_date,
         CASE
           WHEN cst_create_date IS NULL THEN 'missing'
           WHEN CAST(cst_create_date AS STRING) NOT RLIKE '^[0-9]{4}-[0-9]{2}-[0-9]{2}$' THEN 'bad_format'
           ELSE 'ok'
         END AS date_check
  FROM workspace.bronze.crm_cust_info
)
SELECT date_check,
       cst_create_date,
       COUNT(*) AS row_count
FROM checked
GROUP BY date_check, cst_create_date
ORDER BY date_check, cst_create_date DESC;

In [0]:
%sql
-- Validate cst_id with ONE statement.
-- A notebook %sql cell only renders the result of its last statement, so running
-- the three checks as separate statements would hide the first two results.
--
-- Output: one row per offending cst_id value; an empty result means every check passed.
--   is_missing      : cst_id IS NULL
--   is_non_numeric  : string form contains a character other than 0-9
--                     (should not occur if the column is type INT; note that the
--                     minus sign of a negative id also trips this check)
--   is_out_of_range : cst_id < 0 or cst_id > 99999999
--                     (bounds are an example range — confirm the real id range)
--   row_count       : number of rows with that value and flag combination
-- COALESCE turns a NULL comparison result into FALSE so each flag is a plain boolean.
WITH checked AS (
  SELECT cst_id,
         cst_id IS NULL AS is_missing,
         COALESCE(CAST(cst_id AS STRING) RLIKE '[^0-9]', FALSE) AS is_non_numeric,
         COALESCE(cst_id < 0 OR cst_id > 99999999, FALSE) AS is_out_of_range
  FROM workspace.bronze.crm_cust_info
)
SELECT cst_id,
       is_missing,
       is_non_numeric,
       is_out_of_range,
       COUNT(*) AS row_count
FROM checked
WHERE is_missing OR is_non_numeric OR is_out_of_range
GROUP BY cst_id, is_missing, is_non_numeric, is_out_of_range
ORDER BY row_count DESC, cst_id;

In [0]:
# List unique values and counts for gender
from pyspark.sql import functions as F

# Frequency of every raw cst_gndr value, most common first.
gender_counts = (
    spark.table('workspace.bronze.crm_cust_info')
    .groupBy('cst_gndr')
    .count()
    .orderBy('count', ascending=False)
)
display(gender_counts)

# Anything outside this set (including differently-cased or padded spellings)
# is reported as unexpected; None stands for a NULL gender.
expected = {'M', 'F', 'Male', 'Female', None}
unique_genders = [count_row['cst_gndr'] for count_row in gender_counts.collect()]
issues = [gender for gender in unique_genders if gender not in expected]
print("Unexpected or ambiguous gender values:", issues)

In [0]:
%sql
-- Build a join-friendly customer key: drop every character that is not an
-- ASCII letter or digit, then upper-case what is left.
-- No TRIM is needed: the regex already removes spaces, so nothing is left to trim.
SELECT
  cst_id,
  UPPER(REGEXP_REPLACE(cst_key, '[^A-Za-z0-9]', '')) AS std_cst_key,
  cst_firstname,
  cst_lastname,
  cst_marital_status,
  cst_gndr,
  cst_create_date
FROM workspace.bronze.crm_cust_info

In [0]:
from pyspark.sql import functions as F

# Read the raw CRM customer table from the bronze layer.
bronze_df = spark.table('workspace.bronze.crm_cust_info')


def trim_all_string_columns(df):
    """Return a copy of ``df`` with ``F.trim`` applied to every string column.

    Columns whose dtype (as reported by ``df.dtypes``) is not ``'string'`` are
    passed through unchanged. Column names and column order are preserved.

    The projection is built as a single ``select`` rather than one
    ``withColumn`` call per column: each ``withColumn`` adds another projection
    to the query plan, which grows the plan needlessly on wide tables.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new Spark DataFrame with the same schema as ``df``.
    """
    projected_columns = [
        F.trim(F.col(col_name)).alias(col_name) if dtype == 'string' else F.col(col_name)
        for col_name, dtype in df.dtypes
    ]
    return df.select(*projected_columns)


trimmed_df = trim_all_string_columns(bronze_df)
display(trimmed_df)

In [0]:
from pyspark.sql import functions as F

# Re-read the bronze table and trim its string columns again, so this cell only
# needs the trim_all_string_columns helper (defined in an earlier cell), not the
# DataFrame variable that cell produced.
trimmed_df = trim_all_string_columns(spark.table('workspace.bronze.crm_cust_info'))

# Bronze column name -> readable silver column name.
column_renames = {
    'cst_id': 'customer_id',
    'cst_key': 'customer_key',
    'cst_firstname': 'first_name',
    'cst_lastname': 'last_name',
    'cst_marital_status': 'marital_status',
    'cst_gndr': 'gender',
    'cst_create_date': 'created_date',
}
silver_df = trimmed_df
for bronze_name, silver_name in column_renames.items():
    silver_df = silver_df.withColumnRenamed(bronze_name, silver_name)

# Replace the silver table with the trimmed, renamed data, then show it.
silver_df.write.mode('overwrite').saveAsTable('workspace.silver.crm_cust_info')
display(silver_df)