In [0]:
# Load the bronze CRM sales details table and render it for a first look.
BRONZE_SALES_TABLE = 'workspace.bronze.crm_sales_details'
df = spark.table(BRONZE_SALES_TABLE)
display(df)

In [0]:
%sql
-- Duplicate check: group on the nine listed columns and keep only the
-- combinations that occur more than once.
WITH row_groups AS (
  SELECT
    sls_ord_num,
    sls_prd_key,
    sls_cust_id,
    sls_order_dt,
    sls_ship_dt,
    sls_due_dt,
    sls_sales,
    sls_quantity,
    sls_price,
    COUNT(*) AS duplicate_count
  FROM workspace.bronze.crm_sales_details
  GROUP BY
    sls_ord_num,
    sls_prd_key,
    sls_cust_id,
    sls_order_dt,
    sls_ship_dt,
    sls_due_dt,
    sls_sales,
    sls_quantity,
    sls_price
)
SELECT *
FROM row_groups
WHERE duplicate_count > 1

In [0]:
%sql
-- Whitespace check on sls_ord_num and sls_prd_key.
-- A value is flagged 'extra_spaces' when it differs from its TRIMmed form,
-- either by comparison or by length; otherwise (including NULL) the flag is ''.
-- Only rows with at least one flagged column are returned.
WITH flagged AS (
  SELECT
    sls_ord_num,
    sls_prd_key,
    sls_cust_id,
    sls_order_dt,
    sls_ship_dt,
    sls_due_dt,
    sls_sales,
    sls_quantity,
    sls_price,
    CASE
      WHEN sls_ord_num != TRIM(sls_ord_num)
        OR LENGTH(sls_ord_num) != LENGTH(TRIM(sls_ord_num)) THEN 'extra_spaces'
      ELSE ''
    END AS sls_ord_num_flag,
    CASE
      WHEN sls_prd_key != TRIM(sls_prd_key)
        OR LENGTH(sls_prd_key) != LENGTH(TRIM(sls_prd_key)) THEN 'extra_spaces'
      ELSE ''
    END AS sls_prd_key_flag
  FROM workspace.bronze.crm_sales_details
)
SELECT *
FROM flagged
WHERE sls_ord_num_flag = 'extra_spaces'
   OR sls_prd_key_flag = 'extra_spaces'

In [0]:
%sql
-- Profile sls_order_dt, sls_ship_dt and sls_due_dt in ONE statement.
-- A %sql cell renders only the result of its last statement, so separate
-- per-column queries in the same cell would hide all but the final result.
--
-- Output: one row per (date_column, distinct raw_value) with its row_count.
--   * The row whose raw_value is NULL carries that column's missing-value count
--     (no such row means the column has no NULLs).
--   * The remaining rows list the distinct values, newest first, so the stored
--     format can be inspected.
-- NOTE(review): assumes the three columns share a compatible type for UNION ALL — confirm.
WITH date_values AS (
  SELECT 'sls_order_dt' AS date_column, sls_order_dt AS raw_value
  FROM workspace.bronze.crm_sales_details
  UNION ALL
  SELECT 'sls_ship_dt', sls_ship_dt
  FROM workspace.bronze.crm_sales_details
  UNION ALL
  SELECT 'sls_due_dt', sls_due_dt
  FROM workspace.bronze.crm_sales_details
)
SELECT
  date_column,
  raw_value,
  COUNT(*) AS row_count
FROM date_values
GROUP BY date_column, raw_value
-- NULLS FIRST puts each column's missing-value row at the top of its group.
ORDER BY date_column, raw_value DESC NULLS FIRST

In [0]:
from pyspark.sql import functions as F

# Re-read the bronze table so this cell does not depend on state from earlier cells.
df = spark.table('workspace.bronze.crm_sales_details')

# Convert the raw date columns (cast to string, parsed as yyyyMMdd) to date type.
# try_to_date is used, as in the silver-write cell, so that values which do not
# parse become NULL instead of failing the whole cell when ANSI mode is enabled.
df = (
    df
    .withColumn('order_date', F.expr("try_to_date(CAST(sls_order_dt AS STRING), 'yyyyMMdd')"))
    .withColumn('ship_date', F.expr("try_to_date(CAST(sls_ship_dt AS STRING), 'yyyyMMdd')"))
    .withColumn('due_date', F.expr("try_to_date(CAST(sls_due_dt AS STRING), 'yyyyMMdd')"))
)

# The raw columns are dropped here; keep them instead if traceability is needed.
df = df.drop('sls_order_dt', 'sls_ship_dt', 'sls_due_dt')

display(df)

In [0]:
%sql
-- Profile sls_sales, sls_quantity and sls_price in ONE statement.
-- A %sql cell renders only the result of its last statement, so separate
-- per-check queries in the same cell would hide all but the final result.
--
-- Output: one row per (measure, issue, raw_value) with its row_count, where
--   issue = 'missing'       -> the value is NULL
--   issue = 'out_of_range'  -> the value is negative or above the measure's upper bound
--                              (sales 1000000, quantity 10000, price 100000)
-- An empty result means none of the three columns has a missing or out-of-range value.
-- NOTE(review): assumes the three columns share a compatible numeric type for UNION ALL — confirm.
WITH measures AS (
  SELECT 'sls_sales' AS measure, sls_sales AS raw_value, 1000000 AS upper_bound
  FROM workspace.bronze.crm_sales_details
  UNION ALL
  SELECT 'sls_quantity', sls_quantity, 10000
  FROM workspace.bronze.crm_sales_details
  UNION ALL
  SELECT 'sls_price', sls_price, 100000
  FROM workspace.bronze.crm_sales_details
),
flagged AS (
  SELECT
    measure,
    CASE WHEN raw_value IS NULL THEN 'missing' ELSE 'out_of_range' END AS issue,
    raw_value
  FROM measures
  WHERE raw_value IS NULL
     OR raw_value < 0
     OR raw_value > upper_bound
)
SELECT
  measure,
  issue,
  raw_value,
  COUNT(*) AS row_count
FROM flagged
GROUP BY measure, issue, raw_value
ORDER BY measure, issue, raw_value

In [0]:
# List unique values and counts for product key
from pyspark.sql import functions as F

# Row count per product key, most frequent first.
prd_key_counts = (
    spark.table('workspace.bronze.crm_sales_details')
    .groupBy('sls_prd_key')
    .count()
    .orderBy('count', ascending=False)
)
display(prd_key_counts)

# Flag unexpected values (null, empty, or whitespace-only).
# The filter runs in Spark so that only the suspect keys are collected to the
# driver, rather than every distinct product key.
suspect_keys = prd_key_counts.filter(
    F.col('sls_prd_key').isNull() | (F.trim(F.col('sls_prd_key')) == '')
)
issues = [row['sls_prd_key'] for row in suspect_keys.collect()]
print("Unexpected or ambiguous product key values:", issues)

In [0]:
%sql
-- Preview a join-friendly product key: every character outside A-Z, a-z, 0-9
-- is removed from sls_prd_key, then the result is trimmed and upper-cased.
-- All other columns are passed through unchanged.
SELECT
  src.sls_ord_num,
  UPPER(TRIM(REGEXP_REPLACE(src.sls_prd_key, '[^A-Za-z0-9]', ''))) AS std_sls_prd_key,
  src.sls_cust_id,
  src.sls_order_dt,
  src.sls_ship_dt,
  src.sls_due_dt,
  src.sls_sales,
  src.sls_quantity,
  src.sls_price
FROM workspace.bronze.crm_sales_details AS src

In [0]:
from pyspark.sql import functions as F

# Source for the silver build: the bronze CRM sales details table.
BRONZE_SALES_TABLE = 'workspace.bronze.crm_sales_details'
bronze_df = spark.table(BRONZE_SALES_TABLE)

# Trim all string columns
def trim_all_string_columns(df):
    """Return ``df`` with ``F.trim`` applied to every string-typed column.

    String columns are the ones ``df.dtypes`` reports as ``'string'``. Column
    names and column order are unchanged, and all other columns pass through
    untouched.

    The result is built with a single ``select`` instead of one ``withColumn``
    call per column, so the query plan gains one projection regardless of how
    many string columns the frame has.

    Args:
        df: The Spark DataFrame to clean.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    projected = [
        F.trim(F.col(name)).alias(name) if dtype == 'string' else F.col(name)
        for name, dtype in df.dtypes
    ]
    return df.select(*projected)

# Clean and enrich the bronze frame in one chain:
#   1. trim every string column,
#   2. parse the raw date columns (cast to string, yyyyMMdd) with try_to_date,
#      which tolerates invalid values,
#   3. derive the standardized product key (non-alphanumerics removed, trimmed, upper-cased).
bronze_df = (
    trim_all_string_columns(bronze_df)
    .withColumn('order_date', F.expr("try_to_date(CAST(sls_order_dt AS STRING), 'yyyyMMdd')"))
    .withColumn('ship_date', F.expr("try_to_date(CAST(sls_ship_dt AS STRING), 'yyyyMMdd')"))
    .withColumn('due_date', F.expr("try_to_date(CAST(sls_due_dt AS STRING), 'yyyyMMdd')"))
    .withColumn(
        'std_sls_prd_key',
        F.upper(F.trim(F.regexp_replace(F.col('sls_prd_key'), '[^A-Za-z0-9]', ''))),
    )
)

# Final silver projection: readable column names in presentation order.
# The raw sls_*_dt columns are left out, which drops them from the output.
silver_df = bronze_df.select(
    F.col('sls_ord_num').alias('order_number'),
    F.col('sls_prd_key').alias('product_key'),
    F.col('std_sls_prd_key'),
    F.col('sls_cust_id').alias('customer_id'),
    F.col('order_date'),
    F.col('ship_date'),
    F.col('due_date'),
    F.col('sls_sales').alias('sales_amount'),
    F.col('sls_quantity').alias('quantity'),
    F.col('sls_price').alias('price'),
)

# Replace the contents of the silver table, then show what was built.
silver_df.write.mode('overwrite').saveAsTable('workspace.silver.crm_sales_details')
display(silver_df)