# Customer Reviews Table Analysis Utility

This notebook provides utility queries to analyze the `gold_customer_reviews` table in Databricks.

**Purpose**: Validate table state, check data distributions, and verify data quality.

**Last Generated**: Dec 25, 2025

## Quick Reference

| Metric | Expected Value |
|--------|----------------|
| Total Reviews | 5,000 |
| Review Types | product_review (60%), return_feedback (20%), purchase_experience (20%) |
| Rating Distribution | Positive-skewed (5-star: ~45%, 4-star: ~25%, 3-star: ~15%, 2-star: ~10%, 1-star: ~5%) |
| Customer Segments | vip, premium, loyal, regular, new |
| Product Categories | apparel, footwear, accessories |


## Setup: Import Configuration


In [None]:
%pip install --quiet pyyaml

# Restart the notebook's Python process after the install.
# NOTE(review): per the Databricks docs this is how newly installed libraries
# get picked up, and it resets Python state -- presumably why this cell runs
# before any imports or variables are defined; confirm.
dbutils.library.restartPython()

In [None]:
import sys

# Add the src directory to Python path for clean imports.
# Guarded so that re-running this cell in the same Python session does not
# keep appending duplicate entries to sys.path.
SRC_PATH = '../src'
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from fashion_retail.config import load_config

# Load configuration from project-level config.yaml
config = load_config()

# Fully qualified name of the table every query in this notebook targets
CATALOG = config.catalog
SCHEMA = config.schema
TABLE_NAME = "gold_customer_reviews"
FULL_TABLE_NAME = f"{CATALOG}.{SCHEMA}.{TABLE_NAME}"

# Expected values for validation
EXPECTED_TOTAL_REVIEWS = 5000
# Expected share of rows per review type (fractions of the total)
EXPECTED_REVIEW_TYPES = {
    "product_review": 0.60,
    "return_feedback": 0.20,
    "purchase_experience": 0.20,
}
# Expected share of rows per star rating (fractions of the total)
EXPECTED_RATING_DISTRIBUTION = {
    5: 0.45,
    4: 0.25,
    3: 0.15,
    2: 0.10,
    1: 0.05,
}
# Closed sets of values the segment / category columns are allowed to take
VALID_SEGMENTS = {"vip", "premium", "loyal", "regular", "new"}
VALID_CATEGORIES = {"apparel", "footwear", "accessories"}

print("Configuration loaded from config.yaml")
print(f"  Catalog: {CATALOG}")
print(f"  Schema: {SCHEMA}")
print(f"Target table: {FULL_TABLE_NAME}")


## 1. Table Schema & Row Count


In [None]:
# Render the table's columns, data types, and column comments
schema_df = spark.sql(f"DESCRIBE TABLE {FULL_TABLE_NAME}")
display(schema_df)


In [None]:
# Count the rows actually present and compare against the expected total
row_count_df = spark.sql(f"SELECT COUNT(*) as total_reviews FROM {FULL_TABLE_NAME}")
# A global COUNT(*) always yields exactly one row
total_reviews = row_count_df.first()["total_reviews"]

if total_reviews == EXPECTED_TOTAL_REVIEWS:
    count_status = '‚úÖ MATCH'
else:
    count_status = '‚ùå MISMATCH'

print(f"Total reviews: {total_reviews:,}")
print(f"Expected: {EXPECTED_TOTAL_REVIEWS:,}")
print(f"Status: {count_status}")


## 2. Table History (Recent Updates)


In [None]:
# The 10 most recent operations recorded in the table's history
history_query = f"""
    DESCRIBE HISTORY {FULL_TABLE_NAME} 
    LIMIT 10
"""
history_df = spark.sql(history_query)
display(history_df)


## 3. Review Type Distribution


In [None]:
# Review type distribution.
# actual_pct is computed against the table's real row count (a window SUM over
# the grouped counts) instead of EXPECTED_TOTAL_REVIEWS, so the percentages
# stay correct -- and add up to 100 -- even when the row count is off.
review_type_df = spark.sql(f"""
    SELECT
        review_type,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) as actual_pct
    FROM {FULL_TABLE_NAME}
    GROUP BY review_type
    ORDER BY count DESC
""")

print("Review Type Distribution:")
print("-" * 50)
found_review_types = set()
for row in review_type_df.collect():
    review_type = row["review_type"]
    found_review_types.add(review_type)
    # Types that are not in the expected mapping are compared against 0%
    expected_pct = EXPECTED_REVIEW_TYPES.get(review_type, 0) * 100
    # Flag anything that is 2 or more percentage points away from the target
    status = "‚úÖ" if abs(float(row["actual_pct"]) - expected_pct) < 2 else "‚ö†Ô∏è"
    # A NULL review_type arrives as None, which rejects a width format spec
    type_display = "(NULL)" if review_type is None else review_type
    print(f"{status} {type_display:25} | {row['count']:,} ({row['actual_pct']}%) | Expected: {expected_pct}%")

# Expected types with zero rows never show up in the GROUP BY output, so they
# have to be detected separately.
missing_review_types = set(EXPECTED_REVIEW_TYPES) - found_review_types
if missing_review_types:
    print(f"\n‚ö†Ô∏è Missing review types: {missing_review_types}")

display(review_type_df)

## 4. Rating Distribution


In [None]:
# Rating distribution.
# actual_pct is computed against the table's real row count (a window SUM over
# the grouped counts) instead of EXPECTED_TOTAL_REVIEWS, so the percentages
# stay correct even when the row count differs from the expectation.
rating_df = spark.sql(f"""
    SELECT
        rating,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) as actual_pct
    FROM {FULL_TABLE_NAME}
    GROUP BY rating
    ORDER BY rating DESC
""")

print("Rating Distribution (should be positive-skewed):")
print("-" * 50)
star_symbols = {5: "‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê", 4: "‚≠ê‚≠ê‚≠ê‚≠ê", 3: "‚≠ê‚≠ê‚≠ê", 2: "‚≠ê‚≠ê", 1: "‚≠ê"}
for row in rating_df.collect():
    rating = row["rating"]
    # Ratings outside the expected mapping are compared against 0%
    expected_pct = EXPECTED_RATING_DISTRIBUTION.get(rating, 0) * 100
    diff = abs(float(row["actual_pct"]) - expected_pct)
    status = "‚úÖ" if diff < 10 else "‚ö†Ô∏è"  # Allow some variance due to segment-based adjustments
    # NULL or out-of-range ratings have no star string; report them instead of
    # aborting the whole cell with a KeyError.
    stars = star_symbols.get(rating, "(invalid rating)")
    print(f"{status} {stars} ({rating}) | {row['count']:,} ({row['actual_pct']}%) | Expected: ~{expected_pct}%")

display(rating_df)


## 5. Customer Segment Distribution


In [None]:
# Customer segment distribution.
# pct is computed against the table's real row count (a window SUM over the
# grouped counts) instead of EXPECTED_TOTAL_REVIEWS, so the percentages stay
# correct even when the row count differs from the expectation.
segment_df = spark.sql(f"""
    SELECT
        customer_segment,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) as pct
    FROM {FULL_TABLE_NAME}
    GROUP BY customer_segment
    ORDER BY count DESC
""")

print("Customer Segment Distribution:")
print("-" * 50)
found_segments = set()
for row in segment_df.collect():
    segment = row["customer_segment"]
    found_segments.add(segment)
    status = "‚úÖ" if segment in VALID_SEGMENTS else "‚ùå UNEXPECTED"
    # A NULL segment arrives as None, which rejects a width format spec
    segment_display = "(NULL)" if segment is None else segment
    print(f"{status} {segment_display:15} | {row['count']:,} reviews ({row['pct']}%)")

# Check for missing segments
missing = VALID_SEGMENTS - found_segments
if missing:
    print(f"\n‚ö†Ô∏è Missing segments: {missing}")
else:
    print("\n‚úÖ All expected segments present")

display(segment_df)


## 6. Product Category Distribution


In [None]:
# Product category distribution.
# pct is computed against the table's real row count (a window SUM over the
# grouped counts) instead of EXPECTED_TOTAL_REVIEWS, so the percentages stay
# correct even when the row count differs from the expectation.
category_df = spark.sql(f"""
    SELECT
        product_category,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) as pct
    FROM {FULL_TABLE_NAME}
    GROUP BY product_category
    ORDER BY count DESC
""")

print("Product Category Distribution:")
print("-" * 50)
found_categories = set()
for row in category_df.collect():
    category = row["product_category"]
    # Only a true NULL is acceptable (it can occur for purchase_experience
    # reviews); an empty string is a real, unexpected value and is shown as-is.
    is_null = category is None
    if not is_null:
        found_categories.add(category)
    status = "‚úÖ" if is_null or category in VALID_CATEGORIES else "‚ùå UNEXPECTED"
    cat_display = "(NULL - purchase_experience)" if is_null else category
    print(f"{status} {cat_display:25} | {row['count']:,} reviews ({row['pct']}%)")

# Categories with zero rows never show up in the GROUP BY output
missing_categories = VALID_CATEGORIES - found_categories
if missing_categories:
    print(f"\n‚ö†Ô∏è Missing categories: {missing_categories}")
else:
    print("\n‚úÖ All expected categories present")

display(category_df)


## 7. Date Range & Temporal Coverage


In [None]:
# Earliest and latest review dates, the span between them, and how many
# distinct dates actually carry reviews
date_query = f"""
    SELECT 
        MIN(review_date) as earliest_review,
        MAX(review_date) as latest_review,
        DATEDIFF(MAX(review_date), MIN(review_date)) as days_covered,
        COUNT(DISTINCT review_date) as unique_dates
    FROM {FULL_TABLE_NAME}
"""
date_df = spark.sql(date_query)

# An aggregate without GROUP BY always produces exactly one row
date_row = date_df.first()
report_lines = [
    "Date Range Analysis:",
    "-" * 50,
    f"Earliest Review: {date_row['earliest_review']}",
    f"Latest Review:   {date_row['latest_review']}",
    f"Days Covered:    {date_row['days_covered']} days",
    f"Unique Dates:    {date_row['unique_dates']}",
]
print("\n".join(report_lines))

display(date_df)


## 8. Sample Data Quality Check


In [None]:
# The 10 most recent reviews, with the review text cut to its first 200
# characters for a quick visual inspection
sample_query = f"""
    SELECT 
        review_id,
        rating,
        review_type,
        review_title,
        SUBSTRING(review_text, 1, 200) as review_preview,
        customer_segment,
        product_category,
        product_brand,
        sentiment_score,
        word_count
    FROM {FULL_TABLE_NAME}
    ORDER BY review_date DESC
    LIMIT 10
"""
sample_df = spark.sql(sample_query)

display(sample_df)


## 9. Sentiment & Content Analysis


In [None]:
# Per-rating sentiment statistics (average, min, max) plus average word count
sentiment_query = f"""
    SELECT 
        rating,
        ROUND(AVG(sentiment_score), 2) as avg_sentiment,
        ROUND(MIN(sentiment_score), 2) as min_sentiment,
        ROUND(MAX(sentiment_score), 2) as max_sentiment,
        ROUND(AVG(word_count), 0) as avg_word_count
    FROM {FULL_TABLE_NAME}
    GROUP BY rating
    ORDER BY rating DESC
"""
sentiment_df = spark.sql(sentiment_query)

print("Sentiment Score by Rating (should correlate with star rating):")
print("-" * 70)
for rating_row in sentiment_df.collect():
    # Assemble the three report columns, then join them with the separator
    avg_part = f"Rating {rating_row['rating']}: Avg Sentiment = {rating_row['avg_sentiment']:>5}"
    range_part = f"Range: [{rating_row['min_sentiment']}, {rating_row['max_sentiment']}]"
    words_part = f"Avg Words: {rating_row['avg_word_count']}"
    print(" | ".join((avg_part, range_part, words_part)))

display(sentiment_df)


In [None]:
# Topic mentions analysis.
# Each SUM is wrapped in COALESCE because SUM over zero rows yields NULL; the
# resulting None would make the thousands-separator formatting below raise a
# TypeError on an empty table instead of reporting 0.
topics_df = spark.sql(f"""
    SELECT
        COALESCE(SUM(CASE WHEN mentions_sizing THEN 1 ELSE 0 END), 0) as sizing_mentions,
        COALESCE(SUM(CASE WHEN mentions_quality THEN 1 ELSE 0 END), 0) as quality_mentions,
        COALESCE(SUM(CASE WHEN mentions_delivery THEN 1 ELSE 0 END), 0) as delivery_mentions,
        COALESCE(SUM(CASE WHEN mentions_price THEN 1 ELSE 0 END), 0) as price_mentions,
        COALESCE(SUM(CASE WHEN mentions_comfort THEN 1 ELSE 0 END), 0) as comfort_mentions,
        COALESCE(SUM(CASE WHEN has_recommendation THEN 1 ELSE 0 END), 0) as has_recommendation
    FROM {FULL_TABLE_NAME}
""")

# An aggregate without GROUP BY always produces exactly one row
topics_row = topics_df.collect()[0]
print("Topic Mentions in Reviews:")
print("-" * 50)
# (pre-padded label, result column) pairs, in display order
topic_labels = (
    ("  Sizing/Fit:     ", "sizing_mentions"),
    ("  Quality:        ", "quality_mentions"),
    ("  Delivery:       ", "delivery_mentions"),
    ("  Price/Value:    ", "price_mentions"),
    ("  Comfort:        ", "comfort_mentions"),
    ("  Recommendations: ", "has_recommendation"),
)
for label, column in topic_labels:
    print(f"{label}{topics_row[column]:,} reviews")


## 10. Data Quality Summary


In [None]:
# One aggregate pass that feeds the pass/fail report printed below
quality_df = spark.sql(f"""
    SELECT 
        -- Null checks
        SUM(CASE WHEN review_id IS NULL THEN 1 ELSE 0 END) as null_review_ids,
        SUM(CASE WHEN customer_key IS NULL THEN 1 ELSE 0 END) as null_customer_keys,
        SUM(CASE WHEN review_text IS NULL OR LENGTH(review_text) = 0 THEN 1 ELSE 0 END) as empty_reviews,
        SUM(CASE WHEN rating IS NULL OR rating < 1 OR rating > 5 THEN 1 ELSE 0 END) as invalid_ratings,
        
        -- FK validation counts (these should be checked against dimension tables)
        COUNT(DISTINCT customer_key) as unique_customers,
        COUNT(DISTINCT product_key) as unique_products,
        
        -- Content stats
        ROUND(AVG(word_count), 1) as avg_word_count,
        MIN(word_count) as min_word_count,
        MAX(word_count) as max_word_count
    FROM {FULL_TABLE_NAME}
""")

qc = quality_df.collect()[0]
banner = "=" * 60
print(banner)
print("DATA QUALITY SUMMARY")
print(banner)

# Each entry: (result column, message when the count is zero,
#              label printed after a non-zero count)
null_checks = (
    ("null_review_ids", "No null review_ids", "null review_ids"),
    ("null_customer_keys", "No null customer_keys", "null customer_keys"),
    ("empty_reviews", "No empty review texts", "empty review texts"),
    ("invalid_ratings", "All ratings valid (1-5)", "invalid ratings"),
)

# Stays True only if every check below reports a zero count
checks_passed = True

print("\nüìã Null/Empty Checks:")
for column, ok_message, failure_label in null_checks:
    offending = qc[column]
    if offending == 0:
        print(f"  ‚úÖ {ok_message}")
        continue
    print(f"  ‚ùå {offending} {failure_label}")
    checks_passed = False

# Informational statistics; these do not affect the verdict
print("\nüìä Content Statistics:")
print(f"  Unique customers: {qc['unique_customers']:,}")
print(f"  Unique products:  {qc['unique_products']:,}")
print(f"  Avg word count:   {qc['avg_word_count']}")
print(f"  Word count range: [{qc['min_word_count']}, {qc['max_word_count']}]")

# Overall verdict
verdict = (
    "‚úÖ ALL DATA QUALITY CHECKS PASSED"
    if checks_passed
    else "‚ùå SOME DATA QUALITY CHECKS FAILED - Review issues above"
)
print("\n" + banner)
print(verdict)
print(banner)
