In [0]:
# Standard library imports
import os

# Third-party library imports
from dotenv import load_dotenv
from pyspark.testing import assertDataFrameEqual

In [0]:
# Load variables from a .env file (when one is found) into the process environment.
load_dotenv()

# Catalog and schema used to fully qualify the table read by the data quality query.
catalog_name = os.getenv('DATABRICKS_CATALOG_NAME')
schema_name = os.getenv('DATABRICKS_SCHEMA_NAME')

# Fail fast: os.getenv returns None for an unset variable, which would otherwise be
# interpolated into the SQL as the literal text "None" and surface later as a
# confusing "table not found" error.
missing_env_vars = [
    env_var
    for env_var, value in (
        ('DATABRICKS_CATALOG_NAME', catalog_name),
        ('DATABRICKS_SCHEMA_NAME', schema_name),
    )
    if not value
]
if missing_env_vars:
    raise ValueError(
        f"Missing required environment variable(s): {', '.join(missing_env_vars)}"
    )

In [0]:
# Run every data quality check against the silver sentiment table in one pass and
# collapse them into a single boolean column, all_dq_checks_passed.
#
# Each check counts the rows that violate a rule and compares that count to 0.
# Rows where the checked column is NULL make the CASE condition evaluate to NULL,
# so they are not counted as violations by the range / allowed-value checks.
#
# The range check must use OR: no value can be both below 0 and above 1, so an
# AND condition would never flag a row and the check would always pass.
dq_checks = spark.sql(
    f"""
    WITH data_quality_checks AS (
    SELECT 
        -- unique_identifier: ticker_symbol + post_id
        COUNT(DISTINCT ticker_symbol, post_id) = COUNT(*) AS is_unique_check
        -- Sentiment score should be between 0 and 1 
        , COUNT(CASE WHEN sentiment_score < 0 OR sentiment_score > 1 THEN 1 END) = 0 AS is_sentiment_score_valid
        -- Sentiment should be: 'positive' or 'negative' or 'neutral' only
        , COUNT(CASE WHEN sentiment NOT IN ('positive', 'negative', 'neutral') THEN 1 END) = 0 AS is_sentiment_valid
    FROM {catalog_name}.{schema_name}.kdayno_silver_reddit_top_posts_sentiment
    )

    SELECT 
        is_unique_check 
        AND is_sentiment_score_valid
        AND is_sentiment_valid AS all_dq_checks_passed
    FROM data_quality_checks
    """)

In [0]:
# Expected outcome: a single row whose only column, all_dq_checks_passed, is True.
expected_results = spark.createDataFrame(data=[(True,)], schema=['all_dq_checks_passed'])

# assertDataFrameEqual takes (actual, expected). Passing both by keyword makes a
# failure message label the query result and the expected fixture correctly.
assertDataFrameEqual(actual=dq_checks, expected=expected_results)  # If successful, all data quality checks passed