## Title Ratings

Exploring and Profiling data

## Cleaning table - transformation and standardisation
- standardise field names and types
- create rating band field



In [0]:
-- Cleans imdb.raw.title_ratings and assigns every retained title to a rating band.
-- Output columns: id_number, average_rating, num_votes, rating_band.
WITH

-- Standardise field names to snake_case and normalise the key.
cleaned_base AS (
    SELECT
        -- The 'tt' prefix is deliberately kept, as it is the IMDb key;
        -- only surrounding whitespace and letter case are normalised.
        LOWER(TRIM(tconst)) AS id_number,
        averageRating AS average_rating,
        numVotes AS num_votes
    FROM imdb.raw.title_ratings
),

-- Bucket titles by average rating.
-- Bands are half-open [lower, upper), except the top band, which includes 10.
banded_titles AS (
    SELECT
        id_number,
        average_rating,
        num_votes,
        CASE
            WHEN average_rating IS NULL THEN 'unknown'
            -- Values outside 0-10 are flagged instead of being folded into a valid band.
            WHEN average_rating < 0 THEN 'ERROR'
            WHEN average_rating < 3 THEN '0-3'
            WHEN average_rating < 5 THEN '3-5'
            WHEN average_rating < 7 THEN '5-7'
            WHEN average_rating <= 10 THEN '7-10'
            ELSE 'ERROR'
        END AS rating_band
    FROM cleaned_base
    -- Only titles with at least 50 votes are kept.
    -- Rows whose num_votes is NULL fail this comparison and are dropped as well.
    WHERE num_votes >= 50
)

SELECT
    id_number,
    average_rating,
    num_votes,
    rating_band
FROM banded_titles;

## Data Validation Checks

1) Data types check
2) Row count check
3) Identify functional types (i.e. Keys, Dimensions, Measures)
4) Data quality checks + DQ Audits

In [0]:
-- Data types check: list the columns of the raw ratings table together with
-- their declared types, so they can be compared against the expected schema.
DESCRIBE TABLE imdb.raw.title_ratings;

In [0]:
-- Row count check: total number of rows in the raw table, plus the number of
-- digits in that total as a quick order-of-magnitude read.
SELECT
    COUNT(*) AS row_count,
    -- Cast explicitly so the digit count does not rely on implicit
    -- BIGINT -> STRING coercion.
    LENGTH(CAST(COUNT(*) AS STRING)) AS digits
FROM imdb.raw.title_ratings;

In [0]:

-- Data dictionary: one row per raw column, giving its meaning and its
-- functional type (Identifier or Measure). Built from literals only; no table is read.
SELECT
    field_name,
    description,
    functional_type
FROM (
    VALUES
        ('tconst', 'Alphanumeric unique identifier of the title', 'Identifier'),
        ('averageRating', 'Weighted average of user ratings', 'Measure'),
        ('numVotes', 'Total count of votes received', 'Measure')
) AS data_dictionary (field_name, description, functional_type);

**Data Quality (DQ) Checks - column by column**

Checks:

1) Completeness: checks involving null values
2) Distinctiveness: checks involving duplicates
3) Uniformity: checks involving outliers, rogue values and redundant logic

DQ Check - tconst (id_number)

In [0]:

-- DQ audit for the key column tconst.
-- Returns one summary row per check; failing_rows = 0 means the check passed.
-- To list the offending keys themselves, select from duplicate_check_tconst instead.
WITH

base_table AS (
    SELECT tconst
    FROM imdb.raw.title_ratings
),

-- Completeness: rows where the key is missing.
null_check AS (
    SELECT COUNT(*) AS failing_rows
    FROM base_table
    WHERE tconst IS NULL
),

-- Distinctiveness: key values that appear on more than one row.
duplicate_check_tconst AS (
    SELECT
        tconst,
        COUNT(*) AS row_count
    FROM base_table
    GROUP BY tconst
    HAVING COUNT(*) > 1
)

-- Uniformity: no query is run for this in the cell
-- (author's note: no outliers detected).

SELECT
    'completeness: null tconst' AS dq_check,
    failing_rows
FROM null_check

UNION ALL

SELECT
    'distinctiveness: duplicated tconst' AS dq_check,
    COUNT(*) AS failing_rows
FROM duplicate_check_tconst


DQ Check - averageRating (average_rating)


In [0]:

-- DQ audit for averageRating: returns every raw row whose rating is missing.
-- An empty result means the completeness check passed.
WITH

-- Raw ratings table, unmodified.
source_rows AS (
    SELECT *
    FROM imdb.raw.title_ratings
),

-- Completeness: rows with a NULL averageRating (author's note: none found).
missing_average_rating AS (
    SELECT *
    FROM source_rows
    WHERE averageRating IS NULL
)

-- Distinctiveness: no check needed for this column (author's note).
-- Uniformity: no query is run here (author's note: no outliers detected).

SELECT *
FROM missing_average_rating


DQ Check - numVotes (num_votes)

In [0]:

-- DQ audit for numVotes: returns every raw row whose vote count is missing.
-- An empty result means the completeness check passed.
WITH

-- Raw ratings table, unmodified.
source_rows AS (
    SELECT *
    FROM imdb.raw.title_ratings
),

-- Completeness: rows with a NULL numVotes (author's note: none found).
missing_num_votes AS (
    SELECT *
    FROM source_rows
    WHERE numVotes IS NULL
)

-- Distinctiveness: no check needed for this column (author's note).
-- Uniformity: no query is run here (author's note: no outliers detected).

SELECT *
FROM missing_num_votes
