### Title Ratings

raw -> staging



In [0]:
-- Seed the staging table with an unmodified copy of the raw table.
-- IF NOT EXISTS makes this a no-op when the staging table is already present.
CREATE TABLE IF NOT EXISTS imdb.staging.title_ratings AS
SELECT *
FROM imdb.raw.title_ratings

## Cleaning table

- standardise field names


- create `rating_band` field for future analysis


- keep only titles with at least 50 votes



In [0]:
-- Rebuild the cleaned staging table from the raw table:
--   * rename columns to snake_case
--   * normalise the IMDb key (trimmed, lower-cased; the 'tt' prefix is kept)
--   * keep only titles with at least 50 votes
--   * bucket average_rating into a rating_band label
-- CREATE OR REPLACE (rather than bare REPLACE) keeps this cell re-runnable
-- even when the staging table has not been created yet.
CREATE OR REPLACE TABLE imdb.staging.title_ratings AS (

WITH

-- Clean base table: standardise field names and normalise the key
cleaned_base AS (
    SELECT
        LOWER(TRIM(tconst)) AS id_number,   -- keep 'tt' prefix, as it is the IMDb key
        averageRating       AS average_rating,
        numVotes            AS num_votes
    FROM imdb.raw.title_ratings
),

-- Row filter: drop titles with fewer than 50 votes.
-- Rows whose num_votes is NULL fail the comparison and are dropped as well.
sufficiently_voted AS (
    SELECT
        id_number,
        average_rating,
        num_votes
    FROM cleaned_base
    WHERE num_votes >= 50
),

-- Create rating bands.
-- Each band includes its lower bound and excludes its upper bound, except the
-- top band, which includes 10. Anything not matched by the comparisons
-- (e.g. a rating above 10) is labelled 'ERROR' so it is visible downstream.
rating_band AS (
    SELECT
        id_number,
        average_rating,
        num_votes,
        CASE
            WHEN average_rating IS NULL THEN 'unknown'
            WHEN average_rating < 3     THEN '0-3'
            WHEN average_rating < 5     THEN '3-5'
            WHEN average_rating < 7     THEN '5-7'
            WHEN average_rating <= 10   THEN '7-10'
            ELSE 'ERROR'
        END AS rating_band
    FROM sufficiently_voted
)

SELECT
    id_number,
    average_rating,
    num_votes,
    rating_band
FROM rating_band
)

## Data Validation Checks


In [0]:
-- Data validation checks against the raw title_ratings table.
-- Every CTE below is an independent check; point the final SELECT at the CTE
-- you want to inspect. It currently returns the duplicate-key check.
WITH

-- Setting base table
base_table AS (
    SELECT
        tconst,
        averageRating,
        numVotes
    FROM imdb.raw.title_ratings
),

-- Row Count
row_count AS (
    SELECT COUNT(*) AS row_count
    FROM base_table
),

-- Distinct Counts (COUNT(DISTINCT ...) ignores NULLs)
distinct_count_tconst AS (
    SELECT COUNT(DISTINCT tconst) AS distinct_count_tconst
    FROM base_table
),

distinct_count_averageRating AS (
    SELECT COUNT(DISTINCT averageRating) AS distinct_count_averageRating
    FROM base_table
),

distinct_count_numVotes AS (
    SELECT COUNT(DISTINCT numVotes) AS distinct_count_numVotes
    FROM base_table
),

-- Distinct values of each field
distinct_value_tconst AS (
    SELECT DISTINCT tconst
    FROM base_table
),

distinct_value_averageRating AS (
    SELECT DISTINCT averageRating
    FROM base_table
),

distinct_value_numVotes AS (
    SELECT DISTINCT numVotes
    FROM base_table
),

-- Duplicate Check: keys that appear on more than one row.
-- NOTE(review): this compares the raw tconst value; the staging build keys on
-- LOWER(TRIM(tconst)), so duplicates that differ only by case or whitespace
-- are not reported here -- confirm whether that matters.
duplicate_check_tconst AS (
    SELECT
        tconst,
        COUNT(*) AS duplicate_count
    FROM base_table
    GROUP BY tconst
    HAVING COUNT(*) > 1
),

-- Prefix Pattern Check: keys that do not start with 'tt' (case-insensitive).
-- NULL keys are listed explicitly, because NULL NOT LIKE '...' evaluates to
-- NULL and would otherwise be filtered out silently.
prefix_check AS (
    SELECT
        LOWER(tconst) AS id
    FROM base_table
    WHERE tconst IS NULL
       OR LOWER(tconst) NOT LIKE 'tt%'
)

SELECT
    tconst,
    duplicate_count
FROM duplicate_check_tconst

In [0]:
-- Inspect the column names and data types of the raw table
DESCRIBE TABLE imdb.raw.title_ratings