## Title Basics

raw -> staging

In [0]:
-- Land the raw IMDb title_basics data in the staging schema unchanged.
-- The target is imdb.staging.title_basics (previously misspelled "ttitle_basics"),
-- which is the table the cleaning cells in this notebook replace.
-- IF NOT EXISTS makes a re-run a no-op instead of an error.
CREATE TABLE IF NOT EXISTS imdb.staging.title_basics
  AS (
    SELECT * FROM imdb.raw.title_basics
  )



## Cleaning table


In [0]:
-- List each endYear value present in the raw table once, in descending order
SELECT endYear
FROM imdb.raw.title_basics
GROUP BY endYear
ORDER BY endYear DESC

In [0]:
-- Rebuilds imdb.staging.title_basics from the raw table with one row per (title, genre):
-- columns are renamed to snake_case, is_adult is normalised to 0/1, and the
-- comma-separated genres string is exploded into genres_exploded.
-- NOTE(review): the following cell writes to the same table, so whichever cell
-- runs last determines the table's contents.
-- CREATE OR REPLACE (rather than plain REPLACE) also works when the table does not exist yet.
CREATE OR REPLACE TABLE imdb.staging.title_basics AS (

WITH

-- Clean base table
cleaned_base AS (
  SELECT
    -- Remove 'tt' prefix and any following zeros
    -- REGEXP_REPLACE(LOWER(tconst), '^tt0*', '') AS id_number,
    LOWER(TRIM(tconst)) AS id_number,        -- Keep 'tt' prefix, as it is the IMDb key
    titleType AS title_type,
    primaryTitle AS primary_title,
    originalTitle AS original_title,
    startYear AS start_year,

    -- Keep isAdult only when it is 0 or 1; every other value, including NULL, becomes 0
    CASE
      WHEN isAdult IN (0, 1) THEN CAST(isAdult AS INT)
      ELSE 0
    END AS is_adult,

    -- NOT REQUIRED: '\N' values have already been nullified automatically, so the
    -- conversion below is kept for reference only:
    --   CASE WHEN endYear = '\N' THEN NULL ELSE CAST(endYear AS INT) END AS end_year,
    --   CASE WHEN runtimeMinutes = '\N' THEN NULL ELSE CAST(runtimeMinutes AS INT) END AS runtime_minutes,

    endYear AS end_year,
    runtimeMinutes AS runtime_minutes,
    genres AS genres
  FROM imdb.raw.title_basics
),

-- Exploding genres to helper table: one output row per comma-separated genre value
-- NOTE(review): with a non-OUTER LATERAL VIEW, titles whose genres is NULL appear to
-- produce no rows here — confirm that dropping them is intended.
genre_helper_table AS (
  SELECT * EXCEPT (genres)
  FROM cleaned_base
  LATERAL VIEW EXPLODE(SPLIT(TRIM(genres), ',')) AS genres_exploded
  WHERE genres_exploded IS NOT NULL
)

SELECT * FROM genre_helper_table
)


In [0]:
-- Rebuilds imdb.staging.title_basics from the raw table, one row per raw row:
-- columns are renamed to snake_case, is_adult is normalised to 0/1, and genres is
-- enriched with "Adult" for adult titles that lack it.
-- CREATE OR REPLACE (rather than plain REPLACE) also works when the table does not exist yet.
CREATE OR REPLACE TABLE imdb.staging.title_basics AS (

WITH

-- Clean base table
cleaned_base AS (
  SELECT
    LOWER(TRIM(tconst)) AS id_number,        -- Keep 'tt' prefix, as it is the IMDb key
    titleType AS title_type,
    primaryTitle AS primary_title,
    originalTitle AS original_title,
    startYear AS start_year,

    -- Keep isAdult only when it is 0 or 1; every other value, including NULL, becomes 0
    CASE
      WHEN isAdult IN (0, 1) THEN CAST(isAdult AS INT)
      ELSE 0
    END AS is_adult,

    -- NOT REQUIRED: '\N' values have already been nullified automatically, so the
    -- conversion below is kept for reference only:
    --   CASE WHEN endYear = '\N' THEN NULL ELSE CAST(endYear AS INT) END AS end_year,
    --   CASE WHEN runtimeMinutes = '\N' THEN NULL ELSE CAST(runtimeMinutes AS INT) END AS runtime_minutes,

    endYear AS end_year,
    runtimeMinutes AS runtime_minutes,

    -- Enrich genres. Logic: if isAdult = 1 but genres does not have "Adult", then add "Adult" in genres.
    -- A NULL or blank genres is handled separately: LIKE on NULL is unknown (the row would
    -- be skipped), and CONCAT on an empty string would leave a leading comma.
    CASE
      WHEN isAdult = 1 AND (genres IS NULL OR TRIM(genres) = '') THEN 'Adult'
      WHEN isAdult = 1 AND LOWER(genres) NOT LIKE '%adult%' THEN CONCAT(genres, ',Adult')
      ELSE genres
    END AS genres

  FROM imdb.raw.title_basics
)

-- LEFT FOR "staging -> transformed" notebook: Exploding and repacking 'genres'
/*
,
  -- Exploding genres to helper column
  exploded AS (
    SELECT *
    FROM cleaned_base
    LATERAL VIEW EXPLODE(SPLIT(TRIM(genres), ','))
      AS genres_exploded
    WHERE genres_exploded IS NOT NULL
  ),  -- this step creates multiple rows for each id_number based on how many unique genres there are

  -- Repack genres_exploded into arrays
  repacked AS (
    SELECT * EXCEPT (genres, genres_exploded),
      -- repack: groups data by id_number etc to collect exploded genres into single arrays
      COLLECT_SET(genres_exploded) AS genres
    FROM exploded
    GROUP BY id_number, title_type, primary_title, original_title, start_year, is_adult, end_year, runtime_minutes
  )
*/

SELECT * FROM cleaned_base
)


## Data Validation Checks

In [0]:
-- Validation probes over the raw title_basics table.
-- Only the CTE named in the final SELECT is returned; the others are defined
-- but not selected in this statement.
WITH

-- Source rows shared by every check below
raw_titles AS (
  SELECT *
  FROM imdb.raw.title_basics
),

-- Total number of rows
total_rows AS (
  SELECT COUNT(*) AS row_count
  FROM raw_titles
),

-- Number of distinct tconst values
distinct_tconst_total AS (
  SELECT COUNT(DISTINCT tconst) AS distinct_count_tconst
  FROM raw_titles
),

-- Distinct values present in titleType
title_type_values AS (
  SELECT DISTINCT titleType
  FROM raw_titles
),

-- Distinct values present in isAdult
-- (finding: isAdult has values other than the binary 0 / 1)
is_adult_values AS (
  SELECT DISTINCT isAdult
  FROM raw_titles
),

-- tconst values that occur on more than one row
repeated_tconst AS (
  SELECT tconst, COUNT(*)
  FROM raw_titles
  GROUP BY tconst
  HAVING COUNT(*) > 1
),

-- primaryTitle values that occur on more than one row
-- (finding: many duplicates in primaryTitle/originalTitle; duplicates should not
--  be deleted as tconst is unique)
repeated_primary_title AS (
  SELECT primaryTitle, COUNT(*)
  FROM raw_titles
  GROUP BY primaryTitle
  HAVING COUNT(*) > 1
)

/* Regex Pattern Check
  --regex_check AS(
  --  SELECT
  --      LOWER(tconst) AS ID
  --  FROM base_table
  --  WHERE LOWER(tconst) NOT LIKE "tt%"
  --)
*/

SELECT *
FROM is_adult_values;

In [0]:
-- Inspect the column names and data types of the raw table
DESCRIBE TABLE imdb.raw.title_basics;

In [0]:
-- Looking into isAdult field.
-- Only the CTE named in the final SELECT is returned; the others are defined
-- but not selected in this statement.

WITH
-- Checking isAdult values not 0 or 1
isAdult_values AS (
  SELECT *
  FROM imdb.raw.title_basics
  WHERE isAdult NOT IN (0, 1) -- 529 rows
    -- Seems to occur when originalTitle = '0'
),

-- isAdult vs originalTitle comparison
comparison AS (
  SELECT
    -- number of rows where isAdult is not 0 or 1
    COUNT(CASE WHEN isAdult NOT IN (0, 1) THEN 1 END) AS count_isAdult,
    -- number of rows where originalTitle is '0'
    COUNT(CASE WHEN originalTitle = '0' THEN 1 END) AS count_originalTitle
  FROM imdb.raw.title_basics
  -- 529 rows for both
),

-- DIAGNOSIS
  -- 1) Does all isAdult = 1 have "Adult" in genres field?
  --    Rows with NULL genres are counted as well: a bare NOT LIKE evaluates to
  --    unknown for NULL and would silently leave them out.
  diagnosis1 AS (
    SELECT COUNT(*)
    FROM imdb.raw.title_basics
    WHERE isAdult = 1
      AND (genres IS NULL OR LOWER(genres) NOT LIKE '%adult%')
  ), -- 163 entries were recorded before NULL genres were included; re-run to refresh
  -- assume these entries should have "Adult" in genres, add these entries during cleaning

  -- 2) How many isAdult <> {0, 1} have "Adult" in genres field?
  diagnosis2 AS (
    SELECT COUNT(*)
    FROM imdb.raw.title_basics
    WHERE isAdult NOT IN (0, 1) AND LOWER(genres) LIKE '%adult%'
  ) -- 0 entries

SELECT * FROM diagnosis2

-- 1) Add "Adult" to genres when isAdult = 1 and not already exist
-- 2) Assume isAdult = 0 when encountering invalid values

In [0]:
-- Find raw titles whose start year or end year is later than the current calendar year

SELECT *
FROM imdb.raw.title_basics
WHERE endYear > YEAR(CURRENT_DATE())
   OR startYear > YEAR(CURRENT_DATE())  -- 7000+ IMDb entries that are upcoming or ending in the future

-- add is_upcoming flag in staging -> transformed?