## Title Principals

raw -> staging

In [0]:
-- Seed the staging table as a straight copy of the raw table.
-- IF NOT EXISTS makes this a no-op when the staging table is already there.
CREATE TABLE IF NOT EXISTS imdb.staging.title_principals AS
SELECT *
FROM imdb.raw.title_principals



## Cleaning table - transformation and standardisation


In [0]:
-- Rebuild staging.title_principals from raw with renamed keys and standardised text columns.
REPLACE TABLE imdb.staging.title_principals AS (

WITH

-- Step 1: rename the key columns and normalise category / job / characters
cleaned_table AS (
  SELECT
    tconst AS id_number,          -- renamed for consistency across all tables
    ordering,
    nconst AS name_id_number,

    -- 'actress' and 'self' are folded into 'actor';
    -- every other category is only trimmed and lower-cased
    CASE
      WHEN TRIM(LOWER(category)) IN ('actress', 'self') THEN 'actor'
      ELSE TRIM(LOWER(category))
    END AS role,

    -- job becomes NULL when it merely repeats category (compared after trim + lower-case)
    NULLIF(TRIM(LOWER(job)), TRIM(LOWER(category))) AS job,

    -- strip the double-quote and square-bracket characters from characters
    REGEXP_REPLACE(characters, '[\\"\\[\\]]', '') AS characters

  FROM imdb.raw.title_principals
),

-- Step 2: collapse job and characters into one column (job is taken first when non-NULL)
merge_job_characters AS (
  SELECT
    id_number,
    ordering,
    name_id_number,
    role,
    COALESCE(job, characters) AS job_details
  FROM cleaned_table
)

SELECT * FROM merge_job_characters
)


## Data Validation Checks

1) Data types check
2) Row count check
3) Identify functional types (i.e. Keys, Dimensions, Measures)
4) Data quality checks + DQ Audits

In [0]:
-- Data types check: show the column metadata of the raw table
DESCRIBE TABLE imdb.raw.title_principals;

In [0]:
-- Row count check: total rows, plus how many digits that total has (quick order-of-magnitude read)
SELECT
  COUNT(*),
  LENGTH(CAST(COUNT(*) AS STRING)) AS digits
FROM imdb.raw.title_principals;

In [0]:

-- Data Dictionary
-- One row per raw column: its name, a description, and its functional type.

SELECT
  field_name,
  description,
  functional_type
FROM (
  VALUES
    ('tconst', 'Unique identifier for the title', 'Identifier'),
    ('nconst', 'Unique identifier for the person', 'Identifier'),
    ('ordering', 'Numerical rank used to identify rows and billing sequence', 'Measure'),
    ('category', 'Unified role of the person (e.g., actor, director, writer)', 'Dimension'),
    ('job', 'Specific title for the role if different from category', 'Dimension'),
    ('characters', 'Cleaned name of the character(s) played by the performer', 'Dimension')
) AS data_dictionary(field_name, description, functional_type);

**Data Quality (DQ) Checks - column by column**

Checks:  
1) Completeness: checks involving null values  
2) Distinctiveness: checks involving duplicates
3) Uniformity: checks involving outliers, rogue values and redundant logic

DQ Check - tconst (id_number)

In [0]:

WITH

raw_principals AS (
  SELECT * FROM imdb.raw.title_principals
),

-- Completeness: rows whose title id is missing
missing_tconst AS (
  SELECT *
  FROM raw_principals
  WHERE tconst IS NULL
), -- recorded finding: no null values in tconst

-- Distinctiveness: (tconst, ordering) pairs that appear more than once
repeated_tconst_ordering AS (
  SELECT tconst, ordering, COUNT(*)
  FROM raw_principals
  GROUP BY tconst, ordering
  HAVING COUNT(*) > 1
), -- recorded finding: no duplicates

-- Distinctiveness: (tconst, nconst) pairs that appear more than once
repeated_tconst_nconst AS (
  SELECT tconst, nconst, COUNT(*)
  FROM raw_principals
  GROUP BY tconst, nconst
  HAVING COUNT(*) > 1
) -- recorded finding: duplicates exist, i.e. a person can have multiple roles in a movie

-- Uniformity
  -- no outliers detected

-- Only the last check is displayed; swap the CTE name to inspect another one.
SELECT * FROM repeated_tconst_nconst


DQ CHECK - ordering

Notes:
- Delete in staging -> transformed

In [0]:

WITH

raw_principals AS (
  SELECT * FROM imdb.raw.title_principals
),

-- Completeness: rows whose ordering is missing
missing_ordering AS (
  SELECT *
  FROM raw_principals
  WHERE ordering IS NULL
), -- recorded finding: no null values (the earlier note named tconst; this check is on ordering)

-- Distinctiveness: (tconst, ordering, nconst) triples that appear more than once
repeated_tconst_ordering_nconst AS (
  SELECT tconst, ordering, nconst, COUNT(*)
  FROM raw_principals
  GROUP BY tconst, ordering, nconst
  HAVING COUNT(*) > 1
), -- recorded finding: no duplicates

-- Uniformity: how many rows have an ordering below 1
below_one_count AS (
  SELECT COUNT(*)
  FROM raw_principals
  WHERE ordering < 1
) -- a count of 0 here means every non-NULL ordering is >= 1

-- Only the last check is displayed; swap the CTE name to inspect another one.
SELECT * FROM below_one_count


DQ CHECK - nconst (name_id_number)

In [0]:

WITH

raw_principals AS (
  SELECT * FROM imdb.raw.title_principals
),

-- Completeness: rows whose person id is missing
missing_nconst AS (
  SELECT *
  FROM raw_principals
  WHERE nconst IS NULL
) -- recorded finding: no null values (the earlier note named tconst; this check is on nconst)

-- Distinctiveness
  -- no more checks

-- Uniformity
  -- no outliers detected

SELECT * FROM missing_nconst


DQ CHECK - category (role)

Notes:
- actor/actress/self -> just 'actor'

In [0]:

-- DQ audit for the category column of the raw table.
-- Only distinct_check is displayed; swap the final CTE name to inspect another check.

WITH

base_table AS(
  SELECT * FROM imdb.raw.title_principals
  ),

-- Completeness: rows whose category is missing
  null_check AS(
    SELECT * FROM base_table
    WHERE category IS NULL
  ), -- recorded finding: no null values (the earlier note named tconst; this check is on category)

-- Distinctiveness: (tconst, nconst, category) triples that appear more than once.
-- This CTE previously grouped by (tconst, ordering, nconst), which duplicated the
-- ordering check and never looked at category despite its name.
  duplicate_check_tconst_nconst_category AS(
    SELECT tconst, nconst, category, COUNT(*)
    FROM base_table
    GROUP BY tconst, nconst, category
    HAVING COUNT(*) > 1
  ), -- NOTE(review): the earlier "no duplicates" note came from the old grouping; re-run to confirm

-- Uniformity: list every distinct category value
  distinct_check AS(
    SELECT DISTINCT category FROM base_table
  )
  -- actor/actress gender distinction should not exist, standardise to maintain gender neutrality
  -- self can also be changed to actor

SELECT * FROM distinct_check


DQ CHECK - job (merge with characters)
- Set job to NULL where it merely repeats category
- Merge with the characters field into a single column: 'job_details'


In [0]:

-- DQ audit for the job column: how much of it merely repeats category?
--
-- Output columns:
--   non_duplicate_count         rows where job is populated and differs from category
--   total_count                 all rows in the raw table
--   redundant_proportion        1 - non_duplicate_count / total_count
--                               (NULL jobs are counted as "redundant" in this figure)
--   non_null_job_count          rows where job is populated
--   duplicate_share_of_non_null share of populated job values that repeat category

WITH

base_table AS(
  SELECT * FROM imdb.raw.title_principals
  ),

-- Completeness
  null_check AS(
    SELECT * FROM base_table
    WHERE job IS NULL
  ), -- null values exist

-- Distinctiveness
  -- no checks

-- Uniformity
  -- many non-null job values seem to be duplicates of category.
  -- job and category are compared after TRIM(LOWER(...)), the same normalisation the
  -- staging transformation applies, so this audit measures what that transformation removes.
  -- Rows with a NULL job (or NULL category) fall into neither counter because the
  -- comparison evaluates to UNKNOWN.
  job_counts AS(
    SELECT
      COUNT(*) AS total_count,
      COUNT(job) AS non_null_job_count,
      COUNT(CASE WHEN TRIM(LOWER(job)) <> TRIM(LOWER(category)) THEN 1 END) AS non_duplicate_count,
      COUNT(CASE WHEN TRIM(LOWER(job)) = TRIM(LOWER(category)) THEN 1 END) AS duplicate_count
    FROM base_table
  ),

  redundant_proportion AS(
    SELECT
      non_duplicate_count,
      total_count,
      -- NULLIF guards the division when the table (or the job column) is empty
      1 - (non_duplicate_count / NULLIF(total_count, 0)) AS redundant_proportion,
      non_null_job_count,
      duplicate_count / NULLIF(non_null_job_count, 0) AS duplicate_share_of_non_null
    FROM job_counts
  )   -- NOTE(review): an earlier run with a raw (case-sensitive) comparison recorded ~93%
      -- for redundant_proportion; re-run to refresh the figure
      -- action: delete duplicates

SELECT * FROM redundant_proportion


DQ CHECK - characters (merge with job)  
- Strip the wrapping double quotes and square brackets with a regex
- Merge with the job field into a single column: 'job_details'

In [0]:

WITH

raw_principals AS (
  SELECT * FROM imdb.raw.title_principals
),

-- Completeness: rows whose characters value is missing
missing_characters AS (
  SELECT *
  FROM raw_principals
  WHERE characters IS NULL
), -- NOTE(review): the earlier note here referred to tconst; the result for characters is not recorded

-- Distinctiveness
  -- no checks

-- Uniformity: count populated values that are NOT wrapped as ["..."]
-- (this is a LIKE pattern, not a regex; NULL values are not counted because the
--  comparison evaluates to UNKNOWN for them)
unwrapped_count AS (
  SELECT COUNT(*)
  FROM raw_principals
  WHERE NOT (TRIM(LOWER(characters)) LIKE '["%"]')
) -- recorded finding: 0 values that do not match the pattern

  -- check character field relationships in below cell

SELECT * FROM unwrapped_count


In [0]:
-- job and characters relationship
-- Mutual-exclusivity check: count the rows where BOTH columns are populated.

SELECT COUNT(*)
FROM imdb.raw.title_principals
WHERE characters IS NOT NULL
  AND job IS NOT NULL

-- Recorded result: COUNT(*) = 0
-- => job and characters are MUTUALLY EXCLUSIVE
-- => safe to merge them into one column (the staging transformation names it 'job_details')

In [0]:
-- category and characters relationship
--
-- Tests the hypothesis "characters is populated iff category is 'self'".
-- Each sub-query counts the rows that VIOLATE one direction of that rule, so the
-- hypothesis holds only when both check1 and check2 are 0.
--
-- NULL tests must use IS NULL / IS NOT NULL. The previous version compared with
-- "= null" and "<> null", which always evaluate to UNKNOWN, so both counts were 0
-- regardless of the data.

SELECT
  -- check1: 'self' rows that have NO characters value
  (
    SELECT COUNT(*)
    FROM imdb.raw.title_principals
    WHERE TRIM(LOWER(category)) LIKE '%self%'
      AND characters IS NULL
  ) AS check1,

  -- check2: rows that HAVE a characters value but whose category is not 'self'
  (
    SELECT COUNT(*)
    FROM imdb.raw.title_principals
    WHERE characters IS NOT NULL
      AND TRIM(LOWER(category)) NOT LIKE '%self%'
  ) AS check2

-- NOTE(review): the earlier "yes" answers and the conclusion that 'self' -> 'actor' loses
-- no information were based on the vacuous comparisons above; re-run this cell and
-- re-confirm before relying on that dependency.
