## Name Basics

raw -> staging

In [0]:
-- Seed the staging copy of name_basics from the raw layer.
-- IF NOT EXISTS makes this a no-op when the staging table is already present.
CREATE TABLE IF NOT EXISTS imdb.staging.name_basics AS (
  SELECT *
  FROM imdb.raw.name_basics
)



## Cleaning table - transformation and standardisation


In [0]:
-- Rebuild imdb.staging.name_basics from the raw table with standardised
-- column names and array-typed list columns.
-- CREATE OR REPLACE (instead of a bare REPLACE) lets this cell succeed
-- whether or not the staging table already exists.
CREATE OR REPLACE TABLE imdb.staging.name_basics AS (

WITH

cleaned_table AS (
  SELECT
    nconst AS person_id,
    primaryName AS person_name,
    birthYear AS person_birth_year,
    deathYear AS person_death_year,

    -- Convert merged strings to arrays - primaryProfession
    -- (lower-cased; TRIM strips only the outer whitespace of the whole
    -- string, not of the individual elements)
    split(TRIM(LOWER(primaryProfession)), ',') AS person_primary_profession,

    -- Convert merged strings to arrays - knownForTitles
    split(TRIM(LOWER(knownForTitles)), ',') AS person_known_for_titles

  FROM imdb.raw.name_basics
  -- Rows without a name are excluded from staging
  WHERE primaryName IS NOT NULL
)

-- Explicit column list so the staging schema is visible at a glance
SELECT
  person_id,
  person_name,
  person_birth_year,
  person_death_year,
  person_primary_profession,
  person_known_for_titles
FROM cleaned_table
)


## Data Validation Checks

1) Data types check
2) Row count check
3) Identify functional types (i.e. Identifiers, Dimensions, Measures)
4) Data quality checks + DQ Audits

In [0]:
-- Data types check: list each column of the raw table with its declared type
DESCRIBE TABLE imdb.raw.name_basics;

In [0]:
-- Row count check: total rows, plus how many digits that total has
-- (the count is rendered as a string so its length can be measured)
SELECT
  COUNT(*),
  LEN(CAST(COUNT(*) AS STRING)) AS digits
FROM imdb.raw.name_basics;

In [0]:

-- Data Dictionary
-- One row per raw column: its name, a short description, and the functional
-- type assigned to it (Identifier / Dimension / Measure).

SELECT
  field_name,
  description,
  functional_type
FROM (
  VALUES
    ('nconst', 'Alphanumeric unique identifier of the person', 'Identifier'),
    ('primaryName', 'Name by which the person is most often credited', 'Dimension'),
    ('birthYear', 'YYYY format birth year', 'Measure'),
    ('deathYear', 'YYYY format death year if applicable, else null', 'Measure'),
    ('primaryProfession', 'The top-3 professions of the person', 'Dimension'),
    ('knownForTitles', 'Titles the person is known for (tconsts)', 'Identifier')
) AS data_dictionary(field_name, description, functional_type);

**Data Quality (DQ) Checks - column by column**

Checks:  
1) Completeness: checks involving null values  
2) Distinctiveness: checks involving duplicates
3) Uniformity: checks involving outliers, rogue values and redundant logic

DQ CHECK - nconst (person_id)

In [0]:

-- DQ checks for nconst. Only the final SELECT is executed: point it at
-- missing_ids instead of repeated_ids to run the completeness check.
WITH

source_rows AS (
  SELECT *
  FROM imdb.raw.name_basics
),

-- Completeness: rows that have no nconst at all
-- (recorded finding: no null values in nconst)
missing_ids AS (
  SELECT *
  FROM source_rows
  WHERE nconst IS NULL
),

-- Distinctiveness: nconst values that appear on more than one row
-- (recorded finding: no duplicates)
repeated_ids AS (
  SELECT
    nconst,
    COUNT(*)
  FROM source_rows
  GROUP BY nconst
  HAVING COUNT(*) > 1
)

-- Uniformity
  -- no outliers detected

SELECT *
FROM repeated_ids


DQ CHECK - primaryName (person_name) 

Notes:
- Delete rows with null in primaryName


In [0]:

-- DQ checks for primaryName: returns every raw row whose name is missing.
WITH

source_rows AS (
  SELECT *
  FROM imdb.raw.name_basics
),

-- Completeness: rows with no primaryName
-- (recorded finding: 66 such rows, with effectively no data in any field
--  except nconst; these rows are to be deleted)
missing_names AS (
  SELECT *
  FROM source_rows
  WHERE primaryName IS NULL
)

-- Distinctiveness
  -- no necessary checks

-- Uniformity
  -- no outliers detected

SELECT *
FROM missing_names


DQ CHECK - birthYear (person_birth_year)  

Notes:
- Over 95% null

In [0]:

-- DQ check for birthYear: how much of the column is missing.
-- Returns a single row: null_birthYear_count, total_count, null_proportion.
WITH

base_table AS (
  SELECT birthYear
  FROM imdb.raw.name_basics
),

-- Completeness
  -- Both counts come from one pass over the table. COUNT(birthYear) skips
  -- NULLs, so its difference from COUNT(*) is the number of null values.
  null_check AS (
    SELECT
      COUNT(*) - COUNT(birthYear) AS null_birthYear_count,
      COUNT(*) AS total_count
    FROM base_table
  ), -- recorded finding: large amount of nulls

  null_proportion AS (
    SELECT
      null_birthYear_count,
      total_count,
      -- Explicit DOUBLE cast keeps this a fractional division in any dialect;
      -- NULLIF yields NULL instead of a divide-by-zero on an empty table.
      CAST(null_birthYear_count AS DOUBLE) / NULLIF(total_count, 0) AS null_proportion
    FROM null_check
  )   -- recorded finding: over 95% of values are null

-- Distinctiveness
  -- no more checks

-- Uniformity
  -- no outliers detected

SELECT
  null_birthYear_count,
  total_count,
  null_proportion
FROM null_proportion


DQ CHECK - deathYear (person_death_year)

Notes:
- Over 98% null

In [0]:

-- DQ check for deathYear: how much of the column is missing.
-- Returns a single row: null_count_deathYear, total_count, null_proportion.
WITH

base_table AS (
  SELECT deathYear
  FROM imdb.raw.name_basics
),

-- Completeness
  -- Both counts come from one pass over the table. COUNT(deathYear) skips
  -- NULLs, so its difference from COUNT(*) is the number of null values.
  null_check AS (
    SELECT
      COUNT(*) - COUNT(deathYear) AS null_count_deathYear,
      COUNT(*) AS total_count
    FROM base_table
  ), -- recorded finding: large amounts of null values in deathYear

  null_proportion AS (
    SELECT
      null_count_deathYear,
      total_count,
      -- Explicit DOUBLE cast keeps this a fractional division in any dialect;
      -- NULLIF yields NULL instead of a divide-by-zero on an empty table.
      CAST(null_count_deathYear AS DOUBLE) / NULLIF(total_count, 0) AS null_proportion
    FROM null_check
  )   -- recorded finding: over 98% of values in deathYear are null

-- Distinctiveness
  -- no checks recorded

-- Uniformity
  -- no checks recorded

SELECT
  null_count_deathYear,
  total_count,
  null_proportion
FROM null_proportion


DQ CHECK - primaryProfession (person_primary_profession)  

Notes:
- Convert merged strings into arrays



In [0]:

-- DQ checks for primaryProfession. Only the final SELECT is executed: point
-- it at missing_professions instead of profession_values to run the
-- completeness check.
WITH

source_rows AS (
  SELECT *
  FROM imdb.raw.name_basics
),

-- Completeness: number of rows with no primaryProfession
-- (recorded finding: null values exist, not a large proportion)
missing_professions AS (
  SELECT COUNT(*)
  FROM source_rows
  WHERE primaryProfession IS NULL
),

-- Distinctiveness
  -- no checks recorded

-- Uniformity: every distinct raw value of the column
-- (recorded finding: many values are merged strings)
profession_values AS (
  SELECT DISTINCT primaryProfession
  FROM source_rows
)

SELECT *
FROM profession_values


DQ CHECK - knownForTitles (person_known_for_titles)  

Notes:
- Large proportion of merged strings, convert to arrays 

In [0]:

-- DQ checks for knownForTitles. Only the final SELECT is executed: point it
-- at missing_titles instead of title_values to run the completeness check.
WITH

source_rows AS (
  SELECT *
  FROM imdb.raw.name_basics
),

-- Completeness: number of rows with no knownForTitles
-- (recorded finding: nulls exist, not a significant proportion)
missing_titles AS (
  SELECT COUNT(*)
  FROM source_rows
  WHERE knownForTitles IS NULL
),

-- Distinctiveness
  -- no checks recorded

-- Uniformity: every distinct raw value of the column
-- (recorded finding: heavily populated with merged strings)
title_values AS (
  SELECT DISTINCT knownForTitles
  FROM source_rows
)

  -- check character field relationships in below cell

SELECT *
FROM title_values
