In [1]:
import duckdb
from pathlib import Path

In [134]:
# Everything is resolved relative to the parent of the current working
# directory, so the notebook must be started from a direct sub-folder of it.
base_path = Path.cwd().parent
# Glob pattern (not a single file) matching the JSON files under raw/entity/anime.
storage_path = base_path / 'raw' / 'entity' / 'anime' / '*.json'

'c:\\Users\\JH\\project\\road-to-data-engineer\\graph-search\\graph-search-data-engineering\\raw\\entity\\anime\\*.json'

In [34]:
# Draft type text for the top-level `Page` field in which the nested STRUCTs
# (pageInfo, media) are left without member definitions.
# Not referenced anywhere else in the visible notebook.
columns1 = """
Page: 'STRUCT(
    pageInfo STRUCT,
    media STRUCT
)'
"""

# Mapping of the top-level field name `Page` to a fully spelled-out STRUCT type:
#   - pageInfo: total and currentPage counters
#   - media:    list of structs holding id, title (romaji / english / native) and type
# NOTE(review): this looks like it was meant for the `columns=` argument of
# duckdb.read_json, but the visible read_json call does not pass it - confirm
# whether it is still needed.
columns2 = {
'Page': """
STRUCT(
    pageInfo STRUCT(
        total INT, 
        currentPage INT
    ),
    media STRUCT(
        id BIGINT,
        title STRUCT(
            romaji VARCHAR,
            english VARCHAR,
            native VARCHAR
        ),
        type VARCHAR
    )[]
)
"""
}

In [135]:
# Read every file matched by the storage_path glob into one DuckDB relation;
# format='auto' leaves the JSON layout detection to DuckDB.
df_page = duckdb.read_json(
    str(storage_path),
    format='auto',
)

In [137]:
# Flatten the loaded pages into one row per media entry.
#
# Step 1 (CTE): pull the `Page.media` array out of each df_page row and UNNEST
#               it so every array element becomes its own row (column `media`).
# Step 2:       project the individual fields out of each media JSON object.
#
# `->` keeps the value as JSON (used to walk into nested objects), `->>`
# returns it as text; numeric / boolean / list fields that are needed typed
# are therefore cast explicitly.
#
# `df_page` inside the SQL is the Python variable of the same name, which
# DuckDB resolves by name when the query runs.
df_media = duckdb.sql("""
WITH df_page_media AS (
    SELECT
        UNNEST(
           (json_extract(df_page, '$.Page.media'))::JSON[]
        ) media
    FROM df_page
)
SELECT
    d->'media'->>'id' media_id
    -- each title column reads its own key of the `title` object
    ,d->'media'->'title'->>'romaji' title_romaji
    ,d->'media'->'title'->>'english' title_english
    ,d->'media'->'title'->>'native' title_native
    ,d->'media'->>'type' "type"
    ,d->'media'->>'format' format
    ,d->'media'->>'description' "description"
    ,d->'media'->'startDate'->>'year' "start_date_year"
    ,d->'media'->'startDate'->>'month' "start_date_month"
    ,d->'media'->'startDate'->>'day' "start_date_day"
    ,d->'media'->'endDate'->>'year' "end_date_year"
    ,d->'media'->'endDate'->>'month' "end_date_month"
    ,d->'media'->'endDate'->>'day' "end_date_day"
    ,d->'media'->>'season' "season"
    ,d->'media'->>'seasonYear' "season_year"
    ,d->'media'->>'seasonInt' "season_int"
    ,d->'media'->>'episodes' "episodes"
    ,d->'media'->>'duration' "duration"
    ,d->'media'->>'volumes' "volumes"
    ,d->'media'->>'countryOfOrigin' "country_of_origin"
    ,d->'media'->>'source' "source"
    ,(d->'media'->>'averageScore')::INT "averageScore"
    ,(d->'media'->>'meanScore')::INT "meanScore"
    ,(d->'media'->>'popularity')::INT64 "popularity"
    ,(d->'media'->>'trending')::INT "trending"
    ,(d->'media'->>'favourites')::INT "favourites"
    ,(d->'media'->>'isAdult')::BOOLEAN "isAdult"
    -- list-valued fields stay as lists; later cells UNNEST them per media_id
    ,(d->'media'->'genres')::VARCHAR[] "genres"
    ,(d->'media'->'synonyms')::VARCHAR[] "synonyms"
    ,(d->'media'->'tags')::JSON[] "tags"
    ,(d->'media'->'externalLinks')::JSON[] "external_links"
FROM df_page_media d
""")

In [138]:
# Explode the `genres` list of df_media: one output row per (media_id, genre).
df_media_genres = duckdb.sql("""
SELECT 
    media_id, UNNEST(genres) "genres"
FROM df_media
""")

In [139]:
# Explode the `synonyms` list of df_media: one output row per (media_id, synonym).
df_media_synonym = duckdb.sql("""
SELECT 
    media_id, UNNEST(synonyms) "synonyms"
FROM df_media
""")

In [140]:
# One row per (media, tag): the CTE explodes the `tags` JSON list of df_media,
# then the id and name of every tag object are extracted as text.
df_media_tags = duckdb.sql("""
WITH tb_media_tags AS (
    SELECT
        media_id, UNNEST(tags) "tags"
    FROM df_media
)
SELECT
    media_id
    ,d->'tags'->>'id' "tag_id"
    ,d->'tags'->>'name' "tag_name"
FROM tb_media_tags d
""")
# The relation was originally bound to the misspelled name `df_meda_tags`;
# keep that name as an alias so any code still referring to it keeps working.
df_meda_tags = df_media_tags

In [141]:
# One row per (media, external link): the CTE explodes the `external_links`
# JSON list of df_media into a `links` column, then each field of the link
# object is extracted as text (`->>`), so every output column is textual.
df_media_link = duckdb.sql("""
WITH tb_media_link AS (
SELECT 
    media_id, UNNEST(external_links) "links"
FROM df_media
)
SELECT
    media_id
    ,d->'links'->>'id' "link_id"
    ,d->'links'->>'url' "url"
    ,d->'links'->>'site' "site"
    ,d->'links'->>'type' "type"
    ,d->'links'->>'siteId' "siteId"
    ,d->'links'->>'language' "language"
FROM tb_media_link d
""")

In [145]:
# Seed list: the distinct (media_id, title_english) pairs found in df_media.
media_seed = duckdb.sql("""
SELECT DISTINCT media_id, title_english
FROM df_media
""")

In [146]:
# Location of the seed CSV, first relative to base_path, then joined onto it.
seed_storage_path = Path('raw', 'seed', 'top-anime.csv')
seed_path = base_path.joinpath(seed_storage_path)

In [147]:
# Make sure the target folder exists first: the CSV export does not create
# missing parent directories and would otherwise fail on a fresh checkout.
seed_path.parent.mkdir(parents=True, exist_ok=True)
# Write the seed relation to disk as CSV.
media_seed.to_csv(str(seed_path))