In [1]:
import duckdb
from pathlib import Path
import datetime

In [2]:
# Capture the run timestamp once so every later cell stamps its output with
# the same date. datetime.now() without a tz argument returns naive local
# time, so the date reflects the machine's local timezone, not UTC.
ts = datetime.datetime.now()

In [3]:
# All locations are resolved relative to the parent of the current working
# directory, so the notebook must be run from a direct child of the project
# root for these paths to line up.
base_path = Path.cwd().parent

# Directory whose *.json files are read by the query below.
source_path = base_path / 'raw/entity/anime-studio'

# Dated Parquet snapshot written at the end of the notebook.
# NOTE(review): month and day are interpolated as plain ints (no zero
# padding), e.g. "studio-2024-1-5.parquet", so file names do not sort
# chronologically as strings. Left unchanged in case readers rely on it.
target_path = base_path / f'silver/anilist/studio/studio-{ts.year}-{ts.month}-{ts.day}.parquet'

In [6]:
# Build the anime -> studio table from the JSON files under `source_path`.
#
# Pipeline (one CTE per step):
#   source              - loads every `*.json` file in `source_path` with
#                         read_json_auto.
#   tb_edge_list        - calls json_extract with a list of two paths
#                         ('$.data.Media.id', '$.data.Media.studios.edges');
#                         the result is used as a two-element list.
#                         NOTE(review): the first argument is the bare name
#                         `source` - presumably DuckDB resolves it to the
#                         whole row of the `source` CTE; confirm.
#   tb_unnest_node_list - element [1] is kept as anime_id (DuckDB list
#                         indexing starts at 1); element [2] is cast to
#                         JSON[] and unnested, giving one row per edge.
#   tb_extract_node     - pulls five fields out of each edge as strings:
#                         id, isMain, node.id, node.name,
#                         node.isAnimationStudio.
#   rename              - casts the positional values to typed, named
#                         columns: anime_id INT, studio_edge_id INT,
#                         isMain BOOLEAN, studio_id INT, name VARCHAR,
#                         isAnimationStudio BOOLEAN.
#
# `source_path` is interpolated straight into the SQL text, so a path
# containing a single quote would break the query.
tb_studio = duckdb.sql(f"""
    WITH source AS(
        SELECT * FROM read_json_auto('{str(source_path)}/*.json')
    )
    , tb_edge_list AS (
        SELECT
        json_extract(source,
        [
            '$.data.Media.id'
            , '$.data.Media.studios.edges'
        ]) edges_list
        FROM source
    )
    , tb_unnest_node_list AS (
        SELECT
            edges_list[1] AS anime_id
        ,   unnest(edges_list[2]::JSON[]) as edge
        FROM tb_edge_list
    )
    , tb_extract_node AS (
        SELECT
        anime_id
        , json_extract_string(edge, [
            '$.id'
            , '$.isMain'
            , '$.node.id'
            , '$.node.name'
            , '$.node.isAnimationStudio'
        ]) nodes
        FROM tb_unnest_node_list
    ), rename AS(
        SELECT
        anime_id::INT as anime_id
        , nodes[1]::INT AS studio_edge_id
        , nodes[2]::BOOLEAN AS isMain
        , nodes[3]::INT AS studio_id
        , nodes[4]::VARCHAR AS name
        , nodes[5]::BOOLEAN AS isAnimationStudio
        FROM tb_extract_node
    )
        SELECT *
        FROM rename
""")


In [9]:
# Persist the studio table as a dated Parquet snapshot.
# The Parquet writer opens the target file directly and does not create
# missing parent directories, so on a fresh checkout the write would fail.
# Create the output directory first; exist_ok keeps re-runs idempotent.
target_path.parent.mkdir(parents=True, exist_ok=True)
tb_studio.to_parquet(str(target_path))