In [1]:
import duckdb


In [2]:
# Shared DuckDB connection for the whole notebook. Every cell below runs its
# SQL through `conn`, so the views they create live on this connection.
# NOTE(review): connect() is called without a database path, which should give
# an in-memory database (nothing persisted besides the Parquet exports) —
# confirm against the DuckDB Python API docs.
conn = duckdb.connect()


## Dockets

In [None]:
# Dockets ETL: read every raw docket JSON file, pull selected attributes out
# of the JSON, and export the result as Parquet under 'parquet/dockets',
# partitioned by (agency_code, year).
#
# Path layout: the glob is 'mirrulations/bulk/raw-data/*/*/*/docket/*.json',
# so '/'-separated segments 4 and 5 of `filename` are the first two wildcard
# directories, used here as agency code and docket ID.
# NOTE(review): this assumes read_text() reports `filename` as the matched
# relative path with '/' separators — confirm on the target platform.
#
# Year extraction (bug fix): the year used to be taken as the 2nd '-'-separated
# token of the docket ID. That is wrong for IDs whose second token is not the
# year (e.g. a sub-agency code, as in 'EPA-HQ-OAR-2021-0208' -> 'HQ'), which
# also produced bogus `year=` partitions. The view now takes the first
# hyphen-delimited 19xx/20xx token and falls back to the old 2nd-token rule
# when no such token exists, so IDs that were already handled correctly yield
# the same value as before.
#
# NOTE(review): the ::TIMESTAMP cast is strict, so a single malformed value
# aborts the whole COPY. Also, re-running the COPY into an already populated
# 'parquet/dockets' directory may be rejected by DuckDB unless an overwrite
# option is supplied — confirm for the installed DuckDB version.
query = """\
CREATE OR REPLACE VIEW src_docket_files AS
SELECT
  filename,
  content,
  split_part(filename, '/', 4) as agency_code,
  split_part(filename, '/', 5) as docket_id,
  coalesce(
    nullif(regexp_extract(split_part(filename, '/', 5), '-((?:19|20)[0-9]{2})-', 1), ''),
    split_part(split_part(filename, '/', 5), '-', 2)
  ) as year,
FROM read_text('mirrulations/bulk/raw-data/*/*/*/docket/*.json');

CREATE OR REPLACE VIEW docket_parsed AS
SELECT
  f.agency_code,
  f.docket_id,
  f.year,

  json_extract_string(f.content, '$.data.attributes.docketType') as docket_type,
  json_extract_string(f.content, '$.data.attributes.modifyDate')::TIMESTAMP as modify_date,
  json_extract_string(f.content, '$.data.attributes.title') as title,

  f.content AS raw_json
FROM src_docket_files f;

COPY (
  SELECT *
  FROM docket_parsed
) TO 'parquet/dockets'
  (FORMAT PARQUET,
   PARTITION_BY (agency_code, year),
   COMPRESSION SNAPPY);
"""

# Runs the view definitions and the export on the shared connection.
conn.query(query)


## Documents

In [3]:
# Documents ETL: read every raw document JSON file, pull selected attributes
# out of the JSON, and export the result as Parquet under 'parquet/documents',
# partitioned by (agency_code, year).
#
# Path layout: the glob is 'mirrulations/bulk/raw-data/*/*/*/documents/*.json',
# so '/'-separated segments 4 and 5 of `filename` are the first two wildcard
# directories, used here as agency code and docket ID.
# NOTE(review): this assumes read_text() reports `filename` as the matched
# relative path with '/' separators — confirm on the target platform.
#
# Year extraction (bug fix): the year used to be taken as the 2nd '-'-separated
# token of the docket ID. That is wrong for IDs whose second token is not the
# year (e.g. a sub-agency code, as in 'EPA-HQ-OAR-2021-0208' -> 'HQ'), which
# also produced bogus `year=` partitions. The view now takes the first
# hyphen-delimited 19xx/20xx token and falls back to the old 2nd-token rule
# when no such token exists, so IDs that were already handled correctly yield
# the same value as before.
#
# NOTE(review): the ::TIMESTAMP / ::INT / ::BOOLEAN casts are strict, so a
# single malformed value aborts the whole COPY. Also, re-running the COPY into
# an already populated 'parquet/documents' directory may be rejected by DuckDB
# unless an overwrite option is supplied — confirm for the installed version.
query = """\
CREATE OR REPLACE VIEW src_document_files AS
SELECT
  filename,
  content,
  split_part(filename, '/', 4) as agency_code,
  split_part(filename, '/', 5) as docket_id,
  coalesce(
    nullif(regexp_extract(split_part(filename, '/', 5), '-((?:19|20)[0-9]{2})-', 1), ''),
    split_part(split_part(filename, '/', 5), '-', 2)
  ) as year,
FROM read_text('mirrulations/bulk/raw-data/*/*/*/documents/*.json');

CREATE OR REPLACE VIEW documents_parsed AS
SELECT
  f.agency_code,
  f.docket_id,
  f.year,

  json_extract_string(f.content, '$.data.id') as document_id,

  json_extract_string(f.content, '$.data.attributes.category') as category,

  json_extract_string(f.content, '$.data.attributes.documentType') as document_type,

  json_extract_string(f.content, '$.data.attributes.commentStartDate')::TIMESTAMP as comment_start_date,
  json_extract_string(f.content, '$.data.attributes.commentEndDate')::TIMESTAMP as comment_end_date,
  json_extract_string(f.content, '$.data.attributes.modifyDate')::TIMESTAMP as modify_date,
  json_extract_string(f.content, '$.data.attributes.postedDate')::TIMESTAMP as posted_date,
  json_extract_string(f.content, '$.data.attributes.receiveDate')::TIMESTAMP as receive_date,

  json_extract_string(f.content, '$.data.attributes.pageCount')::INT as page_count,
  json_extract_string(f.content, '$.data.attributes.withdrawn')::BOOLEAN as withdrawn,

  f.content AS raw_json
FROM src_document_files f;

COPY (
  SELECT *
  FROM documents_parsed
) TO 'parquet/documents'
  (FORMAT PARQUET,
   PARTITION_BY (agency_code, year),
   COMPRESSION SNAPPY);
"""

# Runs the view definitions and the export on the shared connection.
conn.query(query)


## Comments

In [4]:
# Comments ETL: read every raw comment JSON file, pull selected attributes out
# of the JSON, and export the result as Parquet under 'parquet/comments',
# partitioned by (agency_code, year, docket_id).
#
# Path layout: the glob is 'mirrulations/bulk/raw-data/*/*/*/comments/*.json',
# so '/'-separated segments 4 and 5 of `filename` are the first two wildcard
# directories, used here as agency code and docket ID.
# NOTE(review): this assumes read_text() reports `filename` as the matched
# relative path with '/' separators — confirm on the target platform.
#
# Year extraction (bug fix): the year used to be taken as the 2nd '-'-separated
# token of the docket ID. That is wrong for IDs whose second token is not the
# year (e.g. a sub-agency code, as in 'EPA-HQ-OAR-2021-0208' -> 'HQ'), which
# also produced bogus `year=` partitions. The view now takes the first
# hyphen-delimited 19xx/20xx token and falls back to the old 2nd-token rule
# when no such token exists, so IDs that were already handled correctly yield
# the same value as before.
#
# NOTE(review): the ::TIMESTAMP / ::BOOLEAN casts are strict, so a single
# malformed value aborts the whole COPY. Also, re-running the COPY into an
# already populated 'parquet/comments' directory may be rejected by DuckDB
# unless an overwrite option is supplied — confirm for the installed version.
query = """\
CREATE OR REPLACE VIEW src_comment_files AS
SELECT
  filename,
  content,
  split_part(filename, '/', 4) as agency_code,
  split_part(filename, '/', 5) as docket_id,
  coalesce(
    nullif(regexp_extract(split_part(filename, '/', 5), '-((?:19|20)[0-9]{2})-', 1), ''),
    split_part(split_part(filename, '/', 5), '-', 2)
  ) as year,

FROM read_text('mirrulations/bulk/raw-data/*/*/*/comments/*.json');

CREATE OR REPLACE VIEW comments_parsed AS
SELECT
  f.agency_code,
  f.docket_id,
  f.year,

  json_extract_string(f.content, '$.data.id') as comment_id,

  json_extract_string(f.content, '$.data.attributes.category') as category,
  json_extract_string(f.content, '$.data.attributes.comment') as comment,
  json_extract_string(f.content, '$.data.attributes.documentType') as document_type,
  json_extract_string(f.content, '$.data.attributes.modifyDate')::TIMESTAMP as modify_date,
  json_extract_string(f.content, '$.data.attributes.postedDate')::TIMESTAMP as posted_date,
  json_extract_string(f.content, '$.data.attributes.receiveDate')::TIMESTAMP as receive_date,
  json_extract_string(f.content, '$.data.attributes.subtype') as subtype,
  json_extract_string(f.content, '$.data.attributes.title') as title,
  json_extract_string(f.content, '$.data.attributes.withdrawn')::BOOLEAN as withdrawn,

  f.content AS raw_json
FROM src_comment_files f;

COPY (
  SELECT *
  FROM comments_parsed
) TO 'parquet/comments'
  (FORMAT PARQUET,
   PARTITION_BY (agency_code, year, docket_id),
   COMPRESSION SNAPPY);
"""

# Runs the view definitions and the export on the shared connection.
conn.query(query)
