In [0]:
%run ./00_config

### 1. bronze

In [0]:
# Bronze stage: (re)create receipt_bronze from the files under the volume's
# binary/pdf directory. Each file is read as a binary file, its `content`
# is passed through ai_parse_document, and `path` is kept next to the result.
bronze_ctas_sql = f"""
-- ボリュームのファイルの文字を抽出し、bronzeテーブルを作成
CREATE OR REPLACE TABLE {MY_CATALOG}.{MY_SCHEMA}.receipt_bronze AS
SELECT
  path,
  ai_parse_document(content) AS content
FROM
  READ_FILES(
    '/Volumes/{MY_CATALOG}/{MY_SCHEMA}/{MY_VOLUME}/binary/pdf',
    format => 'binaryFile'
  );
"""
spark.sql(bronze_ctas_sql)

### 2. silver

In [0]:
# Instruction prompt (Japanese) prepended to every parsed document. It asks
# the model for a JSON array with the keys 発行日 / 請求先名 / 請求元 / 明細 /
# 小計 / 消費税 / 合計金額 / 備考 and includes a format sample.
prompt = """
貴方はテキスト整理のプロフェッショナルです。請求書についてJSON形式でまとめてください。parse_jsonできるように余計な文字列は入れないでください。
発行日（yyyy-MM-dd形式）、請求先名、請求元（郵便番号、住所、企業名、部署名）、明細（商品名、数量、単価、金額）、小計、消費税、合計金額、備考をまとめて一つのテキストに整形してください。
Keyはそれぞれ"発行日"、"請求先名"、"請求元"、"明細"、"小計"、"消費税"、"合計金額"、"備考"、でお願いします。
[で始まり、]で終わるJSONで返してください。``````という文字列は抜いてください。
数値（数量、単価、金額、小計、消費税金額、合計金額）は数字型で扱えるように単位は抜いてください。
不明な場合は空白("") or nullにしてください。
フォーマットサンプル：
[
  {
    "発行日": "1900-01-01",
    "請求先名": "",
    "請求元": {
      "郵便番号": "",
      "住所": "",
      "企業名": "",
      "部署名": ""
    },
    "明細": [
      {
        "商品名": "",
        "数量": 0,
        "単価": 0,
        "金額": 0
      }
    ],
    "小計": 0,
    "消費税": 0,
    "合計金額": 0,
    "備考": ""
  }
]
"""

# The prompt is embedded in a single-quoted SQL string literal below, so a
# single quote would terminate the literal and a backslash would be treated
# as an escape character. Escape both (backslashes first) so the prompt can
# be edited freely without breaking or silently altering the statement.
prompt_sql_literal = prompt.replace("\\", "\\\\").replace("'", "\\'")

# Silver stage: (re)create receipt_silver with one row per bronze row,
# holding `path` and the model's answer to prompt + content as `summary`.
# NOTE(review): `content` is the ai_parse_document output stored in
# receipt_bronze; this relies on `||` accepting that column's type - confirm.
spark.sql(f"""
-- bronzeテーブルから必要項目を切り出して、Silverをテーブルを作成
CREATE OR REPLACE TABLE {MY_CATALOG}.{MY_SCHEMA}.receipt_silver AS
SELECT
  path,
  ai_query(
    'databricks-claude-sonnet-4',
    '{prompt_sql_literal}' || content
  ) AS summary
FROM
  {MY_CATALOG}.{MY_SCHEMA}.receipt_bronze
""")

In [0]:
# prompt = """
# You are a professional at organizing text. Please summarize the invoice in JSON format. Do not include any unnecessary strings so that it can be parsed with parse_json.
# Please organize the following items into a single text: Issue date (yyyy-MM-dd format), Billing recipient name, Billing sender (postal code, address, company name, department name), Details (product name, quantity, unit price, amount), Subtotal, Consumption tax, Total amount, Remarks.
# Please use the following keys: "発行日", "請求先名", "請求元", "明細", "小計", "消費税", "合計金額", "備考".
# Return the result as JSON starting with [ and ending with ]. Do not include the string ``````.
# For numeric values (quantity, unit price, amount, subtotal, consumption tax, total amount), remove units so that they can be handled as numbers.
# If any item is unknown, leave it as an empty string ("") or null.
# Format sample:
# [
#   {
#     "発行日": "1900-01-01",
#     "請求先名": "",
#     "請求元": {
#       "郵便番号": "",
#       "住所": "",
#       "企業名": "",
#       "部署名": ""
#     },
#     "明細": [
#       {
#         "商品名": "",
#         "数量": 0,
#         "単価": 0,
#         "金額": 0
#       }
#     ],
#     "小計": 0,
#     "消費税": 0,
#     "合計金額": 0,
#     "備考": ""
#   }
# ]
# """

# spark.sql(f"""
# -- bronzeテーブルから必要項目を切り出して、Silverをテーブルを作成
# CREATE OR REPLACE TABLE {MY_CATALOG}.{MY_SCHEMA}.receipt_silver AS
# SELECT
#   path,
#   ai_query(
#     'databricks-claude-sonnet-4',
#     '{prompt}' || content
#   ) AS summary
# FROM
#   {MY_CATALOG}.{MY_SCHEMA}.receipt_bronze
# """)

In [0]:
# Preview the silver table. The table is addressed by its fully-qualified
# name, like the cells that create it, so the preview does not depend on
# which catalog/schema happens to be current in the session.
display(spark.sql(f"SELECT * FROM {MY_CATALOG}.{MY_SCHEMA}.receipt_silver"))

### 3. gold

In [0]:
# Gold stage: (re)create receipt_gold by parsing the JSON text in
# receipt_silver.summary and flattening it to one row per invoice line item.
#
# - `id` is a fresh uuid() per output row.
# - `file_name` is the last '/'-separated segment of `path` with a trailing
#   ".pdf" removed. The pattern is written as '[.]pdf$' on purpose: a
#   backslash-escaped dot would have to survive both Python and SQL string
#   unescaping, and a pattern reduced to '.pdf$' would also match any other
#   character in front of "pdf".
# - Numeric fields are read as DOUBLE by from_json and cast to BIGINT.
# - `total_amount_with_tax` is computed as 小計 + 消費税; the 合計金額 key
#   is not part of the from_json schema, so the model's own total is not read.
# - Both LATERAL VIEWs are non-OUTER, so a summary that does not parse into
#   the schema, or an invoice without line items, produces no rows.
spark.sql(f"""
-- json形式に格納されたSilverテーブルを展開して、Goldテーブルを作成
CREATE OR REPLACE TABLE {MY_CATALOG}.{MY_SCHEMA}.receipt_gold AS
SELECT
  uuid() AS id,
  regexp_replace(
    split(path, '/')[size(split(path, '/'))-1],
    '[.]pdf$',
    ''
  ) AS file_name,
  parsed.`発行日` AS issue_date,
  parsed.`請求先名` AS billing_name,
  parsed.`請求元`.`郵便番号` AS supplier_zip,
  parsed.`請求元`.`住所` AS supplier_address,
  parsed.`請求元`.`企業名` AS supplier_company,
  parsed.`請求元`.`部署名` AS supplier_department,
  details.`商品名` AS item_name,
  CAST(details.`数量` AS BIGINT) AS item_qty,
  CAST(details.`単価` AS BIGINT) AS item_unit_price,
  CAST(details.`金額` AS BIGINT) AS item_amount,
  CAST(parsed.`小計` AS BIGINT) AS subtotal,
  CAST(parsed.`消費税` AS BIGINT) AS consumption_tax,
  CAST(parsed.`小計` AS BIGINT) + CAST(parsed.`消費税` AS BIGINT) AS total_amount_with_tax,
  parsed.`備考` AS remarks
FROM
  {MY_CATALOG}.{MY_SCHEMA}.receipt_silver
LATERAL VIEW
  EXPLODE(
    from_json(
      summary,
      'ARRAY<STRUCT<`発行日`:STRING,`請求先名`:STRING,`請求元`:STRUCT<`郵便番号`:STRING,`住所`:STRING,`企業名`:STRING,`部署名`:STRING>,`明細`:ARRAY<STRUCT<`商品名`:STRING,`数量`:DOUBLE,`単価`:DOUBLE,`金額`:DOUBLE>>,`小計`:DOUBLE,`消費税`:DOUBLE,`備考`:STRING>>'
    )
  ) exploded_receipt AS parsed
LATERAL VIEW
  EXPLODE(parsed.`明細`) details AS details
""")

In [0]:
# Turn on the Delta change data feed for the gold table by setting the
# delta.enableChangeDataFeed table property to true.
enable_cdf_sql = f"""
-- Goldテーブルに対してCDFを有効化
ALTER TABLE {MY_CATALOG}.{MY_SCHEMA}.receipt_gold
SET TBLPROPERTIES (delta.enableChangeDataFeed = true);
"""
spark.sql(enable_cdf_sql)

In [0]:
# Target table and the name of the primary-key constraint managed below.
TABLE_PATH = f'{MY_CATALOG}.{MY_SCHEMA}.receipt_gold'
PK_CONSTRAINT_NAME = 'pk_receipt_gold'

# Columns to mark NOT NULL; this runs before the primary key is added.
columns_to_set_not_null = ['id']

for column in columns_to_set_not_null:
    set_not_null_sql = f"""
    ALTER TABLE {TABLE_PATH}
    ALTER COLUMN {column} SET NOT NULL;
    """
    spark.sql(set_not_null_sql)

# Recreate the primary key on `id`: drop the constraint if it already
# exists, then add it again.
drop_pk_sql = f'''
ALTER TABLE {TABLE_PATH} DROP CONSTRAINT IF EXISTS {PK_CONSTRAINT_NAME};
'''
add_pk_sql = f'''
ALTER TABLE {TABLE_PATH}
ADD CONSTRAINT {PK_CONSTRAINT_NAME} PRIMARY KEY (id);
'''
spark.sql(drop_pk_sql)
spark.sql(add_pk_sql)

# # Check
# display(
#     spark.sql(f'''
#     DESCRIBE EXTENDED {TABLE_PATH}
#     '''))

In [0]:
# Best-effort: attach the "certified" tag to the gold table. A failure is
# reported via print() instead of raised, so the rest of the notebook can
# still run when the tag cannot be applied.
certified_tag = 'system.Certified'

# Address the table by its fully-qualified name, like the cells that create
# and alter it, so the statement does not depend on the session's current
# catalog/schema (a wrong default would otherwise be hidden by the except).
certified_table = f'{MY_CATALOG}.{MY_SCHEMA}.receipt_gold'

try:
    spark.sql(f"ALTER TABLE {certified_table} SET TAGS ('{certified_tag}')")
    print(f"認定済みタグ '{certified_tag}' の追加が完了しました。")

except Exception as e:
    print(f"認定済みタグ '{certified_tag}' の追加中にエラーが発生しました: {str(e)}")
    print("このエラーはタグ機能に対応していないワークスペースで実行した場合に発生する可能性があります。")

In [0]:
def _sql_string_literal(text):
    """Return `text` as a single-quoted SQL string literal.

    Backslashes and single quotes are backslash-escaped (backslashes first)
    so the text can neither terminate the literal nor be changed by SQL
    escape processing.
    """
    escaped = text.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Table to annotate.
table_name = f'{MY_CATALOG}.{MY_SCHEMA}.receipt_gold'

# Table-level comment (table name and a short description, in Japanese).
table_comment = """
テーブル名：`receipt_gold / 領収書（ゴールド）`
説明：領収書データをパースして構造化したテーブルです。分析用に使います。
"""
spark.sql(f'COMMENT ON TABLE {table_name} IS {_sql_string_literal(table_comment)}')

# Column-level comments, keyed by column name of receipt_gold.
column_comments = {
    "id": "自動採番したユニークID",
    "file_name": "アップロードされたPDFのファイル名",
    "issue_date": "発行日",
    "billing_name": "請求先名",
    "supplier_zip": "請求元 郵便番号",
    "supplier_address": "請求元 住所",
    "supplier_company": "請求元 企業名",
    "supplier_department": "請求元 部署名",
    "item_name": "明細 商品名",
    "item_qty": "明細 数量",
    "item_unit_price": "明細 単価",
    "item_amount": "明細 金額",
    "subtotal": "小計（税抜）",
    "consumption_tax": "消費税金額",
    "total_amount_with_tax": "合計金額（税込）",
    "remarks": "備考"
}

# The loop uses its own names so it does not overwrite the table comment.
for column_name, column_comment in column_comments.items():
    spark.sql(
        f"ALTER TABLE {table_name} ALTER COLUMN {column_name} "
        f"COMMENT {_sql_string_literal(column_comment)}"
    )


## Agent Bricks

In [0]:
# Hard-coded sample record printed as JSON text. Its keys mirror the
# receipt_gold column names, with the line items nested under "details".
# NOTE(review): presumably an example output for the Agent Bricks section
# this cell sits under - confirm the intended use.
agent_bricks_sample_json = """
[
  {
    "issue_date": "2024-12-21",
    "billing_name": "ブリックステンレス株式会社",
    "supplier_zip": "150-0002",
    "supplier_address": "東京都公共交通合2-12-19 ΔΔΔビル 1 F",
    "supplier_company": "株式会社パーパスメディア",
    "supplier_department": "営業推進部",
    "details": [
      {
        "item_name": "プロジェクター短期レンタル",
        "item_qty": null,
        "item_unit_price": null,
        "item_amount": 407957393
      }
    ],
    "subtotal": 370870358,
    "consumption_tax": 37087035,
    "total_amount_with_tax": 407957393,
    "remarks": "プロジェクター短期レンタル代金として"
  }
]
"""
print(agent_bricks_sample_json)