# AWS Python SDK(boto3) DataZone Client의 Search 메서드를 이용하여 Asset 메타데이터 저장

- 아래 **현재 실행 중인 IAM 정보**는 실제 호출할 주체를 확인하기 위해 실행



In [1]:
import boto3

# Ask STS which principal the current credentials resolve to, so it is clear
# which IAM user/role will issue the DataZone calls in the following cells.
sts = boto3.client("sts")
identity = sts.get_caller_identity()

print("✅ 현재 실행 중인 IAM Identity 정보:")
# (label, response key) pairs; labels are left-aligned to 10 columns so the
# colons line up in the printed report.
for label, key in (("Account ID", "Account"), ("UserId", "UserId"), ("Arn", "Arn")):
    print(f"{label:<10} : {identity[key]}")


✅ 현재 실행 중인 IAM Identity 정보:
Account ID : 533616270150
UserId     : AIDAXYPQCDNDNSVSIPZTE
Arn        : arn:aws:iam::533616270150:user/jh.bae@sk.com


### Asset 메타데이터 전처리 과정을 **SMUS Notebook** 내에서 실행할 경우에는 아래와 같이 출력됨.

✅ **현재 실행 중인 IAM Identity 정보**

| 항목 | 값 |
|------|----|
| Account ID | `533616270150` |
| UserId | `AROAXYPQCDNDMUFTA7JFO:SageMaker` |
| Arn | `arn:aws:sts::533616270150:assumed-role/datazone_usr_role_bknisgxzopjuhk_4oa8qs5krhk8ig/SageMaker` |

---

### SMUS Notebook에서 실행할 경우 아래 정책을 **`datazone_usr_role_bknisgxzopjuhk_4oa8qs5krhk8ig`** 역할에 추가해야 함:

```json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "AllowDataZoneReadAccess",
      "Effect": "Allow",
      "Action": [
        "datazone:Search",
        "datazone:GetAsset",
        "datazone:GetDomain",
        "datazone:ListProjects",
        "datazone:GetProject"
      ],
      "Resource": [
        "arn:aws:datazone:ap-northeast-2:533616270150:domain/dzd-cjvglgj4d43fmg",
        "arn:aws:datazone:ap-northeast-2:533616270150:domain/dzd-cjvglgj4d43fmg/*"
      ]
    }
  ]
}
```


### 먼저 단순히 SMUS Domain 내 전체 Asset을 조회
- AWS SDK DataZone Client의 Search 메서드를 사용하며, 자세한 정보는 아래 링크에서 확인 가능
- https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/datazone/client/search.html
- Search 메서드는 searchScope 기준으로 조회하며, 아래 4가지 값을 지원
- searchScope: "ASSET" || "GLOSSARY" || "GLOSSARY_TERM" || "DATA_PRODUCT"

In [2]:
# ==== List every asset in the domain ====
import boto3

dz = boto3.client("datazone", region_name="ap-northeast-2")
domain_id = "dzd-cjvglgj4d43fmg"  # DataZone domain ID

# 1) Collect the ID of every project in the domain.
#    list_projects is a paginated API, so walk all pages; reading only the
#    first response would silently skip projects beyond the first page.
project_ids = []
for page in dz.get_paginator("list_projects").paginate(domainIdentifier=domain_id):
    project_ids.extend(project["id"] for project in page.get("items", []))

# 2) Search the assets owned by each project. The paginator follows
#    nextToken until the result set is exhausted.
search_paginator = dz.get_paginator("search")
all_assets = []

for pid in project_ids:
    pages = search_paginator.paginate(
        domainIdentifier=domain_id,
        owningProjectIdentifier=pid,
        searchScope="ASSET",
    )
    for page in pages:
        for item in page.get("items", []):
            asset_item = item.get("assetItem", {})
            all_assets.append({
                "id": asset_item.get("identifier"),
                "name": asset_item.get("name"),
                "type": asset_item.get("typeIdentifier"),
                "project": asset_item.get("owningProjectId"),
            })

print(f"✅ Found {len(all_assets)} assets in domain {domain_id}")
for a in all_assets:
    print(f"{a['id']} | {a['name']} | {a['type']} | {a['project']}")


✅ Found 61 assets in domain dzd-cjvglgj4d43fmg
buw7nbnbc015zs | amazon.titan-embed-text-v2:0 | amazon.datazone.BedrockModelAssetType | 4fibhopyretcwo
698iz9zfzqwd20 | anthropic.claude-3-haiku-20240307-v1:0 | amazon.datazone.BedrockInferenceOnlyAssetType | 4fibhopyretcwo
3zxsp1gk25yh3s | anthropic.claude-3-5-sonnet-20241022-v2:0 | amazon.datazone.BedrockModelAssetType | 4fibhopyretcwo
clbnw7jnd0wh3s | anthropic.claude-3-5-sonnet-20240620-v1:0 | amazon.datazone.BedrockInferenceOnlyAssetType | 4fibhopyretcwo
cbye31pqw1zk0o | anthropic.claude-3-sonnet-20240229-v1:0 | amazon.datazone.BedrockInferenceOnlyAssetType | 4fibhopyretcwo
d3t49d9elc62l4 | amazon.nova-lite-v1:0 | amazon.datazone.BedrockInferenceOnlyAssetType | 4fibhopyretcwo
daj1vb349aqffc | amazon.nova-pro-v1:0 | amazon.datazone.BedrockModelAssetType | 4fibhopyretcwo
cd7vfswrgovxnc | amazon.nova-micro-v1:0 | amazon.datazone.BedrockModelAssetType | 4fibhopyretcwo
b2v1vakdvzrjbs | anthropic.claude-3-7-sonnet-20250219-v1:0 | amazon.dat

### DataZone Client의 Search 결과는 Asset의 typeIdentifier 기준으로 걸러서 특정 타입만 조회하는 것도 가능
### 아래 예시에서는 ASSET 중 GlueTableAssetType만 조회하고 Project Name도 같이 조회

In [3]:
import boto3

dz = boto3.client("datazone", region_name="ap-northeast-2")
domain_id = "dzd-cjvglgj4d43fmg"

# Only assets whose typeIdentifier equals this value are kept.
GLUE_TABLE_TYPE = "amazon.datazone.GlueTableAssetType"

# 1) Build a project id -> project name map for the whole domain.
#    list_projects is paginated; walking every page avoids dropping the
#    projects that do not fit in the first response.
projects = {}
for page in dz.get_paginator("list_projects").paginate(domainIdentifier=domain_id):
    for project in page.get("items", []):
        projects[project["id"]] = project["name"]

# 2) Search each project's assets and keep the Glue table assets only.
#    The type filter is applied client-side on the search results.
search_paginator = dz.get_paginator("search")
glue_assets = []

for project_id, project_name in projects.items():
    pages = search_paginator.paginate(
        domainIdentifier=domain_id,
        owningProjectIdentifier=project_id,
        searchScope="ASSET",
    )
    for page in pages:
        for item in page.get("items", []):
            asset_item = item.get("assetItem", {})
            if asset_item.get("typeIdentifier") != GLUE_TABLE_TYPE:
                continue
            glue_assets.append({
                "id": asset_item.get("identifier"),
                "name": asset_item.get("name"),
                "type": asset_item.get("typeIdentifier"),
                "project_id": project_id,
                "project_name": project_name,
            })

# 3) Print the result.
print(f"✅ Found {len(glue_assets)} GlueTableAssetType assets in domain {domain_id}\n")
for a in glue_assets:
    print(f"{a['id']} | {a['name']} | {a['type']} | {a['project_name']} ({a['project_id']})")


✅ Found 33 GlueTableAssetType assets in domain dzd-cjvglgj4d43fmg

bl4k4rbfix626w | cloudtrail_logs | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
4i2y1zpe7arac8 | cloudtrail_raw_json | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
4et4gjf44dygwo | 20250527_1042 | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
4aakl0fpfjnvko | amzn_us | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
bmea7wll7zgs9k | weather | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
5vl151z62nio0o | daily_sales | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
62yh7xofkhy1i0 | weather | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
4cmg8yh79fkjh4 | nas_add | amazon.datazone.GlueTableAssetType | admin-project-533616270150 (akis412ankp388)
ds3hobtmvwdspk | house-price | amazon.d

### 아래 예시는 특정 프로젝트 내의 전체 Asset 조회 후 get_asset 메서드로 각 Asset의 메타데이터를 출력
- get_asset 메서드 정보
- https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/datazone/client/get_asset.html

In [4]:
import boto3
import json

region = "ap-northeast-2"
dz = boto3.client("datazone", region_name=region)

domain_id = "dzd-cjvglgj4d43fmg"
project_id = "bknisgxzopjuhk"


def _schema_from_forms(asset_detail):
    """Return the first non-empty ``columns`` list found in the asset's forms.

    Each entry of ``formsOutput`` carries its payload as a JSON string in
    ``content``. Returns None when no form exposes columns.

    NOTE(review): assumes table-like forms (e.g. the Glue table form) store
    their columns under a top-level ``columns`` key -- confirm against a real
    get_asset response.
    """
    for form in asset_detail.get("formsOutput") or []:
        try:
            content = json.loads(form.get("content") or "")
        except (TypeError, ValueError):
            # Content that is missing or not valid JSON cannot hold a schema.
            continue
        if isinstance(content, dict) and content.get("columns"):
            return content["columns"]
    return None


# 1) Search the project's assets. search is paginated, so collect the items
#    of every page rather than only the first response.
search_pages = dz.get_paginator("search").paginate(
    domainIdentifier=domain_id,
    owningProjectIdentifier=project_id,
    searchScope="ASSET",
)
assets = [item for page in search_pages for item in page.get("items", [])]
print(f"🔍 Found {len(assets)} assets\n")

if not assets:
    raise Exception("No assets found")

# 2) Walk every asset found.
for idx, asset in enumerate(assets, 1):
    asset_id = asset["assetItem"]["identifier"]
    asset_name = asset["assetItem"].get("name")
    print(f"=== Asset {idx} ===")
    print(f"Identifier: {asset_id}")
    print(f"Name: {asset_name}")

    # 3) Fetch the full asset record.
    asset_resp = dz.get_asset(
        domainIdentifier=domain_id,
        identifier=asset_id
    )
    print("Metadata:")
    # default=str renders values json cannot encode natively (e.g. datetimes).
    print(json.dumps(asset_resp, default=str, indent=2))

    # 4) Print the schema when one is available. The original
    #    metadata.schema path is kept as the first choice; the form content
    #    is the fallback because the printed get_asset response has no
    #    top-level "metadata" key.
    schema = asset_resp.get("metadata", {}).get("schema") or _schema_from_forms(asset_resp)
    if schema:
        print("Schema:")
        print(json.dumps(schema, default=str, indent=2))
    else:
        print("Schema: None")

    print("\n")  # visual separator between assets


🔍 Found 3 assets

=== Asset 1 ===
Identifier: c2w55srt624g0o
Name: music_catalog
Metadata:
{
  "ResponseMetadata": {
    "RequestId": "f6c1c2e5-ae1c-4724-907a-57c934690d6a",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/json",
      "content-length": "13929",
      "connection": "keep-alive",
      "date": "Wed, 29 Oct 2025 04:41:51 GMT",
      "x-amzn-trace-id": "Root=1-69019b0f-3ebe3255269baaa21a58fe68",
      "x-amzn-requestid": "f6c1c2e5-ae1c-4724-907a-57c934690d6a",
      "x-amz-apigw-id": "TMUqgEnVoE0ETTw=",
      "x-cache": "Miss from cloudfront",
      "via": "1.1 a83e83bac45033fa742a636490bdeb7e.cloudfront.net (CloudFront)",
      "x-amz-cf-pop": "LAX54-P2",
      "x-amz-cf-id": "Ogi4REzxv0ojKveNklzU6D76HrosqqyJz2KXFB2hrGJo_Z11JaLwHw=="
    },
    "RetryAttempts": 0
  },
  "createdAt": "2025-10-27 17:14:59.721000+09:00",
  "createdBy": "SYSTEM",
  "domainId": "dzd-cjvglgj4d43fmg",
  "externalIdentifier": "arn:aws:glue:ap-northeast-2:5336162

### 아래 예시는 Asset 결과를 JSON 파일로 저장

In [6]:
import boto3
import json
import os

region = "ap-northeast-2"
dz = boto3.client("datazone", region_name=region)

domain_id = "dzd-cjvglgj4d43fmg"
project_id = "bknisgxzopjuhk"


def _schema_from_forms(asset_detail):
    """Return the first non-empty ``columns`` list found in the asset's forms.

    Each entry of ``formsOutput`` carries its payload as a JSON string in
    ``content``. Returns None when no form exposes columns.

    NOTE(review): assumes table-like forms (e.g. the Glue table form) store
    their columns under a top-level ``columns`` key -- confirm against a real
    get_asset response.
    """
    for form in asset_detail.get("formsOutput") or []:
        try:
            content = json.loads(form.get("content") or "")
        except (TypeError, ValueError):
            # Content that is missing or not valid JSON cannot hold a schema.
            continue
        if isinstance(content, dict) and content.get("columns"):
            return content["columns"]
    return None


# 1) Search the project's assets. search is paginated, so collect the items
#    of every page rather than only the first response.
search_pages = dz.get_paginator("search").paginate(
    domainIdentifier=domain_id,
    owningProjectIdentifier=project_id,
    searchScope="ASSET",
)
assets = [item for page in search_pages for item in page.get("items", [])]
print(f"🔍 Found {len(assets)} assets\n")

if not assets:
    raise Exception("No assets found")

all_assets_data = []

for asset in assets:
    asset_id = asset["assetItem"]["identifier"]
    asset_name = asset["assetItem"].get("name")

    # 2) Fetch the full asset record.
    asset_resp = dz.get_asset(
        domainIdentifier=domain_id,
        identifier=asset_id
    )

    # 3) Uncomment to inspect the raw response structure.
    # print(json.dumps(asset_resp, default=str, indent=2))

    # 4) Defensive field extraction: use a nested "assetItem"/"asset" wrapper
    #    if one is present, otherwise treat the response itself as the asset.
    asset_item = asset_resp.get("assetItem") or asset_resp.get("asset") or asset_resp

    type_id = asset_item.get("typeIdentifier")
    # The whole record is stored as metadata (this includes the
    # ResponseMetadata entry when the response itself is used).
    metadata = asset_item

    # Schema location may vary by asset type: keep the original lookups and
    # fall back to the columns embedded in the asset's form content.
    schema = (
        asset_item.get("schema")
        or asset_item.get("glueTable", {}).get("columns")
        or _schema_from_forms(asset_item)
    )

    # 5) Shape the record that is written to disk.
    asset_data = {
        "identifier": asset_id,
        "name": asset_name,
        "typeIdentifier": type_id,
        "metadata": metadata,
        "schema": schema
    }

    all_assets_data.append(asset_data)

# 6) Write everything to a single JSON file. Create the target directory
#    first so a fresh checkout does not fail with FileNotFoundError.
output_file = "sample_result_data/datazone_assets_detailed.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_assets_data, f, default=str, ensure_ascii=False, indent=2)

print(f"✅ Detailed asset info saved to '{output_file}'")


🔍 Found 3 assets

✅ Detailed asset info saved to 'sample_result_data/datazone_assets_detailed.json'


### 아래의 코드는 전체 Asset 중 GlueTableAssetType만 찾아서 해당 Asset 별 메타데이터를 "[Asset명]_[AssetID].json"형식으로 저장하는 예시 

In [1]:
import boto3
import json
import os
import re

region = "ap-northeast-2"
dz = boto3.client("datazone", region_name=region)
domain_id = "dzd-cjvglgj4d43fmg"

# Directory that receives one JSON file per asset.
output_dir = "sample_result_data/glue_assets"
os.makedirs(output_dir, exist_ok=True)

# Only assets whose typeIdentifier equals this value are exported.
GLUE_TABLE_TYPE = "amazon.datazone.GlueTableAssetType"

# Every character outside ASCII alphanumerics, Hangul syllables, "_" and "-"
# is replaced when building a file name (e.g. "/", ":", "?").
UNSAFE_FILENAME_CHARS = re.compile(r'[^a-zA-Z0-9가-힣_\-]')


def _schema_from_forms(asset_detail):
    """Return the first non-empty ``columns`` list found in the asset's forms.

    Each entry of ``formsOutput`` carries its payload as a JSON string in
    ``content``. Returns None when no form exposes columns.

    NOTE(review): assumes table-like forms (e.g. the Glue table form) store
    their columns under a top-level ``columns`` key -- confirm against a real
    get_asset response.
    """
    for form in asset_detail.get("formsOutput") or []:
        try:
            content = json.loads(form.get("content") or "")
        except (TypeError, ValueError):
            # Content that is missing or not valid JSON cannot hold a schema.
            continue
        if isinstance(content, dict) and content.get("columns"):
            return content["columns"]
    return None


# 1) Build a project id -> project name map for the whole domain.
#    list_projects is paginated; walking every page avoids dropping the
#    projects that do not fit in the first response.
projects = {}
for page in dz.get_paginator("list_projects").paginate(domainIdentifier=domain_id):
    for project in page.get("items", []):
        projects[project["id"]] = project["name"]

print(f"🔍 Found {len(projects)} projects in domain {domain_id}\n")

# 2) Search each project's assets and keep the Glue table assets only.
search_paginator = dz.get_paginator("search")
glue_assets = []

for project_id, project_name in projects.items():
    print(f"📁 Searching assets in project: {project_name} ({project_id})")

    pages = search_paginator.paginate(
        domainIdentifier=domain_id,
        owningProjectIdentifier=project_id,
        searchScope="ASSET",
    )
    for page in pages:
        for item in page.get("items", []):
            asset_item = item.get("assetItem", {})
            if asset_item.get("typeIdentifier") != GLUE_TABLE_TYPE:
                continue
            glue_assets.append({
                "id": asset_item.get("identifier"),
                "name": asset_item.get("name"),
                "type": asset_item.get("typeIdentifier"),
                "project_id": project_id,
                "project_name": project_name,
            })

print(f"\n✅ Found total {len(glue_assets)} GlueTableAssetType assets.\n")

# 3) Fetch each asset with get_asset and write it to its own JSON file.
failed_count = 0

for asset in glue_assets:
    asset_id = asset["id"]
    asset_name = asset["name"]

    # The name was read with .get() and may be None; substituting a
    # placeholder keeps the regex from raising TypeError and aborting the loop.
    safe_name = UNSAFE_FILENAME_CHARS.sub('_', asset_name or "unnamed")

    output_path = os.path.join(output_dir, f"{safe_name}_{asset_id}.json")

    print(f"📦 Fetching asset: {asset_name} ({asset_id})")

    try:
        # Fetch the full asset record.
        asset_resp = dz.get_asset(
            domainIdentifier=domain_id,
            identifier=asset_id
        )

        # Use a nested "assetItem"/"asset" wrapper if one is present,
        # otherwise treat the response itself as the asset.
        asset_item = asset_resp.get("assetItem") or asset_resp.get("asset") or asset_resp
        type_id = asset_item.get("typeIdentifier")
        metadata = asset_item
        # Keep the original lookups and fall back to the columns embedded in
        # the asset's form content.
        schema = (
            asset_item.get("schema")
            or asset_item.get("glueTable", {}).get("columns")
            or _schema_from_forms(asset_item)
        )

        # Record written to disk.
        asset_data = {
            "identifier": asset_id,
            "name": asset_name,
            "project": asset["project_name"],
            "typeIdentifier": type_id,
            "metadata": metadata,
            "schema": schema
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(asset_data, f, default=str, ensure_ascii=False, indent=2)

        print(f"✅ Saved: {output_path}")

    except Exception as e:
        # Deliberately best-effort: one failing asset must not stop the
        # export, but it is counted so the summary below stays truthful.
        failed_count += 1
        print(f"❌ Failed to fetch asset {asset_name} ({asset_id}): {e}")

# Report success only when every asset was actually written.
if failed_count:
    print(f"\n⚠️ Finished with errors: {failed_count} of {len(glue_assets)} assets could not be saved.")
else:
    print("\n🎉 All GlueTableAssetType asset metadata saved successfully!")


🔍 Found 8 projects in domain dzd-cjvglgj4d43fmg

📁 Searching assets in project: GenerativeAIModelGovernanceProject (4fibhopyretcwo)
📁 Searching assets in project: admin-project-533616270150 (akis412ankp388)
📁 Searching assets in project: My_Project_mgouoova (bknisgxzopjuhk)
📁 Searching assets in project: My_Project_mgqbqvxi (4m40kss7xbb7rs)
📁 Searching assets in project: protest22 (3w3ju6yt8mxm6w)
📁 Searching assets in project: genaipjt (akfeonmmtbl6hk)
📁 Searching assets in project: My_Project_mgujrswf (6c5oylbl5visyg)
📁 Searching assets in project: My_Project_mguk8bbj (6dnkldcly284ns)

✅ Found total 33 GlueTableAssetType assets.

📦 Fetching asset: cloudtrail_logs (bl4k4rbfix626w)
✅ Saved: sample_result_data/glue_assets\cloudtrail_logs_bl4k4rbfix626w.json
📦 Fetching asset: cloudtrail_raw_json (4i2y1zpe7arac8)
✅ Saved: sample_result_data/glue_assets\cloudtrail_raw_json_4i2y1zpe7arac8.json
📦 Fetching asset: 20250527_1042 (4et4gjf44dygwo)
✅ Saved: sample_result_data/glue_assets\20250527_