# Seed 생성 확인용 테스트 노트북
http://4.230.26.44:8888

In [4]:
import os
import sys
# Extend the import path with /app before importing `src`; the package is
# presumably located there (confirm against the container layout).
sys.path.append("/app")
from src.utils.spark_builder import get_spark_session
# NOTE(review): `F` is not referenced in any of the cells visible here.
from pyspark.sql import functions as F
# Spark submit options passed through the environment: at most 2 cores,
# 8g of executor memory, and the application name.
# This is assigned before the session is created below; keep that order.
# NOTE(review): the app name is given both here (spark.app.name) and as the
# first argument of get_spark_session -- confirm which one takes effect.
os.environ["PYSPARK_SUBMIT_ARGS"] = """
    --conf spark.cores.max=2
    --conf spark.executor.memory=8g
    --conf spark.app.name=GDELT_dbt_test
    pyspark-shell
"""

# Create the Spark session. The arguments appear to be the application name
# and the master URL (get_spark_session is defined outside this file).
spark = get_spark_session("GDELT_dbt_test", "spark://spark-master:7077")
print("✅ Spark 세션 생성 완료")

✅ Spark 세션 생성 완료


In [12]:
# List every database (schema) registered in the Spark metastore.
print("📖 사용 가능한 데이터베이스(스키마) 목록:")
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

📖 사용 가능한 데이터베이스(스키마) 목록:
+------------+
|   namespace|
+------------+
|     default|
|    gold_dev|
|   gold_prod|
|    seed_dev|
|   seed_prod|
|      silver|
| staging_dev|
|staging_prod|
+------------+



In [18]:
# Inspect the tables created by `dbt seed` / `dbt run` in each dev schema.
print("📜 '_dev' 스키마의 테이블 목록:")
for dev_schema in ("seed_dev", "staging_dev", "gold_dev"):
    # One SHOW TABLES listing per schema, in the order given above.
    spark.sql(f"SHOW TABLES IN {dev_schema}").show()

📜 '_dev' 스키마의 테이블 목록:
+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
| seed_dev|actor_ethnic_grou...|      false|
| seed_dev|actor_organizatio...|      false|
| seed_dev|actor_religion_codes|      false|
| seed_dev|    actor_role_codes|      false|
| seed_dev|  event_detail_codes|      false|
| seed_dev|event_quad_class_...|      false|
| seed_dev|    event_root_codes|      false|
| seed_dev|       geo_adm_codes|      false|
| seed_dev|   geo_country_codes|      false|
| seed_dev|      geo_type_codes|      false|
+---------+--------------------+-----------+

+-----------+--------------------+-----------+
|  namespace|           tableName|isTemporary|
+-----------+--------------------+-----------+
|staging_dev|stg_actors_descri...|      false|
|staging_dev|stg_gkg_detailed_...|      false|
|staging_dev|    stg_seed_mapping|      false|
+-----------+--------------------+-----------+

+---------+------

### Seed 확인 및 관리

In [None]:
# Print a 5-row sample from each of the 10 seed tables to check their contents.
seed_tables = [
    # Actor code lookups
    "actor_ethnic_group_codes",
    "actor_organization_codes",
    "actor_religion_codes",
    "actor_role_codes",
    # Event code lookups
    "event_root_codes",
    "event_detail_codes",
    "event_quad_class_codes",
    # Geography code lookups
    "geo_adm_codes",
    "geo_country_codes",
    "geo_type_codes",
]

for table in seed_tables:
    print(f"\n 🔍 {table} 테이블 샘플 데이터")
    sample_query = f"SELECT * FROM seed_dev.{table} LIMIT 5"
    try:
        sample_df = spark.sql(sample_query)
        sample_df.show(truncate=False)
    except Exception as err:
        # Best-effort preview: report the failure and move on to the next table.
        print(f"❌ 테이블 조회 중, 오류 발생: {err}")

생성된 seed 테이블을 삭제하려면, 아래 코드의 `seed_tables_to_drop` 목록에 삭제할 seed 테이블명을 입력한 뒤 실행하세요.

In [None]:
# Names of the seed tables to drop from the seed_dev schema.
# Uncomment or add entries below. Keep the trailing comma on every entry so
# that adjacent string literals are not silently concatenated into one name.
seed_tables_to_drop = [
    # "<name of the seed table to drop>",
    # "geo_country_fips_codes",
    # "geo_country_iso_codes",
]

if not seed_tables_to_drop:
    # Nothing was selected, so do not claim that anything was deleted.
    print("ℹ️ 삭제할 Seed 테이블이 지정되지 않았습니다.")
else:
    print("🗑️ 기존 Seed 테이블 삭제를 시작합니다.")

    # Tables whose DROP statement raised; kept so the final summary is truthful.
    failed_tables = []
    for table_name in seed_tables_to_drop:
        try:
            spark.sql(f"DROP TABLE IF EXISTS seed_dev.{table_name}")
        except Exception as e:
            # Best-effort: report the failure and continue with the other tables.
            failed_tables.append(table_name)
            print(f"  - 테이블 'seed_dev.{table_name}' 삭제 중 오류 발생: {e}")
        else:
            print(f"  - 테이블 'seed_dev.{table_name}' 삭제 완료.")

    # Report success only when every DROP actually went through.
    if failed_tables:
        print(f"\n⚠️ 일부 Seed 테이블 삭제에 실패했습니다: {', '.join(failed_tables)}")
    else:
        print("\n✅ 모든 Seed 테이블 삭제 작업이 완료되었습니다.")

# List the schema again to confirm which tables remain.
print("📜 현재 'seed_dev' 스키마의 테이블 목록:")
spark.sql("SHOW TABLES IN seed_dev").show()

### Staging 폴더의 SQL 파일을 dbt run한 결과 확인

In [14]:
# Preview the stg_seed_mapping staging model (first 20 rows).
print("🥈 [stg_seed_mapping] 뷰 : 상위 20개 데이터")
seed_mapping_df = spark.sql("SELECT * FROM staging_dev.stg_seed_mapping LIMIT 20")
seed_mapping_df.show(truncate=False)

🥈 [stg_seed_mapping] 뷰 : 상위 20개 데이터


25/09/24 12:39:12 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/09/24 12:39:18 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---------------+----------+-------------+-------------------+------------------------+----------+-------------+---------------+------------+-----------+------------+-------------------+-----------+------------------+--------------------------+--------------------------------------------------------+--------------------------+------------------------+---------------------------+----------------+------------------+-----------------------+--------------+-------------------+------------------+-----------------------------------------------+-------------------------+--------------------------------------------------------+-------------------------+--------------+---------------+-----------+----------------+--------------------------+--------------------------------------------------------+--------------------------+----------------------+---------------------------+----------------+------------------+-----------------------+--------------+-------------------+-------------------+-----------

In [17]:
# Preview the stg_actors_description staging model: 20 rows ordered by
# actor_full_description ascending.
print("🥈 [stg_actors_description] 뷰 : 상위 20개 데이터")
actors_query = (
    "SELECT * FROM staging_dev.stg_actors_description "
    "ORDER BY actor_full_description ASC LIMIT 20"
)
spark.sql(actors_query).show(truncate=False)

🥈 [stg_actors_description] 뷰 : 상위 20개 데이터




+----------+----------------------+
|actor_code|actor_full_description|
+----------+----------------------+
|GHA       |가나                  |
|GHACOP    |가나 경찰             |
|GHAEDU    |가나 교육             |
|GHAMIL    |가나 군대             |
|GHACHRCTH |가나 기독교 가톨릭    |
|GHABUS    |가나 기업             |
|GHACVL    |가나 민간인           |
|GHACRM    |가나 범죄 조직        |
|GHAHLH    |가나 보건             |
|GHAJUD    |가나 사법부           |
|GHAMED    |가나 언론             |
|GHAELI    |가나 엘리트           |
|GHALEG    |가나 입법부           |
|GHAGOV    |가나 정부             |
|GHAGOVMED |가나 정부 언론        |
|GAB       |가봉                  |
|gay       |가요족                |
|GUY       |가이아나              |
|GUYCOP    |가이아나 경찰         |
|GUYEDU    |가이아나 교육         |
+----------+----------------------+



                                                                                

In [19]:
# Preview the stg_gkg_detailed_events staging model (first 20 rows).
print("🥈 [stg_gkg_detailed_events] 뷰 : 상위 20개 데이터")
gkg_events_df = spark.sql("SELECT * FROM staging_dev.stg_gkg_detailed_events LIMIT 20")
gkg_events_df.show(truncate=False)

🥈 [stg_gkg_detailed_events] 뷰 : 상위 20개 데이터


                                                                                

+---------------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+----------------+----------+----------------+------------------+-------+-------------------+
|global_event_id|event_date|source_url                                                                                                                                                                              |mention_source_name|mention_doc_tone|v2_persons|v2_organizations|v2_enhanced_themes|amounts|processed_at       |
+---------------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+----------------+----------+----------------+------------------+-------+-------------------+
|1264759198     |2024-

### Marts 폴더의 SQL 파일을 dbt run한 결과 확인

In [None]:
# Refreshed every 15 minutes: quick aggregation of the dashboard's core KPIs
# using only gdelt_events data.
print("🥇 [gold_near_realtime_summary] 뷰")
realtime_summary_df = spark.sql("SELECT * FROM gold_dev.gold_near_realtime_summary")
realtime_summary_df.show(truncate=False)

🥇 [gold_near_realtime_summary] 뷰


                                                                                

+----------+-------------------------+--------------------------+-------------------------+-----------+--------------------+---------------------+--------------+-------------+--------------+-----------------------+--------------------+-------------------+-------------------+-------------------+------------------------------------------------------------------------------------+
|event_date|mp_action_geo_country_iso|mp_action_geo_country_eng |mp_action_geo_country_kor|event_count|avg_goldstein_scale |avg_tone             |total_mentions|total_sources|total_articles|count_cooperation_event|count_conflict_event|count_anomaly_event|risk_score         |processed_at       |daily_tone_summary                                                                  |
+----------+-------------------------+--------------------------+-------------------------+-----------+--------------------+---------------------+--------------+-------------+--------------+-----------------------+--------------------+---

In [21]:
# Refreshed daily: detailed event table that combines the staging models and
# adds derived story and KPI columns.
print("🥇 [gold_daily_detailed_events] 뷰")
daily_events_df = spark.sql("SELECT * FROM gold_dev.gold_daily_detailed_events")
daily_events_df.show(truncate=False)

🥇 [gold_daily_detailed_events] 뷰


                                                                                

+---------------+----------+-------------+-------------------+------------------------+----------+-------------+---------------+------------+-----------+------------+-----------------+-----------+-----------+--------------------------+--------------------------+--------------------------+----------------------+---------------------------+----------------+------------------+-----------------------+--------------+-------------------+------------------+---------------------------------+-------------------------+-------------------------+-------------------------+--------------+---------------+-----------+-------------------+--------------------------+----------------------------+--------------------------+----------------------+---------------------------+----------------+------------------+-----------------------+--------------+-------------------+------------------+-----------------------------------------------+-----------------------+-------------------------+-------------------------

In [22]:
# Final table used for the Superset connection.
print("🥇 [gold_superset_view] 뷰")
superset_view_df = spark.sql("SELECT * FROM gold_dev.gold_superset_view")
superset_view_df.show(truncate=False)

🥇 [gold_superset_view] 뷰


25/09/24 13:10:44 WARN DeltaLog: Change in the table id detected while updating snapshot. 
Previous snapshot = Snapshot(path=s3a://warehouse/gold_dev/gold_near_realtime_summary/_delta_log, version=1, metadata=Metadata(7a0bb746-3ff0-4497-96e4-1d7fb05fc023,null,null,Format(parquet,Map()),{"type":"struct","fields":[{"name":"event_date","type":"date","nullable":true,"metadata":{}},{"name":"mp_action_geo_country_iso","type":"string","nullable":true,"metadata":{}},{"name":"mp_action_geo_country_eng","type":"string","nullable":true,"metadata":{}},{"name":"mp_action_geo_country_kor","type":"string","nullable":true,"metadata":{}},{"name":"event_count","type":"long","nullable":true,"metadata":{}},{"name":"avg_goldstein_scale","type":"double","nullable":true,"metadata":{}},{"name":"avg_tone","type":"double","nullable":true,"metadata":{}},{"name":"total_mentions","type":"long","nullable":true,"metadata":{}},{"name":"total_sources","type":"long","nullable":true,"metadata":{}},{"name":"total_article

+----------+------------------+------------------+------------------+------------------+-----------+-------------------+-------------------+--------------+-----------------------+--------------------+-------------------+---------------------------------------------------------------------+---------------------------------------------------------+-------------------+
|event_date|action_country_iso|action_country_eng|action_country_kor|risk_score        |event_count|avg_goldstein_scale|avg_tone           |total_articles|count_cooperation_event|count_conflict_event|count_anomaly_event|representative_simple_story                                          |representative_headline_story                            |updated_at         |
+----------+------------------+------------------+------------------+------------------+-----------+-------------------+-------------------+--------------+-----------------------+--------------------+-------------------+------------------------------------------

In [3]:
# Stop the Spark session created in the first cell, then confirm on stdout.
spark.stop()
print("\n✅ Spark 세션이 종료되었습니다.")


✅ Spark 세션이 종료되었습니다.
