# 간단한 GDELT 테스트 노트북

Silver 테이블 불러와서 컬럼들 확인하는 용도

In [1]:
import sys
sys.path.append('/app')

from src.utils.spark_builder import get_spark_session
from pyspark.sql import functions as F

# Spark 세션 생성
spark = get_spark_session("GDELT_Test", "spark://spark-master:7077")
print("✅ Spark 세션 생성 완료")

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8b16c699-8d46-4f1d-a3b5-09342ce05223;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
downloading https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.4.0/delta-core_2.12-2.4.0.jar ...
	[SUCCESSFUL ] io.delta#delta-core_2.12;2.4.0!delta-core_2.12.jar (1180ms)
downloading https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar ...
	[SUCCESSFUL ] o

✅ Spark 세션 생성 완료


In [2]:
# Silver Table 읽기
df = spark.read.format("delta").load("s3a://warehouse/silver/gdelt_events")

print(f"📊 레코드 수: {df.count()}")
print(f"📋 컬럼 수: {len(df.columns)}")

25/09/07 08:43:26 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/09/07 08:43:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

📊 레코드 수: 498
📋 컬럼 수: 63


                                                                                

In [3]:
# 전체 컬럼명 확인
print("📝 전체 컬럼 목록:")
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")

📝 전체 컬럼 목록:
 1. global_event_id
 2. day
 3. month_year
 4. year
 5. fraction_date
 6. actor1_code
 7. actor1_name
 8. actor1_country_code
 9. actor1_known_group_code
10. actor1_ethnic_code
11. actor1_religion1_code
12. actor1_religion2_code
13. actor1_type1_code
14. actor1_type2_code
15. actor1_type3_code
16. actor2_code
17. actor2_name
18. actor2_country_code
19. actor2_known_group_code
20. actor2_ethnic_code
21. actor2_religion1_code
22. actor2_religion2_code
23. actor2_type1_code
24. actor2_type2_code
25. actor2_type3_code
26. is_root_event
27. event_code
28. event_base_code
29. event_root_code
30. quad_class
31. goldstein_scale
32. num_mentions
33. num_sources
34. num_articles
35. avg_tone
36. actor1_geo_type
37. actor1_geo_fullname
38. actor1_geo_country_code
39. actor1_geo_adm1_code
40. actor1_geo_lat
41. actor1_geo_long
42. actor1_geo_feature_id
43. actor2_geo_type
44. actor2_geo_fullname
45. actor2_geo_country_code
46. actor2_geo_adm1_code
47. actor2_geo_lat
48. actor2_geo_long

In [3]:
# 샘플 데이터 확인
print("🔍 샘플 데이터:")
df.show(20, truncate=False)

🔍 샘플 데이터:


                                                                                

+---------------+--------+----------+----+-------------+-----------+-------------+-------------------+-----------------------+------------------+---------------------+---------------------+-----------------+-----------------+-----------------+-----------+-------------+-------------------+-----------------------+------------------+---------------------+---------------------+-----------------+-----------------+-----------------+-------------+----------+---------------+---------------+----------+---------------+------------+-----------+------------+-----------------+---------------+-------------------------------+-----------------------+--------------------+--------------+---------------+---------------------+---------------+-------------------+---------------------------------------+--------------------+--------------+---------------+---------------------+---------------+-------------------+-----------------------+---------------------------------------+--------------+---------------+---

In [7]:
filtered_df = df.filter("actor1_code IS NOT NULL AND actor2_code IS NOT NULL")
result_df = filtered_df.withColumn(
    "event_type",
    F.when(F.col("actor1_country_code") == F.col("actor2_country_code"), "Internal")
    .otherwise("International")
)
# --- 필터링 강화! ---
# 1. Actor1과 Actor2의 국가 코드가 모두 NULL이 아니어야 하고,
# 2. 두 국가 코드가 서로 달라야 함 (International)
international_df = df.filter(
    (F.col("actor1_country_code").isNotNull()) &
    (F.col("actor2_country_code").isNotNull()) &
    (F.col("actor1_country_code") != F.col("actor2_country_code"))
)

print("\n======= Top 10 Pure International Event Pairs (NULLs removed) =======")
international_df.groupBy("actor1_country_code", "actor2_country_code") \
    .count() \
    .orderBy(F.desc("count")) \
    .show(10)


+-------------------+-------------------+-----+
|actor1_country_code|actor2_country_code|count|
+-------------------+-------------------+-----+
|                DEU|                VNM|    8|
|                ISR|                PSE|    5|
|                VNM|                DEU|    5|
|                ARE|                IRL|    4|
|                CHN|                RUS|    4|
|                RUS|                CHN|    4|
|                GBR|                IRL|    4|
|                FRA|                IRL|    4|
|                IRL|                GBR|    4|
|                CHN|                AFG|    4|
+-------------------+-------------------+-----+
only showing top 10 rows



In [5]:
# 스키마 확인
df.printSchema()

root
 |-- global_event_id: long (nullable = true)
 |-- day: integer (nullable = true)
 |-- month_year: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- fraction_date: double (nullable = true)
 |-- actor1_code: string (nullable = true)
 |-- actor1_name: string (nullable = true)
 |-- actor1_country_code: string (nullable = true)
 |-- actor1_known_group_code: string (nullable = true)
 |-- actor1_ethnic_code: string (nullable = true)
 |-- actor1_religion1_code: string (nullable = true)
 |-- actor1_religion2_code: string (nullable = true)
 |-- actor1_type1_code: string (nullable = true)
 |-- actor1_type2_code: string (nullable = true)
 |-- actor1_type3_code: string (nullable = true)
 |-- actor2_code: string (nullable = true)
 |-- actor2_name: string (nullable = true)
 |-- actor2_country_code: string (nullable = true)
 |-- actor2_known_group_code: string (nullable = true)
 |-- actor2_ethnic_code: string (nullable = true)
 |-- actor2_religion1_code: string (nullable = true)

In [None]:
# 간단한 통계
df.select("actor1_country_code").groupBy("actor1_country_code").count().orderBy(F.desc("count")).show(10)

+-------------------+-----+
|actor1_country_code|count|
+-------------------+-----+
|               null| 3388|
|                CHN|  749|
|                USA|  742|
|                RUS|  481|
|                IRN|  301|
|                ISR|  293|
|                GBR|  238|
|                AUS|  183|
|                UKR|  168|
|                PSE|  158|
+-------------------+-----+
only showing top 10 rows



25/09/01 05:45:08 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/09/01 05:45:08 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:978)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [4]:
spark.stop()

NameError: name 'spark' is not defined