## 테스트 데이터 준비

In [0]:
# 1. Generate the sample data (10 million records).
#    Each record is (id, "name_<id>", value); value is (id % 100) * 1.1,
#    so it repeats with a period of 100 ids.
data = [
    (idx, f"name_{idx}", float(idx % 100) * 1.1)
    for idx in range(10_000_000)
]
df = spark.createDataFrame(data, schema=["id", "name", "value"])

In [0]:
# 2. Output locations, one per storage format being compared.
csv_path, parquet_path, delta_path = (
    "dbfs:/test_data_csv",
    "dbfs:/test_data_parquet",
    "dbfs:/test_data_delta",
)

In [0]:
# 3. Save the same DataFrame in all three formats, replacing earlier output.
# CSV: keep a header row so the reader can recover the column names.
df.write.csv(csv_path, mode="overwrite", header=True)
# Parquet
df.write.parquet(parquet_path, mode="overwrite")
# Delta
df.write.save(delta_path, format="delta", mode="overwrite")

## 파일 포맷별 읽기 성능 비교해보기

In [0]:
import time

# Measure the CSV load time.
# time.perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the measurement is running.
start_csv = time.perf_counter()
df_csv = spark.read.option("header", True).csv(csv_path)
# count() is an action, so the read actually executes inside the timed window.
df_csv.count()
end_csv = time.perf_counter()

print(f"CSV Load Time:     {end_csv - start_csv:.2f} seconds")

CSV Load Time:     9.32 seconds


In [0]:
# Measure the Parquet load time.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for elapsed-time measurement (unaffected by system clock changes).
start_parquet = time.perf_counter()
df_parquet = spark.read.parquet(parquet_path)
# count() is an action, so the read actually executes inside the timed window.
df_parquet.count()
end_parquet = time.perf_counter()

print(f"Parquet Load Time: {end_parquet - start_parquet:.2f} seconds")

Parquet Load Time: 2.37 seconds


In [0]:
# Measure the Delta load time.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for elapsed-time measurement (unaffected by system clock changes).
start_delta = time.perf_counter()
df_delta = spark.read.format("delta").load(delta_path)
# count() is an action, so the read actually executes inside the timed window.
df_delta.count()
end_delta = time.perf_counter()

print(f"Delta Load Time: {end_delta - start_delta:.2f} seconds")

Delta Load Time: 1.28 seconds


## DELTA의 Time Travel 기능을 사용해보기

In [0]:
from pyspark.sql import Row

# First write to the time-travel example path: the original two rows.
df1 = spark.createDataFrame(
    [
        Row(id=1, name="Alice"),
        Row(id=2, name="Bob"),
    ]
)
df1.write.mode("overwrite").format("delta").save("dbfs:/delta_tt_example")

In [0]:
# Second write to the same path: overwrite with the updated names so the
# Delta table has an older and a newer version to compare.
df2 = spark.createDataFrame(
    [
        Row(id=1, name="Alice_updated"),
        Row(id=2, name="Bob_updated"),
    ]
)
df2.write.mode("overwrite").format("delta").save("dbfs:/delta_tt_example")

In [0]:
# Latest version: loading without a version option returns the current
# snapshot (version 1 when the two writes ran once against a fresh path).
df_latest = spark.read.load("dbfs:/delta_tt_example", format="delta")
df_latest.show()

+---+-------------+
| id|         name|
+---+-------------+
|  1|Alice_updated|
|  2|  Bob_updated|
+---+-------------+



In [0]:
# Previous version (version 0), read through Delta time travel.
# A timestamp can be used instead (timestampAsOf="2025-04-15 10:00:00").
df_old = (
    spark.read
    .format("delta")
    .options(versionAsOf=0)
    .load("dbfs:/delta_tt_example")
)
df_old.show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



Delta 포맷은 데이터를 Parquet 파일로 저장하고, 그 위에 `_delta_log` 디렉터리의 트랜잭션 로그를 두어 Time Travel 같은 다양한 기능을 추가한다.

In [0]:
%fs ls dbfs:/delta_tt_example/


path,name,size,modificationTime
dbfs:/delta_tt_example/_delta_log/,_delta_log/,0,0
dbfs:/delta_tt_example/part-00003-26db0ed0-e390-4b60-b4b2-d30260c13897-c000.snappy.parquet,part-00003-26db0ed0-e390-4b60-b4b2-d30260c13897-c000.snappy.parquet,841,1743905023000
dbfs:/delta_tt_example/part-00003-9e682a15-c5e5-4f6c-af25-5e3f5cf4c14f-c000.snappy.parquet,part-00003-9e682a15-c5e5-4f6c-af25-5e3f5cf4c14f-c000.snappy.parquet,841,1743917161000
dbfs:/delta_tt_example/part-00003-c520c718-bf95-4190-a407-10e3cbd760f0-c000.snappy.parquet,part-00003-c520c718-bf95-4190-a407-10e3cbd760f0-c000.snappy.parquet,897,1743905094000
dbfs:/delta_tt_example/part-00003-d54d3fdd-a334-4359-96bb-2838386d3415-c000.snappy.parquet,part-00003-d54d3fdd-a334-4359-96bb-2838386d3415-c000.snappy.parquet,897,1743917205000
dbfs:/delta_tt_example/part-00007-139b2391-ec38-482c-9f41-19e17e71f285-c000.snappy.parquet,part-00007-139b2391-ec38-482c-9f41-19e17e71f285-c000.snappy.parquet,827,1743905023000
dbfs:/delta_tt_example/part-00007-39fc5252-1df5-486e-a098-f30c87c9188a-c000.snappy.parquet,part-00007-39fc5252-1df5-486e-a098-f30c87c9188a-c000.snappy.parquet,883,1743905094000
dbfs:/delta_tt_example/part-00007-c5bf196b-a9ed-4576-b70b-1de4be11f901-c000.snappy.parquet,part-00007-c5bf196b-a9ed-4576-b70b-1de4be11f901-c000.snappy.parquet,827,1743917161000
dbfs:/delta_tt_example/part-00007-d9dabcce-4c70-4831-9887-3eeb6e0a8171-c000.snappy.parquet,part-00007-d9dabcce-4c70-4831-9887-3eeb6e0a8171-c000.snappy.parquet,883,1743917205000


In [0]:
%fs ls dbfs:/delta_tt_example/_delta_log/

path,name,size,modificationTime
dbfs:/delta_tt_example/_delta_log/.s3-optimization-0,.s3-optimization-0,0,1743905024000
dbfs:/delta_tt_example/_delta_log/.s3-optimization-1,.s3-optimization-1,0,1743905024000
dbfs:/delta_tt_example/_delta_log/.s3-optimization-2,.s3-optimization-2,0,1743905024000
dbfs:/delta_tt_example/_delta_log/00000000000000000000.crc,00000000000000000000.crc,2902,1743905027000
dbfs:/delta_tt_example/_delta_log/00000000000000000000.json,00000000000000000000.json,1894,1743905024000
dbfs:/delta_tt_example/_delta_log/00000000000000000001.crc,00000000000000000001.crc,2934,1743905098000
dbfs:/delta_tt_example/_delta_log/00000000000000000001.json,00000000000000000001.json,2240,1743905095000
dbfs:/delta_tt_example/_delta_log/00000000000000000002.crc,00000000000000000002.crc,2902,1743917165000
dbfs:/delta_tt_example/_delta_log/00000000000000000002.json,00000000000000000002.json,2208,1743917162000
dbfs:/delta_tt_example/_delta_log/00000000000000000003.crc,00000000000000000003.crc,2934,1743917208000
