In [1]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process
# environment, then read the AWS credentials used for S3 access below.
load_dotenv()
aws_access_key = os.getenv("ACCESS_KEY")
aws_secret_access_key = os.getenv("SECRET_ACCESS_KEY")

# Fail fast with a clear message: os.getenv() returns None for an unset
# variable, and passing that on to Spark would only surface much later as an
# opaque S3 authentication failure.
_missing_vars = [
    name
    for name, value in (
        ("ACCESS_KEY", aws_access_key),
        ("SECRET_ACCESS_KEY", aws_secret_access_key),
    )
    if not value
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# Create a Spark session configured for an Iceberg catalog stored on S3.
#
# Two independent S3 clients are involved, and each needs the credentials:
#   * Plain Spark reads/writes on s3a:// paths go through Hadoop's
#     S3AFileSystem, configured with the spark.hadoop.fs.s3a.* keys.
#   * The Iceberg catalog is configured with io-impl=S3FileIO, which is
#     configured through catalog properties (s3.access-key-id /
#     s3.secret-access-key) rather than the fs.s3a.* keys. Without them it
#     falls back to the ambient AWS credential chain, which does not know
#     about the ACCESS_KEY / SECRET_ACCESS_KEY variables used here.
spark = (
    SparkSession.builder
    .appName("IcebergS3Example")
    # Maven coordinates resolved at startup: Iceberg runtime + AWS bundle for
    # the catalog, hadoop-aws + AWS SDK bundle for the S3A filesystem.
    .config(
        "spark.jars.packages",
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2,"
        "org.apache.hadoop:hadoop-aws:3.3.4,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.262,"
        "org.apache.iceberg:iceberg-aws-bundle:1.4.2",
    )
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Iceberg catalog named "iceberg_catalog", Hadoop-type, rooted at the
    # warehouse path below.
    .config("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg_catalog.type", "hadoop")
    .config("spark.sql.catalog.iceberg_catalog.warehouse", "s3a://if4044-big-data-kel-4/iceberg-test/")
    .config("spark.sql.catalog.iceberg_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    # Credentials for Iceberg's S3FileIO (same keys as the S3A filesystem).
    .config("spark.sql.catalog.iceberg_catalog.s3.access-key-id", aws_access_key)
    .config("spark.sql.catalog.iceberg_catalog.s3.secret-access-key", aws_secret_access_key)
    # Credentials and implementation for Hadoop's S3A filesystem.
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

# Run the load inside try/finally so the Spark session is always stopped,
# even when reading from or writing to S3 fails part-way through.
try:
    # Read the source Parquet file (goes through the S3A filesystem).
    df = spark.read.parquet("s3a://if4044-big-data-kel-4/tpc-h-1gb-parquet-test/customer/customer.1.parquet")

    # Create the namespace (database) in the Iceberg catalog with a SQL
    # statement; IF NOT EXISTS makes re-running this cell safe.
    spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg_catalog.db")

    # Create the Iceberg table from the DataFrame, or replace it if it
    # already exists, partitioned by market segment and using table format v2.
    (
        df.writeTo("iceberg_catalog.db.customer")
        .partitionedBy("c_mktsegment")
        .tableProperty("format-version", "2")
        .createOrReplace()
    )

    # Verify the table by reading it back through the catalog.
    spark.table("iceberg_catalog.db.customer").show(5)
finally:
    spark.stop()

25/05/28 16:58:45 WARN Utils: Your hostname, haziqam-ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.1.24 instead (on interface enx0000000001dc)
25/05/28 16:58:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/haziqam/.ivy2/cache
The jars for the packages stored in: /home/haziqam/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-10427c62-e654-4585-a94f-7965f1587445;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/haziqam/Education/IF4044%20Big%20Data/if4044-data-lakehouse/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.4.2 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.iceberg#iceberg-aws-bundle;1.4.2 in central
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.4.2/iceberg-spark-runtime-3.5_2.12-1.4.2.jar ...
	[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.4.2!iceberg-spark-runtime-3.5_2.12.jar (6968ms)
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.4.2/iceberg-aws-bundle-1.4.2.jar ...
	[SUCCESSFUL ] org.apache.iceberg#iceberg-aws-bundle;1.4.2!iceberg-aws-bundle.jar (5530ms)
:: resolution report :: resolve 1968ms :: artifacts dl 12504ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]

+---------+------------------+--------------------+-----------+---------------+---------+------------+--------------------+
|C_CUSTKEY|            C_NAME|           C_ADDRESS|C_NATIONKEY|        C_PHONE|C_ACCTBAL|C_MKTSEGMENT|           C_COMMENT|
+---------+------------------+--------------------+-----------+---------------+---------+------------+--------------------+
|        2|Customer#000000002|XSTf4,NCwDVaWNe6t...|         13|23-768-687-3665|   121.65|  AUTOMOBILE|l accounts. blith...|
|        3|Customer#000000003|        MG9kdTD2WBHm|          1|11-719-748-3364|  7498.12|  AUTOMOBILE| deposits eat sly...|
|        6|Customer#000000006|sKZz0CsnMD7mp4Xd0...|         20|30-114-968-4951|  7638.57|  AUTOMOBILE|tions. even depos...|
|        7|Customer#000000007|TcGe5gaZNgVePxU5k...|         18|28-190-982-9759|  9561.95|  AUTOMOBILE|ainst the ironic,...|
|       17|Customer#000000017|izrh 6jdqtp2eqdtb...|          2|12-970-682-3487|     6.34|  AUTOMOBILE|packages wake! bl...|
+-------