In [None]:
import os
from pyspark.sql import SparkSession
import pyspark

# AWS credentials for the S3-compatible object store.
# setdefault() keeps credentials that are already exported in the environment
# instead of overwriting them with the empty placeholders below.
os.environ.setdefault('AWS_ACCESS_KEY_ID', '')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', '')

# Nessie and AWS configurations
NESSIE_URI = "http://nessie:19120/api/v1"  # used as the Nessie catalog URI
WAREHOUSE = "s3a://warehouse/"  # used as the catalog warehouse location
AWS_S3_ENDPOINT = "http://dremio-minio:9000"  # S3 endpoint for the catalog and for s3a
AWS_REGION = "us-east-1"  # Change this to the region of your S3 bucket

# PostgreSQL JDBC driver jar, handed to Spark through spark.jars
jdbc_driver_path = "postgresql-42.2.23.jar"

# Maven coordinates resolved through spark.jars.packages
_spark_packages = ",".join([
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.91.3",
    "software.amazon.awssdk:bundle:2.17.81",
    "org.apache.hadoop:hadoop-aws:3.3.1",
])

# SQL extensions registered on the session (Iceberg first, then Nessie)
_spark_extensions = ",".join([
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
])

# Every Spark setting as an ordered (key, value) pair
_spark_settings = [
    # Extra jars and packages
    ("spark.jars", jdbc_driver_path),
    ("spark.jars.packages", _spark_packages),
    ("spark.sql.extensions", _spark_extensions),
    # The "nessie" catalog: an Iceberg SparkCatalog backed by NessieCatalog
    ("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.nessie.uri", NESSIE_URI),
    ("spark.sql.catalog.nessie.ref", "main"),
    ("spark.sql.catalog.nessie.authentication.type", "NONE"),
    ("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog"),
    ("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT),
    ("spark.sql.catalog.nessie.warehouse", WAREHOUSE),
    ("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
    # Hadoop s3a filesystem settings
    ("spark.hadoop.fs.s3a.endpoint", AWS_S3_ENDPOINT),
    ("spark.hadoop.fs.s3a.access.key", os.environ['AWS_ACCESS_KEY_ID']),
    ("spark.hadoop.fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY']),
    ("spark.hadoop.fs.s3a.endpoint.region", AWS_REGION),
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
]

# Build the SparkConf from the pairs above
conf = pyspark.SparkConf().setAppName("Iceberg Partitioned Data Write").setAll(_spark_settings)

# Obtain the Spark session for this configuration
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# PostgreSQL connection parameters.
# Each value can be overridden with an environment variable (DB_USER,
# DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME) so real credentials do not have to
# live in the notebook. The fallbacks are the previously hard-coded values,
# so existing setups keep working unchanged.
db_user = os.environ.get("DB_USER", "jyorko")
db_password = os.environ.get("DB_PASSWORD", "jyorkopassword")
db_host = os.environ.get("DB_HOST", 'db-postgresql')
db_port = os.environ.get("DB_PORT", "5432")
db_name = os.environ.get("DB_NAME", "daily_work")

# JDBC URL assembled from the host, port and database name above
db_url = f"jdbc:postgresql://{db_host}:{db_port}/{db_name}"

# Properties dict in the form passed to spark.read.jdbc(..., properties=...)
db_properties = {
    "user": db_user,
    "password": db_password,
    "driver": "org.postgresql.Driver"
}

# Fully qualified name of the demo table. All three statements below use it,
# so the CREATE, INSERT and SELECT always target the same table (the SELECT
# previously read nessie.names while the data was written to nessie.names2).
demo_table = "nessie.names2"

# Create the table; IF NOT EXISTS lets the cell be re-run without failing
spark.sql(f"CREATE TABLE IF NOT EXISTS {demo_table} (name STRING) USING iceberg;").show()

# Insert some data (re-running the cell appends these rows again)
spark.sql(f"INSERT INTO {demo_table} VALUES ('Alex Merced'), ('Joshua Yorko'), ('Jason Hughes'), ('Ron DeMena')").show()

# Query the data that was just written
spark.sql(f"SELECT * FROM {demo_table};").show()


In [None]:
from pyspark.sql import SparkSession
import os
import pyspark

# AWS credentials for the S3-compatible object store.
# setdefault() keeps credentials that are already exported in the environment
# instead of overwriting them with the empty placeholders below.
os.environ.setdefault('AWS_ACCESS_KEY_ID', '')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', '')

# Nessie and AWS configurations
NESSIE_URI = "http://nessie:19120/api/v1"  # used as the Nessie catalog URI
WAREHOUSE = "s3a://warehouse/"  # used as the catalog warehouse location
AWS_S3_ENDPOINT = "http://dremio-minio:9000"  # S3 endpoint for the catalog and for s3a
AWS_REGION = "us-east-1"  # Change this to the region of your S3 bucket

# PostgreSQL connection parameters.
# Each value can be overridden with an environment variable (DB_USER,
# DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME) so real credentials do not have to
# live in the notebook. The fallbacks are the previously hard-coded values,
# so existing setups keep working unchanged.
db_user = os.environ.get("DB_USER", "jyorko")
db_password = os.environ.get("DB_PASSWORD", "jyorkopassword")
db_host = os.environ.get("DB_HOST", 'db-postgresql')
db_port = os.environ.get("DB_PORT", "5432")
db_name = os.environ.get("DB_NAME", "daily_work")

# JDBC URL assembled from the host, port and database name above
db_url = f"jdbc:postgresql://{db_host}:{db_port}/{db_name}"

# Properties dict passed to spark.read.jdbc(..., properties=...)
db_properties = {
    "user": db_user,
    "password": db_password,
    "driver": "org.postgresql.Driver"
}

# PostgreSQL JDBC driver jar, handed to Spark through spark.jars
jdbc_driver_path = "postgresql-42.2.23.jar"

# Maven coordinates resolved through spark.jars.packages
_spark_packages = ",".join([
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.91.3",
    "software.amazon.awssdk:bundle:2.17.81",
    "org.apache.hadoop:hadoop-aws:3.3.1",
])

# SQL extensions registered on the session (Iceberg first, then Nessie)
_spark_extensions = ",".join([
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
])

# Every Spark setting as an ordered (key, value) pair
_spark_settings = [
    # Extra jars and packages
    ("spark.jars", jdbc_driver_path),
    ("spark.jars.packages", _spark_packages),
    ("spark.sql.extensions", _spark_extensions),
    # The "nessie" catalog: an Iceberg SparkCatalog backed by NessieCatalog
    ("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.nessie.uri", NESSIE_URI),
    ("spark.sql.catalog.nessie.ref", "main"),
    ("spark.sql.catalog.nessie.authentication.type", "NONE"),
    ("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog"),
    ("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT),
    ("spark.sql.catalog.nessie.warehouse", WAREHOUSE),
    ("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
    # Hadoop s3a filesystem settings
    ("spark.hadoop.fs.s3a.endpoint", AWS_S3_ENDPOINT),
    ("spark.hadoop.fs.s3a.access.key", os.environ['AWS_ACCESS_KEY_ID']),
    ("spark.hadoop.fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY']),
    ("spark.hadoop.fs.s3a.endpoint.region", AWS_REGION),
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    # Resource sizing and plan-string verbosity
    ("spark.executor.memory", "4g"),
    ("spark.driver.memory", "4g"),
    ("spark.sql.debug.maxToStringFields", "1000"),
]

# Build the SparkConf from the pairs above
conf = pyspark.SparkConf().setAppName("Postgres to Nessie").setAll(_spark_settings)

# Obtain the Spark session for this configuration.
# NOTE(review): getOrCreate() may hand back a session that is already active
# in this kernel, in which case some settings above might not take effect;
# confirm by running on a freshly restarted kernel.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

def _date_filter(date):
    """Return the SQL predicate that selects rows whose ``date`` column matches ``date``."""
    # A NULL date comes back from Spark as None; comparing against the string
    # 'None' would not match those rows, so NULL needs its own predicate.
    if date is None:
        return "date IS NULL"
    return f"date = '{date}'"


def create_and_upload_table_to_nessie(table_name, db_url, db_properties):
    """Copy a PostgreSQL table into the Iceberg table ``nessie.<table_name>``.

    The target table is created (or replaced) with the schema of the source
    table, then filled one distinct ``date`` value at a time so that each
    JDBC read only covers a single date. Rows whose ``date`` is NULL are
    copied as their own slice.

    Args:
        table_name: Name of the source table. It is interpolated directly
            into SQL, so it must come from a trusted source.
        db_url: JDBC URL of the PostgreSQL database.
        db_properties: JDBC connection properties passed to ``spark.read.jdbc``.

    Returns:
        None. Progress and failures are reported with ``print``; any exception
        is caught and printed rather than raised, so a failure part-way
        through can leave the target table replaced but only partly filled.
    """
    try:
        # Read distinct dates from PostgreSQL
        distinct_dates_df = spark.read.jdbc(
            url=db_url,
            table=f"(SELECT DISTINCT date FROM {table_name}) AS dates",
            properties=db_properties,
        )
        distinct_dates = [row["date"] for row in distinct_dates_df.collect()]

        # Define the Iceberg table name in Nessie
        iceberg_table_name = f"nessie.{table_name}"

        # Create (or replace) the Iceberg table from an empty DataFrame that
        # carries the source table's schema
        source_df = spark.read.jdbc(url=db_url, table=table_name, properties=db_properties)
        source_df.limit(0).writeTo(iceberg_table_name).using("iceberg").createOrReplace()

        for date in distinct_dates:
            print(f"Processing date: {date}")

            # Read data for the specific date from PostgreSQL
            date_query = f"(SELECT * FROM {table_name} WHERE {_date_filter(date)}) AS date_table"
            date_df = spark.read.jdbc(url=db_url, table=date_query, properties=db_properties)

            # Write data to Iceberg table in Nessie for the specific date
            date_df.writeTo(iceberg_table_name).append()

        print(f"Successfully wrote table {table_name} to Nessie")

    except Exception as e:
        # Best-effort: report the failure so the caller can move on to the next table
        print(f"Failed to process table {table_name}: {e}")

# Source tables to copy into Nessie
tables = ["hdr"]  # Add other table names as needed

# Migrate each listed table in turn
for source_table in tables:
    create_and_upload_table_to_nessie(
        table_name=source_table,
        db_url=db_url,
        db_properties=db_properties,
    )

# Shut the Spark session down once every table has been handled
spark.stop()
