In [None]:
def configure_spark(provider, branch):
    """Create (or reuse) a Spark session configured for Iceberg catalogs.

    Two Iceberg catalogs are registered on the session:

    * ``nessie`` -- a NessieCatalog using ``S3FileIO`` against the S3 endpoint
      and warehouse hard-coded below.
    * ``local``  -- a Hadoop-type catalog rooted at ``iceberg_warehouse``.

    Args:
        provider: Forwarded unchanged to ``configure_environment``.
        branch: Value used for ``spark.sql.catalog.nessie.ref``.

    Returns:
        The session returned by ``SparkSession.builder...getOrCreate()``.

    Raises:
        KeyError: If ``AWS_ACCESS_KEY_ID`` or ``AWS_SECRET_ACCESS_KEY`` is not
            present in ``os.environ`` after ``configure_environment`` has run.
    """
    configure_environment(provider)

    # Connection constants for the Nessie server and the S3-compatible store.
    nessie_uri = "http://a2e39ba23e81f4c4aaed676e345f1fc9-1136246907.us-west-2.elb.amazonaws.com:19120/api/v1"
    warehouse = "s3a://warehouse/"
    s3_endpoint = "http://a8c78c2e92dd442ff93f5d247b6a2cc7-1221119726.us-west-2.elb.amazonaws.com:9000"
    aws_region = "us-east-1"
    local_warehouse = "iceberg_warehouse"

    # Maven coordinates and SQL extensions, joined into the comma-separated
    # form Spark expects.
    jar_packages = ",".join(
        (
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
            "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.91.3",
            "software.amazon.awssdk:bundle:2.17.81",
            "org.apache.hadoop:hadoop-aws:3.3.1",
        )
    )
    sql_extensions = ",".join(
        (
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
            "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
        )
    )

    conf = pyspark.SparkConf().setAppName("Iceberg Partitioned Data Write")

    settings = {
        "spark.jars.packages": jar_packages,
        "spark.sql.extensions": sql_extensions,
        # Nessie-backed Iceberg catalog.
        "spark.sql.catalog.nessie": "org.apache.iceberg.spark.SparkCatalog",
        "spark.sql.catalog.nessie.uri": nessie_uri,
        "spark.sql.catalog.nessie.ref": branch,
        "spark.sql.catalog.nessie.authentication.type": "NONE",
        "spark.sql.catalog.nessie.catalog-impl": "org.apache.iceberg.nessie.NessieCatalog",
        "spark.sql.catalog.nessie.s3.endpoint": s3_endpoint,
        "spark.sql.catalog.nessie.warehouse": warehouse,
        "spark.sql.catalog.nessie.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
        # Hadoop S3A file system settings; credentials come from the environment.
        "spark.hadoop.fs.s3a.endpoint": s3_endpoint,
        "spark.hadoop.fs.s3a.access.key": os.environ["AWS_ACCESS_KEY_ID"],
        "spark.hadoop.fs.s3a.secret.key": os.environ["AWS_SECRET_ACCESS_KEY"],
        "spark.hadoop.fs.s3a.endpoint.region": aws_region,
        "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        # File-system (Hadoop-type) Iceberg catalog.
        "spark.sql.catalog.local": "org.apache.iceberg.spark.SparkCatalog",
        "spark.sql.catalog.local.type": "hadoop",
        "spark.sql.catalog.local.warehouse": local_warehouse,
        # Resource sizing and scratch directory.
        "spark.executor.memory": "8g",
        "spark.driver.memory": "8g",
        "spark.executor.instances": "5",
        "spark.local.dir": "/tmp/spark-temp",
    }
    for key, value in settings.items():
        conf.set(key, value)

    return SparkSession.builder.config(conf=conf).getOrCreate()


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os

# Directory that backs the Hadoop-type Iceberg catalog configured below.
iceberg_warehouse_path = "iceberg_warehouse"  # Change this to your desired path

# The table name is interpolated into SQL below, so reject an empty answer
# before any Spark work starts.
table = input("Enter the table name: ").strip()
if not table:
    raise ValueError("Table name must not be empty")

os.makedirs(iceberg_warehouse_path, exist_ok=True)

# Spark configuration: Iceberg runtime plus a Hadoop-type catalog registered
# as `spark_catalog`, rooted at the warehouse directory above.
conf = (
    pyspark.SparkConf()
    .setAppName("Iceberg Partitioned Data Write")
    .set(
        "spark.jars.packages",
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2,software.amazon.awssdk:bundle:2.17.81,org.apache.hadoop:hadoop-aws:3.3.1",
    )
    .set(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.spark_catalog.type", "hadoop")
    .set("spark.sql.catalog.spark_catalog.warehouse", iceberg_warehouse_path)
    .set("spark.executor.memory", "8g")
    .set("spark.driver.memory", "8g")
    .set("spark.executor.instances", "5")
    .set("spark.local.dir", "/tmp/spark-temp")
)

# Initialize Spark session with the configured SparkConf
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when reading, table creation or the write fails.
try:
    # Path to the Parquet file
    parquet_path = "scripts/python_go_backups/employees_go.parquet"  # Change this to the actual path

    # Read the Parquet file into a Spark DataFrame
    df = spark.read.parquet(parquet_path)

    # Upper-case every column name; the PARTITIONED BY clause below relies on
    # the upper-cased name COMPANYNAME.
    df = df.select([col(column).alias(column.upper()) for column in df.columns])

    # Fully qualified table identifier in the local catalog
    table_name = f"spark_catalog.default.{table}"

    # Column definitions with backticks around column names
    schema = ', '.join([f'`{field.name}` {field.dataType.simpleString()}' for field in df.schema.fields])

    # Create the Iceberg table if it doesn't exist, partitioned by COMPANYNAME
    create_table_query = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            {schema}
        )
        USING iceberg
        PARTITIONED BY (COMPANYNAME)
    """
    spark.sql(create_table_query)

    # Append the DataFrame to the (possibly pre-existing) Iceberg table
    df.writeTo(table_name).append()

    # The original message divided the f-string by a set literal, which raised
    # TypeError; report the directory for the table that was actually written.
    print(f"Data has been written to Iceberg table at {iceberg_warehouse_path}/default/{table}")
finally:
    # Stop the Spark session
    spark.stop()


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os
from utils.utils import configure_spark, get_db_tables
# Name of the local Iceberg warehouse directory.
iceberg_warehouse_path = 'iceberg_warehouse'  # Change this to your desired path

# Session from the shared helper, called with provider 'minio' and branch 'main'.
spark = configure_spark("minio", "main")

# Source data: the existing Iceberg table, read through the Iceberg reader.
# (A get_db_tables-based source and the column upper-casing step are disabled.)
local_table_name = 'spark_catalog.default.employees'
df = spark.read.format('iceberg').table(local_table_name)

# Destination table identifier.
table_name = 'nessie.employees'

# "<name> <type>" pairs for every column of the source table, comma-separated
# (no backticks here, unlike the helper defined below).
schema = ", ".join(
    f"{field.name} {field.dataType.simpleString()}" for field in df.schema.fields
)

def create_iceberg_table_if_not_exists(spark, table_name, df, partition_column):
    """Run a ``CREATE TABLE IF NOT EXISTS ... USING iceberg`` statement.

    The column list is derived from ``df.schema.fields``: each column becomes
    a backtick-quoted name followed by ``dataType.simpleString()``.

    Args:
        spark: Object whose ``sql`` method executes the statement.
        table_name: Table identifier, inserted into the statement verbatim.
        df: DataFrame whose schema supplies the column definitions.
        partition_column: Text placed inside ``PARTITIONED BY (...)``, verbatim.

    Returns:
        None.
    """
    column_defs = []
    for field in df.schema.fields:
        column_defs.append(f'`{field.name}` {field.dataType.simpleString()}')
    columns_sql = ', '.join(column_defs)

    ddl = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            {columns_sql}
        )
        USING iceberg
        PARTITIONED BY ({partition_column})
    """
    spark.sql(ddl)

# Make sure the destination table exists (partitioned by companyName), then
# write the DataFrame to it in overwrite mode. try/finally guarantees the
# session is stopped even if table creation or the write fails.
try:
    create_iceberg_table_if_not_exists(spark, table_name, df, 'companyName')
    df.write.format("iceberg").mode("overwrite") \
        .save(table_name)

    # Report the catalog identifier that was written. The previous message
    # built a local warehouse path around it, which is not where this table
    # identifier points.
    print(f"Data has been written to Iceberg table {table_name}")
finally:
    # Stop the Spark session
    spark.stop()

In [None]:
# Display the column-definition string `schema`; the name must already exist
# in the kernel (it is assigned in an earlier cell).
schema

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os
from utils.utils import configure_spark

def create_iceberg_table_if_not_exists(spark, table_name, df, partition_column):
    """Create the Iceberg table ``table_name`` unless it already exists.

    Columns mirror ``df.schema.fields`` (backtick-quoted name plus the type's
    ``simpleString()``); ``partition_column`` is placed verbatim inside the
    ``PARTITIONED BY (...)`` clause. The statement is executed via
    ``spark.sql`` and nothing is returned.
    """
    fields = df.schema.fields
    column_list = ', '.join(
        f'`{fld.name}` {fld.dataType.simpleString()}' for fld in fields
    )
    statement = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            {column_list}
        )
        USING iceberg
        PARTITIONED BY ({partition_column})
    """
    spark.sql(statement)

def query_and_write_to_local_table(spark, parquet_path, local_table_name, partition_column):
    """Load a Parquet file and append its rows to an Iceberg table.

    The previous docstring described a date-based Nessie query; the function
    actually reads ``parquet_path`` and nothing else.

    Args:
        spark: Spark session used for reading and for the DDL statement.
        parquet_path: Path of the Parquet file to read.
        local_table_name: Identifier of the table to create (if missing) and
            append to.
        partition_column: Passed to ``create_iceberg_table_if_not_exists`` for
            the ``PARTITIONED BY`` clause.

    Returns:
        None.
    """
    df = spark.read.parquet(parquet_path)
    # The table is created from the Parquet schema before the append so a
    # first run works against an empty catalog.
    create_iceberg_table_if_not_exists(spark, local_table_name, df, partition_column)
    df.writeTo(local_table_name).append()

def read_local_and_write_to_nessie(spark, local_table_name, nessie_table_name, partition_column):
    """Append every row of one Iceberg table to another.

    Reads ``local_table_name`` through the Iceberg reader, makes sure
    ``nessie_table_name`` exists with the same columns (partitioned by
    ``partition_column``), then appends the rows to it. Returns None.
    """
    source_df = spark.read.format("iceberg").table(local_table_name)
    create_iceberg_table_if_not_exists(
        spark,
        nessie_table_name,
        source_df,
        partition_column,
    )
    source_df.writeTo(nessie_table_name).append()

def process_data(spark, parquet_path, local_table_name, nessie_table_name, partition_column):
    """Load the Parquet file at ``parquet_path`` into ``local_table_name``.

    The previous docstring mentioned a list of dates and writes to both local
    and Nessie tables; the function takes no dates and only performs the
    Parquet-to-local-table step.

    Args:
        spark: Spark session used for all reads and writes.
        parquet_path: Path of the Parquet file to load.
        local_table_name: Identifier of the table that receives the rows.
        nessie_table_name: Currently unused -- the copy step that would
            consume it is commented out below. Kept so existing callers
            keep working.
        partition_column: Partition column for the created table.

    Returns:
        None.
    """
    query_and_write_to_local_table(spark, parquet_path, local_table_name, partition_column)
    # Disabled second step: copy the local table into the Nessie table.
    #read_local_and_write_to_nessie(spark, local_table_name, nessie_table_name, partition_column)



# Make sure the local warehouse directory exists before the session starts.
iceberg_warehouse_path = 'iceberg_warehouse'
os.makedirs(iceberg_warehouse_path, exist_ok=True)

# Session from the shared helper, called with provider 'minio' and branch 'main'.
spark = configure_spark("minio", "main")

# Inputs for the load: source file, both table identifiers, partition column.
parquet_path = "scripts/python_go_backups/employees_go.parquet"
nessie_table_name = 'nessie.employees'
local_table_name = 'spark_catalog.default.employees'
partition_column = "companyName"

process_data(
    spark,
    parquet_path,
    local_table_name,
    nessie_table_name,
    partition_column,
)



In [None]:
# Bare integer literal; evaluating the cell just displays 124036762.
# NOTE(review): the meaning is not recorded anywhere visible -- presumably a row count; confirm or delete.
124_036_762

In [None]:
from utils.utils import configure_spark
import pandas as pd

# Table to inspect, and a session from the shared helper.
local_table_name = 'spark_catalog.default.employees'
spark = configure_spark("minio", "main")

# Load the table through the Iceberg reader and print a preview of its rows.
local_df = spark.read.format('iceberg').table(local_table_name)
local_df.show()

In [1]:
import pandas as pd
from utils.utils import configure_spark

local_table_name = 'spark_catalog.default.employees'
spark = configure_spark("minio", "main")

# Load the Iceberg table and expose it to Spark SQL under a temporary view name.
local_df = spark.read.format('iceberg').table(local_table_name)
local_df.createOrReplaceTempView('employees_view')

# Number of rows per companyName.
company_counts_sql = """
    SELECT companyName, COUNT(*) as employee_count
    FROM employees_view
    GROUP BY companyName
"""
result_df = spark.sql(company_counts_sql)

# Collect to pandas so the notebook renders the result as a table.
result_df.toPandas()

Configured MINIO environment


24/08/22 01:51:17 WARN Utils: Your hostname, vision resolves to a loopback address: 127.0.1.1; using 192.168.1.112 instead (on interface enxc8a36204def5)
24/08/22 01:51:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/kdlocpanda/.ivy2/cache
The jars for the packages stored in: /home/kdlocpanda/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-217553ac-1c97-4f62-b02d-2be6a199692e;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/kdlocpanda/yorko_io/open-datalakehouse/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.91.3 in central
	found software.amazon.awssdk#bundle;2.17.81 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 126ms :: artifacts dl 6ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 from central in [default]
	org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.91.3 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	software.amazon.awssdk#bundle;2.17.81 from central in [d

Unnamed: 0,companyName,employee_count
0,Tagcat-6285,485254
1,Jaxworks,341507
2,Jabberbean-6063,430670
3,Viva-2310,433971
4,Jabbersphere-5488,187187
...,...,...
995,Skyble,259178
996,Zoozzy,202266
997,Dynava-3880,373082
998,Realcube-9666,376158


In [None]:
import duckdb

# Connect to (or create) the persistent DuckDB database file
con = duckdb.connect('iceberg_data.duckdb')

# Install and load the Iceberg extension
con.execute("INSTALL iceberg;")
con.execute("LOAD iceberg;")

# Materialize the Iceberg table as a DuckDB table.
# CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE TABLE fails on
# the second run because `employees` already exists in the database file.
con.execute("""
    CREATE OR REPLACE TABLE employees AS
    SELECT *
    FROM iceberg_scan('iceberg_warehouse/default/employees', allow_moved_paths = true);
""")

In [1]:
import duckdb
import pandas as pd

# Open the persistent DuckDB database file.
con = duckdb.connect("iceberg_data.duckdb")

# Rows per companyName, largest group first.
employee_counts_sql = """
    SELECT companyName, COUNT(*) as employee_count
    FROM employees
    GROUP BY companyName ORDER BY employee_count DESC;
"""

# Run the query and fetch the result as a pandas DataFrame.
query_result = con.execute(employee_counts_sql)
df = query_result.fetchdf()

df

Unnamed: 0,companyName,employee_count
0,Zoombeat,499813
1,Voomm-3470,499707
2,Izio-4877,498717
3,Oozz-8135,498524
4,Roomm,498292
...,...,...
995,Wordify,2055
996,Wikivu,1887
997,Bubblebox,1225
998,Edgeblab-9808,503


In [None]:
# Upload the DuckDB database file with the MinIO client (`mc`).
# A bare shell command is a SyntaxError in a Python cell, so the command is
# run through subprocess; check=True raises CalledProcessError if `mc` fails.
import subprocess

subprocess.run(
    ["mc", "cp", "iceberg_data.duckdb", "myminio/upload/iceberg_data.duckdb"],
    check=True,
)