In [1]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Iceberg Catalog Setup") \
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog") \
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181") \
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse") \
    .config("spark.sql.catalog.iceberg.s3.access-key", "admin") \
    .config("spark.sql.catalog.iceberg.s3.secret-key", "password") \
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000") \
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true") \
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory") \
    .getOrCreate()

print("Spark Running")
print(spark.sparkContext.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

24/07/15 00:41:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Spark Running
[('spark.eventLog.enabled', 'true'), ('spark.driver.cores', '4'), ('spark.task.cpus', '4'), ('spark.app.id', 'local-1721004109123'), ('spark.executor.cores', '4'), ('spark.history.fs.logDirectory', '/home/iceberg/spark-events'), ('spark.sql.catalog.demo.s3.endpoint', 'http://minio:9000'), ('spark.eventLog.dir', '/home/iceberg/spark-events'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.memory', '8g'), ('spark.submit.deployMode', 'client'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=ja

In [2]:
def clean_csv(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    with open(file_path, 'w') as file:
        for line in lines:
            file.write(line.rstrip().rstrip(',') + '\n')

In [3]:
import os
from pathlib import Path

wbi_data_path = "/home/iceberg/data/world_bank_data"
csv_files = [file for file in os.listdir(wbi_data_path) if file.endswith(".csv")]

spark.sql("CREATE NAMESPACE IF NOT EXISTS raw")

for csv in csv_files:
  file_path = os.path.join(wbi_data_path, csv)
  clean_csv(file_path)
  file_name = Path(file_path).stem
  file_name = file_name.replace("-", "_")

  df = spark.read.option('header', 'true').csv(file_path)
  df.createOrReplaceTempView(f"{file_name}_tempTable")

  spark.sql(f"CREATE TABLE IF NOT EXISTS raw.world_development_indicators.{file_name} as select * from {file_name}_tempTable")
  spark.catalog.dropTempView(f"{file_name}_tempTable")


24/07/15 00:42:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [4]:
emissions_data_path = "/home/iceberg/data/emissions_data"

for year in [2017, 2018, 2019]:
  file_path = f"{emissions_data_path}/co2_emissions_passenger_cars_{year}.json"
  file_name = Path(file_path).stem

  df = spark.read.option("multiline","true").json(file_path)
  df.createOrReplaceTempView(f"{file_name}_tempTable")
  spark.sql(f"CREATE TABLE IF NOT EXISTS raw.co2_passenger_cars_emissions.{file_name} as select * from {file_name}_tempTable")
  spark.catalog.dropTempView(f"{file_name}_tempTable")


                                                                                

In [5]:
iceberg_data_df = spark.read.table("raw.world_development_indicators.WDIData")
iceberg_co2_emissions_df = spark.read.table("raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2017")

iceberg_data_df.printSchema()
iceberg_co2_emissions_df.printSchema()

root
 |-- Country Name: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- Indicator Name: string (nullable = true)
 |-- Indicator Code: string (nullable = true)
 |-- 1960: string (nullable = true)
 |-- 1961: string (nullable = true)
 |-- 1962: string (nullable = true)
 |-- 1963: string (nullable = true)
 |-- 1964: string (nullable = true)
 |-- 1965: string (nullable = true)
 |-- 1966: string (nullable = true)
 |-- 1967: string (nullable = true)
 |-- 1968: string (nullable = true)
 |-- 1969: string (nullable = true)
 |-- 1970: string (nullable = true)
 |-- 1971: string (nullable = true)
 |-- 1972: string (nullable = true)
 |-- 1973: string (nullable = true)
 |-- 1974: string (nullable = true)
 |-- 1975: string (nullable = true)
 |-- 1976: string (nullable = true)
 |-- 1977: string (nullable = true)
 |-- 1978: string (nullable = true)
 |-- 1979: string (nullable = true)
 |-- 1980: string (nullable = true)
 |-- 1981: string (nullable = true)
 |-- 1982: string (null

In [6]:
print(f"Number of records  for CO2 emissions DF: {iceberg_co2_emissions_df.count()}")
print(f"Number of records  for World Development Indicators: {iceberg_data_df.count()}")

Number of records  for CO2 emissions DF: 100000
Number of records  for World Development Indicators: 383838


In [7]:
display(iceberg_co2_emissions_df.describe())

DataFrame[summary: string, At1 (mm): string, At2 (mm): string, Cn: string, Cr: string, Ct: string, De: string, E (g/km): string, Enedc (g/km): string, Er (g/km): string, Ernedc (g/km): string, Erwltp (g/km): string, Ewltp (g/km): string, Fm: string, Ft: string, ID: string, It: string, MMS: string, MS: string, Man: string, Mh: string, Mk: string, Mp: string, Mt: string, Status: string, T: string, TAN: string, VFN: string, Va: string, Ve: string, Vf: string, W (mm): string, Zr: string, ec (cm3): string, ep (KW): string, m (kg): string, r: string, version_file: string, year: string, z (Wh/km): string]

In [8]:
display(iceberg_data_df.describe())

DataFrame[summary: string, Country Name: string, Country Code: string, Indicator Name: string, Indicator Code: string, 1960: string, 1961: string, 1962: string, 1963: string, 1964: string, 1965: string, 1966: string, 1967: string, 1968: string, 1969: string, 1970: string, 1971: string, 1972: string, 1973: string, 1974: string, 1975: string, 1976: string, 1977: string, 1978: string, 1979: string, 1980: string, 1981: string, 1982: string, 1983: string, 1984: string, 1985: string, 1986: string, 1987: string, 1988: string, 1989: string, 1990: string, 1991: string, 1992: string, 1993: string, 1994: string, 1995: string, 1996: string, 1997: string, 1998: string, 1999: string, 2000: string, 2001: string, 2002: string, 2003: string, 2004: string, 2005: string, 2006: string, 2007: string, 2008: string, 2009: string, 2010: string, 2011: string, 2012: string, 2013: string, 2014: string, 2015: string, 2016: string, 2017: string, 2018: string, 2019: string, 2020: string]

In [9]:
display(iceberg_data_df.head())

Row(Country Name='Africa Eastern and Southern', Country Code='AFE', Indicator Name='Access to clean fuels and technologies for cooking (% of population)', Indicator Code='EG.CFT.ACCS.ZS', 1960=None, 1961=None, 1962=None, 1963=None, 1964=None, 1965=None, 1966=None, 1967=None, 1968=None, 1969=None, 1970=None, 1971=None, 1972=None, 1973=None, 1974=None, 1975=None, 1976=None, 1977=None, 1978=None, 1979=None, 1980=None, 1981=None, 1982=None, 1983=None, 1984=None, 1985=None, 1986=None, 1987=None, 1988=None, 1989=None, 1990=None, 1991=None, 1992=None, 1993=None, 1994=None, 1995=None, 1996=None, 1997=None, 1998=None, 1999=None, 2000='12.205985334256', 2001='12.5493332143826', 2002='12.8877052744847', 2003='13.2263347774934', 2004='13.5755915831281', 2005='13.9243539706952', 2006='14.2660310520972', 2007='14.5962971181626', 2008='14.9552860566119', 2009='15.2810233296453', 2010='15.6312911024101', 2011='15.9812561302977', 2012='16.320474865037', 2013='16.6432428558172', 2014='16.9946949543723',

In [10]:
display(iceberg_co2_emissions_df.head())

Row(At1 (mm)=1679, At2 (mm)=1632, Cn='458 SPECIALE A AD S-A', Cr='', Ct='M1', De=None, E (g/km)=None, Enedc (g/km)=559, Er (g/km)=None, Ernedc (g/km)=None, Erwltp (g/km)=None, Ewltp (g/km)=None, Fm='M', Ft='petrol', ID=416839, It='', MMS='FERRARI', MS='GB', Man='FERRARI SPA', Mh='FERRARI', Mk='FERRARI', Mp='', Mt=None, Status='F', T='F142', TAN='e3*2007/46*0040*10', VFN='', Va='AB', Ve='L', Vf=None, W (mm)=2650, Zr=None, ec (cm3)=4497, ep (KW)=None, m (kg)=1485, r=1, version_file='v16', year=2017, z (Wh/km)=None)

In [11]:
spark.stop()