In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already
# running. Only the application name is set here; every other setting comes
# from whatever configuration the environment supplies.
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .getOrCreate()
)

# Startup diagnostics: effective configuration, active catalog and UI address.
print("Spark Running")
# NOTE(review): this prints every Spark conf entry; if the environment injects
# catalog or object-store credentials they will land in the cell output.
print(spark.sparkContext.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

In [None]:
def clean_csv(file_path, encoding='utf-8'):
    """Strip trailing whitespace and trailing commas from each line of a file, in place.

    The file is read fully into memory and then overwritten. Every line is
    written back terminated by a single '\\n'. Running the function again on
    an already cleaned file leaves its content unchanged.

    Note that *all* trailing commas on a line are removed, so a row whose last
    fields are empty ends up with fewer fields than it started with.

    Args:
        file_path: Path of the text/CSV file to rewrite.
        encoding: Text encoding used for both reading and writing. Defaults to
            UTF-8 so the result does not depend on the process locale.

    Returns:
        None. The file at ``file_path`` is modified as a side effect.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    with open(file_path, 'w', encoding=encoding) as file:
        # rstrip() drops the newline and trailing blanks first, so commas that
        # were followed only by whitespace are removed as well.
        file.writelines(line.rstrip().rstrip(',') + '\n' for line in lines)

In [None]:
import os
from pathlib import Path

# Load every World Bank CSV in the data directory into its own Iceberg table.
wbi_data_path = "/home/iceberg/data/world_bank_data"
wdi_namespace = "raw.world_development_indicators"

# os.listdir returns entries in arbitrary order; sort so that tables are
# (re)built in the same order on every run.
csv_files = sorted(file for file in os.listdir(wbi_data_path) if file.endswith(".csv"))

spark.sql("CREATE NAMESPACE IF NOT EXISTS raw")
# Create the namespace the tables are actually written to. Some Iceberg
# catalogs reject table creation in a namespace that does not exist yet.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {wdi_namespace}")

# The loop variable is not called `csv`, which would shadow the stdlib module name.
for csv_file in csv_files:
    file_path = os.path.join(wbi_data_path, csv_file)

    # Rewrites the source file in place (trailing commas/whitespace removed)
    # before Spark reads it.
    clean_csv(file_path)

    # Table name = file name without extension, with hyphens made
    # identifier-friendly.
    file_name = Path(file_path).stem.replace("-", "_")

    # Only the header option is set, so no schema inference is requested.
    df = spark.read.option('header', 'true').csv(file_path)

    (
        df.write
        .mode('overwrite')
        .format('iceberg')
        .saveAsTable(f'{wdi_namespace}.{file_name}')
    )


In [None]:
# Load the yearly CO2 passenger-car emission JSON files into Iceberg tables,
# one table per year.
emissions_data_path = "/home/iceberg/data/emissions_data"
emissions_namespace = "raw.co2_passenger_cars_emissions"

# Create the namespace the tables are written to. Some Iceberg catalogs
# reject table creation in a namespace that does not exist yet.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {emissions_namespace}")

for year in [2017, 2018, 2019]:
    file_path = f"{emissions_data_path}/co2_emissions_passenger_cars_{year}.json"

    # Table name = file name without the .json extension.
    file_name = Path(file_path).stem

    # multiline: the reader is told a JSON record may span several lines.
    df = spark.read.option("multiline", "true").json(file_path)

    (
        df.write
        .mode('overwrite')
        .format('iceberg')
        .saveAsTable(f'{emissions_namespace}.{file_name}')
    )


In [None]:
# Fully qualified names of the two tables inspected in the rest of the notebook.
wdi_table_name = "raw.world_development_indicators.WDIData"
co2_2017_table_name = "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2017"

# Read both tables back from the catalog.
iceberg_data_df = spark.read.table(wdi_table_name)
iceberg_co2_emissions_df = spark.read.table(co2_2017_table_name)

# Print each schema, World Development Indicators first.
for loaded_df in (iceberg_data_df, iceberg_co2_emissions_df):
    loaded_df.printSchema()

In [None]:
# Row count of each table; every count() triggers its own Spark job.
co2_record_count = iceberg_co2_emissions_df.count()
print(f"Number of records  for CO2 emissions DF: {co2_record_count}")

wdi_record_count = iceberg_data_df.count()
print(f"Number of records  for World Development Indicators: {wdi_record_count}")

In [None]:
# describe() summary of the 2017 CO2 emissions table.
co2_emissions_summary = iceberg_co2_emissions_df.describe()
display(co2_emissions_summary)

In [None]:
# describe() summary of the World Development Indicators table.
wdi_summary = iceberg_data_df.describe()
display(wdi_summary)

In [None]:
# head() is called without a row count, so only the first row is fetched.
wdi_first_row = iceberg_data_df.head()
display(wdi_first_row)

In [None]:
# head() is called without a row count, so only the first row is fetched.
co2_first_row = iceberg_co2_emissions_df.head()
display(co2_first_row)

In [None]:
# Final catalog overview, then teardown. The listing has to run while the
# session is still alive: stopping the session first makes the catalog calls
# below fail on a top-to-bottom run.
try:
    databases = spark.catalog.listDatabases()

    # Print database names
    for db in databases:
        print(db.name)

    print("current catalog:", spark.catalog.currentCatalog())
finally:
    # Stop the session last, even if the listing above raised.
    spark.stop()