In [None]:
import os

from pyspark.sql import SparkSession

# Object-store credentials are taken from the environment when set. The
# fallbacks are the values that used to be hard-coded here, so an existing
# local setup behaves exactly as before.
S3_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
S3_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password")

# Substrings that mark a Spark conf key as sensitive when echoing the config.
SENSITIVE_CONF_MARKERS = ("secret", "password", "token")

# Initialize a Spark session with an Iceberg REST catalog registered as "iceberg".
# NOTE(review): Iceberg's S3FileIO documents the credential properties as
# `s3.access-key-id` / `s3.secret-access-key`; confirm that the
# `s3.access-key` / `s3.secret-key` names below are really consumed by the
# configured client factory.
# NOTE(review): `client.factory` points at a StarRocks connector class; confirm
# that class is on the Spark classpath and is the intended factory.
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse")
    .config("spark.sql.catalog.iceberg.s3.access-key", S3_ACCESS_KEY)
    .config("spark.sql.catalog.iceberg.s3.secret-key", S3_SECRET_KEY)
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory")
    .getOrCreate()
)

print("Spark Running")
# Echo the effective configuration, masking sensitive values so credentials
# do not end up in saved notebook output.
print([
    (key, "***" if any(marker in key.lower() for marker in SENSITIVE_CONF_MARKERS) else value)
    for key, value in spark.sparkContext.getConf().getAll()
])
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

In [None]:
# Load the three raw World Development Indicators tables, in the order
# data, country, series.
raw_tables = (
    "raw.world_development_indicators.WDIData",
    "raw.world_development_indicators.WDICountry",
    "raw.world_development_indicators.WDISeries",
)
wdi_data_df, wdi_country_df, wdi_series_df = [
    spark.read.table(table_name) for table_name in raw_tables
]

In [None]:
# Report how many rows each freshly loaded frame holds.
loaded_frames = (
    ("data", wdi_data_df),
    ("country", wdi_country_df),
    ("series", wdi_series_df),
)
for label, frame in loaded_frames:
    print(f"Number of records for wdi {label} DF: {frame.count()}")

In [None]:
# Replace spaces in column names with underscores ("_") for all DataFrames.

def replace_spaces_in_column_names(df):
    """Return `df` with every space in its column names replaced by "_".

    Args:
        df: A Spark DataFrame.

    Returns:
        A new DataFrame with the same columns in the same order; each column
        name has " " replaced by "_". Names without spaces are left unchanged.
    """
    # toDF assigns the new names positionally in one call, so no per-column
    # withColumnRenamed loop is needed.
    return df.toDF(*(name.replace(" ", "_") for name in df.columns))


# wdi_data
wdi_data_df = replace_spaces_in_column_names(wdi_data_df)
print(f"Updated Column names:: {wdi_data_df.columns}")

# wdi_country
wdi_country_df = replace_spaces_in_column_names(wdi_country_df)
print(f"Updated Column names:: {wdi_country_df.columns}")

# wdi_series
wdi_series_df = replace_spaces_in_column_names(wdi_series_df)
print(f"Updated Column names:: {wdi_series_df.columns}")

In [None]:
# Drop records that consist only of null values.
# For wdi_data only the per-year columns are inspected, so a row is dropped
# when all of its yearly values are null, regardless of its other columns.

# Column names "1960" .. "2020" inclusive.
# NOTE(review): assumes the WDIData table exposes exactly these year columns;
# confirm against the table schema (later years would not be inspected).
year_columns = [str(year) for year in range(1960, 2021)]

wdi_data_df = wdi_data_df.dropna(how="all", subset=year_columns)
wdi_country_df = wdi_country_df.dropna(how="all")
wdi_series_df = wdi_series_df.dropna(how="all")

print(f"Wdi data with null dropped count:: {wdi_data_df.count()}")
print(f"Wdi country with null dropped count:: {wdi_country_df.count()}")
print(f"Wdi series with null dropped count:: {wdi_series_df.count()}")

# Drop duplicate records.
# The deduplicated frames are bound back to the wdi_*_df names so that the
# counts printed below, and every later cell that transforms wdi_*_df,
# operate on deduplicated data.
wdi_data_df = wdi_data_df.dropDuplicates()
wdi_country_df = wdi_country_df.dropDuplicates()
wdi_series_df = wdi_series_df.dropDuplicates()

# Un-suffixed aliases, kept for any later cell that still refers to these names.
wdi_data = wdi_data_df
wdi_country = wdi_country_df
wdi_series = wdi_series_df

print(f"Wdi data with duplicates dropped count:: {wdi_data_df.count()}")
print(f"Wdi country with duplicates dropped count:: {wdi_country_df.count()}")
print(f"Wdi series with duplicates dropped count:: {wdi_series_df.count()}")

In [None]:
# For the WDICountry and WDIData tables:
# drop all records whose country code (column: Country_Code) is not exactly
# three characters long. Rows with a null Country_Code are dropped as well,
# because the comparison then evaluates to null rather than true.
from pyspark.sql.functions import col, length

# A name-based predicate is resolved against whichever frame it is applied to,
# so each filter only ever refers to columns of the frame being filtered.
has_three_char_country_code = length(col("Country_Code")) == 3

wdi_country_df = wdi_country_df.filter(has_three_char_country_code)
print(f"wdi country with filtered country code:: {wdi_country_df.count()}")
wdi_data_df = wdi_data_df.filter(has_three_char_country_code)
print(f"wdi data with filtered country code:: {wdi_data_df.count()}")

In [None]:
# WDISeries: keep only the records whose Series_Code column contains no
# space character (" ").
from pyspark.sql.functions import col

series_code_has_space = col("Series_Code").contains(" ")
wdi_series_df = wdi_series_df.filter(~series_code_has_space)

series_row_count = wdi_series_df.count()
print(f"wdi series with filtered series code:: {series_row_count}")


In [None]:
# Publish the cleaned frames as tables under curated.world_development_indicators.
#
# The frames written are the wdi_*_df ones: they are the names the country-code
# and series-code filter cells reassign, so they carry every cleaning step.
# dropDuplicates() is applied at write time so the output is deduplicated no
# matter how the upstream frames were derived.
curated_frames = {
    "data": wdi_data_df.dropDuplicates(),
    "country": wdi_country_df.dropDuplicates(),
    "series": wdi_series_df.dropDuplicates(),
}

# Create both namespace levels; the tables live in the nested namespace.
spark.sql("CREATE NAMESPACE IF NOT EXISTS curated")
spark.sql("CREATE NAMESPACE IF NOT EXISTS curated.world_development_indicators")

# NOTE: because of IF NOT EXISTS, re-running this cell leaves an already
# existing table untouched; it does not refresh its contents.
for table_name, frame in curated_frames.items():
    temp_view = f"wdi_{table_name}_tempTable"
    frame.createOrReplaceTempView(temp_view)
    try:
        spark.sql(
            f"CREATE TABLE IF NOT EXISTS curated.world_development_indicators.{table_name} "
            f"as select * from {temp_view}"
        )
    finally:
        # Always remove the temp view, even when the table creation fails.
        spark.catalog.dropTempView(temp_view)

In [None]:
# Stop the Spark session created in the first cell; cells that use `spark`
# cannot be re-run after this without creating the session again.
spark.stop()