In [1]:
import os

from pyspark.sql import SparkSession

# Object-store credentials are taken from the environment when set. The
# fallbacks are the literal values this notebook previously hardcoded, so an
# unconfigured environment behaves exactly as before.
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "admin")
s3_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "password")

# Initialize Spark session.
# getOrCreate() returns an already-running session if there is one; in that
# case Spark warns that only runtime SQL configurations take effect, so the
# settings below may not all be applied.
# NOTE(review): the client factory class is in a StarRocks package namespace --
# confirm it is actually available on this Spark classpath.
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse")
    .config("spark.sql.catalog.iceberg.s3.access-key", s3_access_key)
    .config("spark.sql.catalog.iceberg.s3.secret-key", s3_secret_key)
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory")
    .getOrCreate()
)

print("Spark Running")

# Print the effective SparkContext configuration, masking any entry whose key
# looks like a credential so secrets do not end up in the saved cell output.
sensitive_markers = ("secret", "password", "access-key", "access.key", "token")
print([
    (key, "***" if any(marker in key.lower() for marker in sensitive_markers) else value)
    for key, value in spark.sparkContext.getConf().getAll()
])

Spark Running
[('spark.eventLog.enabled', 'true'), ('spark.driver.host', 'eaacf646f70f'), ('spark.history.fs.logDirectory', '/home/iceberg/spark-events'), ('spark.sql.catalog.demo.s3.endpoint', 'http://minio:9000'), ('spark.eventLog.dir', '/home/iceberg/spark-events'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.deployMode', 'client'), ('spark.driver.port', '34313'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=AL

24/07/13 23:13:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Load the three raw World Development Indicators tables, one DataFrame each,
# in the order data / country / series.
raw_namespace = "raw.world_development_indicators"

wdi_data_df, wdi_country_df, wdi_series_df = (
    spark.read.table(f"{raw_namespace}.{raw_table}")
    for raw_table in ("WDIData", "WDICountry", "WDISeries")
)

In [3]:
# Report the row count of each raw DataFrame (each count() triggers a Spark job).
for frame_label, frame in (
    ("data", wdi_data_df),
    ("country", wdi_country_df),
    ("series", wdi_series_df),
):
    print(f"Number of records for wdi {frame_label} DF: {frame.count()}")

Number of records for wdi data DF: 383838
Number of records for wdi country DF: 270
Number of records for wdi series DF: 4274


In [4]:
# Replace spaces in column names with underscores ("_") for all DataFrames.


def replace_spaces_in_column_names(df):
    """Return ``df`` with every space in its column names replaced by "_".

    Columns whose names contain no space are left untouched.

    Args:
        df: The Spark DataFrame whose columns should be renamed.

    Returns:
        A DataFrame with the same data and the renamed columns.
    """
    for column in df.columns:
        if " " in column:
            df = df.withColumnRenamed(column, column.replace(" ", "_"))
    return df


# wdi_data
wdi_data_df = replace_spaces_in_column_names(wdi_data_df)
print(f"Updated Column names:: {wdi_data_df.columns}")

# wdi_country
wdi_country_df = replace_spaces_in_column_names(wdi_country_df)
print(f"Updated Column names:: {wdi_country_df.columns}")

# wdi_series
wdi_series_df = replace_spaces_in_column_names(wdi_series_df)
print(f"Updated Column names:: {wdi_series_df.columns}")

Updated Column names:: ['Country_Name', 'Country_Code', 'Indicator_Name', 'Indicator_Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
Updated Column names:: ['Country_Code', 'Short_Name', 'Table_Name', 'Long_Name', '2-alpha_code', 'Currency_Unit', 'Special_Notes', 'Region', 'Income_Group', 'WB-2_code', 'National_accounts_base_year', 'National_accounts_reference_year', 'SNA_price_valuation', 'Lending_category', 'Other_groups', 'System_of_National_Accounts', 'Alternative_conversion_factor', 'PPP_survey_year', 'Balance_of_Payments_Manual_in_use', 'Ext

In [5]:
# Drop records that only consist of null values.
# For wdi_data the check is restricted (via `subset`) to the year columns
# 1960-2020: a row is dropped when every one of those is null. The other two
# frames are checked across all of their columns.

year_columns = [str(year) for year in range(1960, 2021)]

wdi_data_df = wdi_data_df.dropna(how="all", subset=year_columns)
wdi_country_df = wdi_country_df.dropna(how="all")
wdi_series_df = wdi_series_df.dropna(how="all")

print(f"Wdi data with null dropped count:: {wdi_data_df.count()}")
print(f"Wdi country with null dropped count:: {wdi_country_df.count()}")
print(f"Wdi series with null dropped count:: {wdi_series_df.count()}")

# Drop duplicate records.
# The result is assigned back to the *_df variables so that the counts below,
# and every later step that uses *_df, actually see the de-duplicated data.

wdi_data_df = wdi_data_df.dropDuplicates()
wdi_country_df = wdi_country_df.dropDuplicates()
wdi_series_df = wdi_series_df.dropDuplicates()

# The un-suffixed names are kept as aliases of the same frames so any code
# that refers to them keeps working.
wdi_data = wdi_data_df
wdi_country = wdi_country_df
wdi_series = wdi_series_df

print(f"Wdi data with duplicates dropped count:: {wdi_data_df.count()}")
print(f"Wdi country with duplicates dropped count:: {wdi_country_df.count()}")
print(f"Wdi series with duplicates dropped count:: {wdi_series_df.count()}")

24/07/13 23:20:06 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Wdi data with null dropped count:: 280622
Wdi country with null dropped count:: 270
Wdi series with null dropped count:: 4274


                                                                                

Wdi data with duplicates dropped count:: 280622
Wdi country with duplicates dropped count:: 270
Wdi series with duplicates dropped count:: 4274


In [6]:
# For the WDICountry.csv and WDIData.csv files
# Drop all records that have a country code (column: Country_Code) with a size other than three
from pyspark.sql.functions import col, length

# The column is referenced by name, so it is resolved against whichever frame
# is being filtered instead of being borrowed from a different DataFrame
# object. Rows with a null Country_Code do not satisfy the predicate and are
# dropped as well.
has_three_char_country_code = length(col("Country_Code")) == 3

wdi_country_df = wdi_country_df.filter(has_three_char_country_code)
print(f"wdi country with filtered country code:: {wdi_country_df.count()}")
wdi_data_df = wdi_data_df.filter(has_three_char_country_code)
print(f"wdi data with filtered country code:: {wdi_data_df.count()}")

wdi country with filtered country code:: 265


[Stage 30:>                                                         (0 + 4) / 4]

wdi data with filtered country code:: 280622


                                                                                

In [7]:
# For WDISeries.csv: keep only the records whose Series_Code has no space
# character (" ") in it.
from pyspark.sql.functions import col

series_code_has_space = col("Series_Code").contains(" ")
wdi_series_df = wdi_series_df.where(~series_code_has_space)

print(f"wdi series with filtered series code:: {wdi_series_df.count()}")


wdi series with filtered series code:: 1508


In [9]:
# Make sure the curated namespace exists, then persist each cleaned frame as a
# table inside it, replacing any previous contents.
curated_namespace = "curated.world_development_indicators"
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {curated_namespace}")

for curated_table, cleaned_frame in (
    ("data", wdi_data_df),
    ("series", wdi_series_df),
    ("country", wdi_country_df),
):
    cleaned_frame.write.saveAsTable(
        name=f"{curated_namespace}.{curated_table}",
        mode="overwrite",
    )

                                                                                

In [10]:
# Stop the Spark session at the end of the notebook.
spark.stop()