# World Development Indicators Data
- ***WDIData.csv***: This is the main data file that contains the values of different indicators per country for years ranging from 1960 to 2020.
- ***WDICountry.csv***: This file contains additional columns for each country.
- ***WDISeries.csv***: This file contains additional columns for each series. (A series is basically a collection of data for a specific indicator and a specific set of countries over a period of time.)

In [1]:
# Read the csvs above into a spark dataframe
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder.profile("DEFAULT").getOrCreate()

In [2]:
date_to_get = "20240705"

wdi_data = (spark.read
                                        .option('header', 'true')
                                        .csv(f'dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/raw/world_development_indicators/date={date_to_get}/world_bank_data/WDIData.csv'))

wdi_country = (spark.read
                                        .option('header', 'true')
                                        .csv(f'dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/raw/world_development_indicators/date={date_to_get}/world_bank_data/WDICountry.csv'))

wdi_series = (spark.read
                                        .option('header', 'true')
                                        .csv(f'dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/raw/world_development_indicators/date={date_to_get}/world_bank_data/WDISeries.csv'))

In [3]:
print(f"Number of records  for wdi data DF: {wdi_data.count()}")
print(f"Number of records  for wdi country DF: {wdi_country.count()}")
print(f"Number of records  for wdi series DF: {wdi_series.count()}")

Number of records  for wdi data DF: 383838
Number of records  for wdi country DF: 270
Number of records  for wdi series DF: 4274


In [4]:
# Replace spaces in column names with underscores (“_”) for all DataFrames.

# wdi_data 
wdi_data_columns = wdi_data.columns

for column in wdi_data_columns:
  if column.__contains__(" "):
    new_column_name = column.replace(" ", "_")
    wdi_data = wdi_data.withColumnRenamed(column, new_column_name)

print(f"Updated Column names:: {wdi_data.columns}")


# wdi_country
wdi_country_columns = wdi_country.columns

for column in wdi_country_columns:
  if column.__contains__(" "):
    new_column_name = column.replace(" ", "_")
    wdi_country = wdi_country.withColumnRenamed(column, new_column_name)

print(f"Updated Column names:: {wdi_country.columns}")

# wdi_series
wdi_series_columns = wdi_series.columns

for column in wdi_series_columns:
  if column.__contains__(" "):
    new_column_name = column.replace(" ", "_")
    wdi_series = wdi_series.withColumnRenamed(column, new_column_name)

print(f"Updated Column names:: {wdi_series.columns}")

Updated Column names:: ['Country_Name', 'Country_Code', 'Indicator_Name', 'Indicator_Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '_c65']
Updated Column names:: ['Country_Code', 'Short_Name', 'Table_Name', 'Long_Name', '2-alpha_code', 'Currency_Unit', 'Special_Notes', 'Region', 'Income_Group', 'WB-2_code', 'National_accounts_base_year', 'National_accounts_reference_year', 'SNA_price_valuation', 'Lending_category', 'Other_groups', 'System_of_National_Accounts', 'Alternative_conversion_factor', 'PPP_survey_year', 'Balance_of_Payments_Manual_in_us

## Apply data quality filters on the data for each file.

In [5]:
# Drop records that only consist of null values (records with null values on all columns).

year_columns = list(str(year) for year in range(1960, 2021))

wdi_data = wdi_data.dropna(how="all", subset=year_columns)
wdi_country = wdi_country.dropna(how="all")
wdi_series = wdi_series.dropna(how="all")

print(f"Wdi data with null dropped count:: {wdi_data.count()}")
print(f"Wdi country with null dropped count:: {wdi_country.count()}")
print(f"Wdi series with null dropped count:: {wdi_series.count()}")

# Drop duplicate records

wdi_data = wdi_data.dropDuplicates()
wdi_country = wdi_country.dropDuplicates()
wdi_series = wdi_series.dropDuplicates()

print(f"Wdi data with duplicates dropped count:: {wdi_data.count()}")
print(f"Wdi country with duplicates dropped count:: {wdi_country.count()}")
print(f"Wdi series with duplicates dropped count:: {wdi_series.count()}")

Wdi data with null dropped count:: 280622
Wdi country with null dropped count:: 270
Wdi series with null dropped count:: 4274
Wdi data with duplicates dropped count:: 280622
Wdi country with duplicates dropped count:: 270
Wdi series with duplicates dropped count:: 2310


In [6]:
# For the WDICountry.csv and WDIData.csv files
# Drop all records that have a country code (column: Country_Code) with a size other than three
from pyspark.sql.functions import length

wdi_country = wdi_country.filter(length(wdi_country.Country_Code) == 3)
print(f"wdi country with filtered country code:: {wdi_country.count()}")
wdi_data = wdi_data.filter(length(wdi_data.Country_Code) == 3)
print(f"wdi data with filtered country code:: {wdi_data.count()}")



wdi country with filtered country code:: 265
wdi data with filtered country code:: 280622


In [7]:
# For WDISeries.csv, drop all records that contain a space character (" ") in the Series_Code column.
from pyspark.sql.functions import col

wdi_series = wdi_series.filter(~col("Series_Code").contains(" "))
print(f"wdi series with filtered series code:: {wdi_series.count()}")


wdi series with filtered series code:: 1470


Write the data to the data lake’s curated layer on DBFS (/datalake/curated/) under the following paths. The data should be in Parquet format and partitioned based on the current date, with one output file per partition

Create the following external tables on top of the data:

- wdi_curated.country for the countries DataFrame
- wdi_curated.series for the series DataFrame
- wdi_curated.data for the main DataFrame

In [13]:
from datetime import datetime
current_year = datetime.now().year
current_month = datetime.now().month
current_day = datetime.now().day

dbfs_wdi_data_path = f"dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/curated/world_development_indicators/data/year={current_year}/month={current_month}/day={current_day}/"
dbfs_wdi_series_path = f"dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/curated/world_development_indicators/series/year={current_year}/month={current_month}/day={current_day}/"
dbfs_wdi_country_path = f"dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/curated/world_development_indicators/country/year={current_year}/month={current_month}/day={current_day}/"

wdi_data.coalesce(1).write.mode("overwrite").parquet(dbfs_wdi_data_path)
wdi_series.coalesce(1).write.mode("overwrite").parquet(dbfs_wdi_series_path)
wdi_country.coalesce(1).write.mode("overwrite").parquet(dbfs_wdi_country_path)