Create a schema for the CSV file and import it as a DataFrame

In [0]:
from pyspark.sql.types import StructType, StructField, ShortType, StringType, FloatType

# Column layout of the source CSV as (name, Spark type) pairs, in file order.
# Declaring the types explicitly fixes them up front instead of leaving them to inference.
csv_columns = [
    ("STATISTIC", StringType()),
    ("STATISTIC Label", StringType()),
    ("TLIST(A1)", ShortType()),
    ("Census Year", ShortType()),
    ("C04167V04938", StringType()),
    ("Electoral Divisions", StringType()),
    ("UNIT", StringType()),
    ("VALUE", FloatType()),
]
df_schema = StructType([StructField(name, dtype) for name, dtype in csv_columns])

# Read the bronze-layer CSV; header=True tells Spark the first line holds column names, not data.
df = spark.read.load(
    '/bronze/Population_Density_and_Area_Size.csv',
    format='csv',
    header=True,
    schema=df_schema,
)

Drop some columns: 
- The 1st column is redundant.
- The 3rd column is a duplicate of the 4th column.
- The 5th and 7th columns are redundant.

In [0]:
# Zero-based positions of the unwanted columns (the 1st, 3rd, 5th and 7th).
indices_to_drop = [0, 2, 4, 6]

# Translate those positions into the names currently carried by the DataFrame.
all_columns = df.columns
columns_to_drop = list(map(all_columns.__getitem__, indices_to_drop))

# drop() takes column names as separate arguments, hence the unpacking.
df = df.drop(*columns_to_drop)


Rename some columns

In [0]:
# Old name -> new name for every column that should be renamed.
columns_to_rename = {
    "STATISTIC Label": "Statistic_label",
    "Census Year": "Census_year",
    "Electoral Divisions": "Electoral_divisions",
    "VALUE": "Value"
}

# Rebuild the full column list in a single pass: columns present in the mapping
# take their new name, every other column keeps the name it already has.
df = df.toDF(*[columns_to_rename.get(name, name) for name in df.columns])

Replace null and blank values in string columns with "Unknown"

In [0]:
from pyspark.sql.functions import when, lit, col, trim

# Text columns that must never be empty downstream.
string_cols = ['Statistic_label', 'Electoral_divisions']

for col_name in string_cols:
    # A value counts as missing when it is NULL or contains nothing but whitespace.
    # The comparison is made on the *trimmed* value: checking the raw value against ""
    # would let whitespace-only cells (e.g. "  ") through, and a later trim step would
    # then turn them into empty strings instead of "Unknown".
    is_missing = col(col_name).isNull() | (trim(col(col_name)) == "")

    # Missing values become the literal "Unknown"; everything else is kept unchanged.
    df = df.withColumn(
        col_name,
        when(is_missing, lit("Unknown")).otherwise(col(col_name))
    )

Trim string columns

In [0]:
from pyspark.sql.functions import trim

# Strip leading and trailing whitespace from each text column in turn.
for text_column in string_cols:
    df = df.withColumn(text_column, trim(col(text_column)))

Write to the silver layer as a Delta table

In [0]:
from delta.tables import DeltaTable

# Persist the cleaned DataFrame in Delta format at the silver-layer path.
# NOTE(review): mode "append" adds rows to whatever already exists at this path, so
# re-running the notebook writes the same rows again — confirm append (vs overwrite) is intended.
(
    df.write
    .format("delta")
    .mode("append")
    .save("/delta/demographic_data_silver")
)