Import the GeoJSON file as a DataFrame and remove the first 5 rows, as they contain only metadata.

In [0]:
# Load the raw GeoJSON export from the bronze layer using Spark's JSON data source.
df = spark.read.format("json").load('/bronze/Public_EV_Charging_Points_SDCC.geojson')

In [0]:
from pyspark.sql.functions import col, monotonically_increasing_id

# Tag every row with a generated "ID", then keep only the rows whose ID is 5 or
# higher (i.e. discard IDs 0-4, the leading metadata rows).
# NOTE(review): monotonically_increasing_id() is not guaranteed to produce
# consecutive values across partitions — confirm the five metadata rows always
# receive IDs 0-4 for this input.
df = (
    df.withColumn("ID", monotonically_increasing_id())
      .where(col("ID") >= 5)
)

Keep only the **properties** column, flatten the dataframe and drop null rows

In [0]:
# Project down to the nested "properties" column; every other column is discarded.
df = df.select(df["properties"])

In [0]:
from pyspark.sql.functions import expr

# (nested field path, output column name) pairs, in output order.
# "LEA" is surfaced as "Town"; every other field keeps its own name.
property_columns = [
    ("properties.LEA", "Town"),
    ("properties.Location", "Location"),
    ("properties.Number_of_chargers", "Number_of_chargers"),
    ("properties.ObjectID", "ObjectID"),
    ("properties.Operator", "Operator"),
    ("properties.Rating", "Rating"),
    ("properties.Type", "Type"),
]

# Lift the nested fields into top-level columns so the result is a flat table.
df = df.select([expr(path).alias(name) for path, name in property_columns])

In [0]:
# Discard rows containing nulls (dropna() is the DataFrame-level alias of na.drop()).
# NOTE(review): with default arguments this removes a row if ANY column is null,
# which would leave nothing for the later null -> "Unknown" replacement to do — confirm intent.
df = df.dropna()

Replace null and blank values in string columns with "Unknown"

In [0]:
from pyspark.sql.functions import when, lit, col, trim

string_cols = ['Town', 'Location', 'Operator', 'Rating', 'Type']

# Substitute "Unknown" for values that are null, empty, or whitespace-only.
# The check trims the value first: a later cell trims these columns, so a
# whitespace-only value that slipped past a plain == "" comparison would end up
# as an empty string in the output instead of "Unknown".
for col_name in string_cols:
    is_missing = col(col_name).isNull() | (trim(col(col_name)) == "")
    df = df.withColumn(
        col_name,
        when(is_missing, lit("Unknown")).otherwise(col(col_name)),
    )

Extract the left-most word from **Town** (the text before the first whitespace, hyphen or underscore) using a regular expression

In [0]:
from pyspark.sql.functions import regexp_extract

# Keep only the first word of Town: the run of characters before the first
# whitespace, hyphen or underscore.
# - Leading whitespace is skipped (\s*) and the word is taken from capture
#   group 1; the pattern is anchored at the start, so without this a value
#   such as " Tallaght" would match nothing and become "" (trimming only
#   happens in a later cell).
# - The hyphen is placed last in the character class so it is unambiguously a
#   literal rather than looking like a "\s-_" range.
df = df.withColumn("Town", regexp_extract(col("Town"), r"^\s*([^\s_-]+)", 1))

Trim string columns

In [0]:
from pyspark.sql.functions import trim

# Strip surrounding spaces from every string column in a single projection;
# all other columns pass through untouched and column order is preserved.
df = df.select([
    trim(df[name]).alias(name) if name in string_cols else df[name]
    for name in df.columns
])

Convert datatypes

In [0]:
from pyspark.sql.types import StringType, LongType, ByteType

# Target type for each column: every string column -> StringType,
# ObjectID -> LongType, Number_of_chargers -> ByteType.
target_types = {name: StringType() for name in string_cols}
target_types["ObjectID"] = LongType()
target_types["Number_of_chargers"] = ByteType()

# Apply the casts one column at a time, in the order declared above.
for name, dtype in target_types.items():
    df = df.withColumn(name, col(name).cast(dtype))

Write to silver layer as a delta table

In [0]:
from delta.tables import DeltaTable

# Persist the cleaned dataframe to the silver layer in Delta format.
# NOTE(review): mode="append" adds these rows to whatever is already stored at
# the path, so re-running the notebook writes the same data again — confirm
# that append (rather than overwrite) is intended.
df.write.save("/delta/EV_charging_points_silver", format="delta", mode="append")