In [0]:
# Source settings for the geo topic: a glob over the JSON files under partition=0
geo_location = "/mnt/0affc011d3cf-mount/topics/0affc011d3cf.geo/partition=0/*.json"
file_type = "json"
infer_schema = "true"

# Read every matching file into a single Spark DataFrame
# NOTE(review): "inferSchema" looks like a CSV reader option; confirm it has any effect for JSON
df_geo = (
    spark.read
    .format(file_type)
    .option("inferSchema", infer_schema)
    .load(geo_location)
)

# Render the loaded DataFrame
display(df_geo)

country,ind,latitude,longitude,timestamp
British Indian Ocean Territory (Chagos Archipelago),8221,-20.5574,-54.4834,2021-12-29T06:33:46
British Indian Ocean Territory (Chagos Archipelago),7151,-14.6744,-75.3714,2020-06-05T23:37:24
British Indian Ocean Territory (Chagos Archipelago),7569,-86.5675,-149.565,2018-10-16T08:40:26
British Indian Ocean Territory (Chagos Archipelago),6731,-86.5675,-149.565,2021-06-08T19:10:49
British Indian Ocean Territory (Chagos Archipelago),9345,22.1505,-158.823,2020-06-20T08:22:49
Antarctica (the territory South of 60 deg S),9127,-77.9931,-175.682,2021-09-25T14:31:22
Antarctica (the territory South of 60 deg S),5855,-84.9073,-105.769,2022-04-19T19:50:22
Antarctica (the territory South of 60 deg S),7743,-24.9999,-92.9126,2020-06-02T06:18:41
Antarctica (the territory South of 60 deg S),2294,-88.4642,-171.061,2020-05-04T09:26:02
South Georgia and the South Sandwich Islands,10024,-24.7257,89.1335,2021-03-19T02:41:59


In [0]:
from pyspark.sql.functions import array, to_timestamp

def create_coordinates_col(df):
    """Return ``df`` with a ``coordinates`` column added.

    The new column is an array built from the ``latitude`` and ``longitude``
    columns, in that order.
    """
    lat_long_pair = array("latitude", "longitude")
    return df.withColumn("coordinates", lat_long_pair)

def drop_long_lat(df):
    """Return ``df`` without its ``latitude`` and ``longitude`` columns."""
    unwanted_columns = ("latitude", "longitude")
    return df.drop(*unwanted_columns)

def convert_to_timestamp(df):
    """Return ``df`` with ``timestamp`` set to ``to_timestamp("timestamp")``.

    The result is written back under the same column name, ``timestamp``.
    """
    parsed_timestamp = to_timestamp("timestamp")
    return df.withColumn("timestamp", parsed_timestamp)

def order_columns(df):
    """Return ``df`` narrowed to ``ind``, ``country``, ``coordinates``, ``timestamp``.

    The columns are selected in exactly that order; any other column is not
    carried over.
    """
    column_order = ("ind", "country", "coordinates", "timestamp")
    return df.select(*column_order)
  

In [0]:
# Build the cleaned frame from the raw `df_geo` in named stages.
# `df_geo` is deliberately NOT rebound here: the steps below drop the
# latitude/longitude columns, so rebinding it would make a second run of this
# cell fail in create_coordinates_col (which needs those two columns).
geo_with_coordinates = create_coordinates_col(df_geo)
geo_without_lat_long = drop_long_lat(geo_with_coordinates)
geo_with_timestamp = convert_to_timestamp(geo_without_lat_long)

# Final column order: ind, country, coordinates, timestamp
cleaned_geo = order_columns(geo_with_timestamp)

display(cleaned_geo)

ind,country,coordinates,timestamp
8221,British Indian Ocean Territory (Chagos Archipelago),"List(-20.5574, -54.4834)",2021-12-29T06:33:46.000+0000
7151,British Indian Ocean Territory (Chagos Archipelago),"List(-14.6744, -75.3714)",2020-06-05T23:37:24.000+0000
7569,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2018-10-16T08:40:26.000+0000
6731,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2021-06-08T19:10:49.000+0000
9345,British Indian Ocean Territory (Chagos Archipelago),"List(22.1505, -158.823)",2020-06-20T08:22:49.000+0000
9127,Antarctica (the territory South of 60 deg S),"List(-77.9931, -175.682)",2021-09-25T14:31:22.000+0000
5855,Antarctica (the territory South of 60 deg S),"List(-84.9073, -105.769)",2022-04-19T19:50:22.000+0000
7743,Antarctica (the territory South of 60 deg S),"List(-24.9999, -92.9126)",2020-06-02T06:18:41.000+0000
2294,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2020-05-04T09:26:02.000+0000
10024,South Georgia and the South Sandwich Islands,"List(-24.7257, 89.1335)",2021-03-19T02:41:59.000+0000


In [0]:
# Destination path for the cleaned geo data
parquet_save_location = "/mnt/0affc011d3cf-mount/cleaned_geo"

# Write the cleaned DataFrame in Parquet format, replacing any existing output at that path
cleaned_geo.write.parquet(parquet_save_location, mode="overwrite")