# Step 1
### Analyze and extract the JSON information into three Parquet files

![Step 1: Analyze and Extract](/Volumes/de_demo/default/ev_data/1.%20Analyze%20and%20Extract.png)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, explode, explode_outer, split
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import col
from pyspark.sql.types import ArrayType,StructType
import json

# Create (or reuse) the SparkSession used by every later cell.
# master("local[*]") runs Spark inside this process rather than on a cluster.
spark = (
    SparkSession.builder
    .appName("Read JSON Data")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Load the source JSON from the path below. The "multiline" option tells Spark
# that a single JSON document may span several lines of the file.
file = "/home/jovyan/work/data/ElectricVehiclePopulationData.json"
multiline_df = (
    spark.read
    .option("multiline", "true")
    .json(file)
)

# Show the inferred schema first, then a preview of the rows.
multiline_df.printSchema()
multiline_df.show()

root
 |-- data: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- meta: struct (nullable = true)
 |    |-- view: struct (nullable = true)
 |    |    |-- approvals: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- reviewedAt: long (nullable = true)
 |    |    |    |    |-- reviewedAutomatically: boolean (nullable = true)
 |    |    |    |    |-- state: string (nullable = true)
 |    |    |    |    |-- submissionDetails: struct (nullable = true)
 |    |    |    |    |    |-- permissionType: string (nullable = true)
 |    |    |    |    |-- submissionId: long (nullable = true)
 |    |    |    |    |-- submissionObject: string (nullable = true)
 |    |    |    |    |-- submissionOutcome: string (nullable = true)
 |    |    |    |    |-- submissionOutcomeApplication: struct (nullable = true)
 |    |    |    |    |    |-- failureCount: long (nullable = true

In [3]:
# Split the parsed document into three DataFrames:
#   table_metadata   - every field of meta.view except the "columns" array
#   columns_metadata - one row per entry of meta.view.columns, with the entry's
#                      struct fields promoted to top-level columns
#   vehicle_data     - one row per entry of the top-level "data" array
table_metadata = multiline_df.select("meta.view.*").drop("columns")

columns_metadata = (
    multiline_df
    .select(explode(col("meta.view.columns")).alias("columns"))
    .select("columns.*")
)

vehicle_data = multiline_df.select(
    explode(col("data")).alias("row_data")
)

In [4]:
# Function to flatten the json by iterating through fields - both arrays and structs


def _complex_fields(df):
    """Return ``{column name: data type}`` for the top-level array/struct columns of ``df``."""
    return {
        field.name: field.dataType
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType))
    }


def _quote_identifier(name):
    """Backtick-quote ``name`` so Spark reads it as one identifier, not a dotted path."""
    return "`" + name.replace("`", "``") + "`"


def flatten_json(df, verbose=True):
    """
    Flatten a DataFrame with nested fields (arrays and structs) into plain columns.

    Top-level complex columns are processed one at a time until none remain:

    - a struct column is replaced by one column per sub-field, named
      ``<parent>_<child>``;
    - an array column is exploded with ``explode_outer``, so each element
      becomes its own row under the same column name.

    Exploding more than one array column multiplies the row count.

    Parameters:
    - df: The input DataFrame with complex nested fields.
    - verbose: When True (the default, matching the previous behaviour), print
      the input schema and one progress line per processed column. Pass False
      to keep notebook output short.

    Returns:
    - The flattened DataFrame, with no array or struct columns left at top level.
    """
    if verbose:
        print(df.schema)
        print("")

    complex_fields = _complex_fields(df)
    while complex_fields:
        # Always handle the first remaining complex column; the mapping is
        # rebuilt after every step because each step changes the schema.
        col_name, col_type = next(iter(complex_fields.items()))
        if verbose:
            print("Processing :" + col_name + " Type : " + str(type(col_type)))

        if isinstance(col_type, StructType):
            # Flatten the struct: promote every sub-field to a top-level column.
            # Both path segments are quoted so that a name containing "." is
            # not misread as a deeper nested path.
            # NOTE(review): a generated "<parent>_<child>" name can collide with
            # an existing column of the same name; this is not detected here.
            expanded = [
                col(_quote_identifier(col_name) + "." + _quote_identifier(child.name))
                .alias(col_name + "_" + child.name)
                for child in col_type.fields
            ]
            df = df.select("*", *expanded).drop(col_name)
        else:
            # ArrayType: turn the array elements into rows. The column is
            # referenced through a quoted identifier for the same reason as above.
            df = df.withColumn(col_name, explode_outer(col(_quote_identifier(col_name))))

        complex_fields = _complex_fields(df)
    return df

In [5]:
# Flatten the table-level metadata. The "metadata" field is left out because
# its field names contain special characters.
table_metadata = multiline_df.select("meta.view.*").drop("columns")
table_meta = table_metadata.drop("metadata")
flatten_table_meta_df = flatten_json(table_meta)

# Preview the flattened result: take up to 50 rows into pandas, display the head.
flatten_table_meta_df.limit(50).toPandas().head()

StructType([StructField('approvals', ArrayType(StructType([StructField('reviewedAt', LongType(), True), StructField('reviewedAutomatically', BooleanType(), True), StructField('state', StringType(), True), StructField('submissionDetails', StructType([StructField('permissionType', StringType(), True)]), True), StructField('submissionId', LongType(), True), StructField('submissionObject', StringType(), True), StructField('submissionOutcome', StringType(), True), StructField('submissionOutcomeApplication', StructType([StructField('failureCount', LongType(), True), StructField('status', StringType(), True)]), True), StructField('submittedAt', LongType(), True), StructField('submitter', StructType([StructField('displayName', StringType(), True), StructField('id', StringType(), True)]), True), StructField('workflowId', LongType(), True)]), True), True), StructField('assetType', StringType(), True), StructField('attribution', StringType(), True), StructField('averageRating', LongType(), True),

Unnamed: 0,assetType,attribution,averageRating,category,createdAt,description,displayType,downloadCount,flags,hideFromCatalog,...,tableAuthor_profileImageUrlLarge,tableAuthor_profileImageUrlMedium,tableAuthor_profileImageUrlSmall,tableAuthor_screenName,tableAuthor_type,approvals_submissionDetails_permissionType,approvals_submissionOutcomeApplication_failureCount,approvals_submissionOutcomeApplication_status,approvals_submitter_displayName,approvals_submitter_id
0,dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicl...,table,52628,default,False,...,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
1,dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicl...,table,52628,default,False,...,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
2,dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicl...,table,52628,default,False,...,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
3,dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicl...,table,52628,default,False,...,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
4,dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicl...,table,52628,default,False,...,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7


In [6]:
# Preview the column metadata: pull at most 50 rows into pandas and display the head.
columns_metadata.limit(50).toPandas().head()

Unnamed: 0,computationStrategy,dataTypeName,description,fieldName,flags,format,id,name,position,renderTypeName,tableColumnId
0,,meta_data,,:sid,[hidden],"(None,)",-1,sid,0,meta_data,
1,,meta_data,,:id,[hidden],"(None,)",-1,id,0,meta_data,
2,,meta_data,,:position,[hidden],"(None,)",-1,position,0,meta_data,
3,,meta_data,,:created_at,[hidden],"(None,)",-1,created_at,0,meta_data,
4,,meta_data,,:created_meta,[hidden],"(None,)",-1,created_meta,0,meta_data,


In [7]:
# Preview the raw vehicle rows: each row still holds its values in the single
# array column "row_data". Pull at most 50 rows into pandas and display the head.
vehicle_data.limit(50).toPandas().head()

Unnamed: 0,row_data
0,"[row-zt4k~iszy.uhv6, 00000000-0000-0000-62B4-C..."
1,"[row-5r58~kb8y.789r, 00000000-0000-0000-B54E-F..."
2,"[row-84ix~3wif_u9ju, 00000000-0000-0000-F67B-B..."
3,"[row-wiar-siae_sed9, 00000000-0000-0000-0360-7..."
4,"[row-abd5~finn.nzkg, 00000000-0000-0000-3182-A..."


In [8]:
# Persist both metadata DataFrames as Parquet, replacing any earlier output.
# coalesce(1) reduces each DataFrame to a single partition before the write.
(
    flatten_table_meta_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("/home/jovyan/work/data/raw/derived/table_metadata")
)
(
    columns_metadata
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("/home/jovyan/work/data/raw/derived/columns_metadata")
)



In [12]:
# Column names for the vehicle data come from the "name" field of the column metadata.
data_headers = [row["name"] for row in columns_metadata.select("name").collect()]

# Element i of each row_data array becomes the column named data_headers[i].
# The projection is driven by data_headers instead of a hard-coded count of 28,
# so it keeps working if the source file gains or loses columns.
# NOTE(review): assumes the order of the column metadata matches the order of
# the values inside each data row - confirm against the source file.
vehicle_data_exploded = vehicle_data.select(
    *[
        col("row_data").getItem(position).alias(header)
        for position, header in enumerate(data_headers)
    ]
)

# Write the named vehicle columns as Parquet, replacing any earlier output.
vehicle_data_exploded.write.mode("overwrite").parquet("/home/jovyan/work/data/raw/derived/vehicle_data")



# Findings 

JSON has two elements 
1. meta
2. data 

`meta` holds two kinds of metadata: dataset-level metadata (referred to as *table metadata* from here on) and *column metadata*.

Table metadata consists of various array and struct fields, including the approvals and submission information. It is flattened before being written to the S3 bucket.

column metadata consists of name, data type, description, position and other details. 

The names from the column metadata are extracted, applied as the column headers of the vehicle data, and the result is written as a Parquet file to the derived folder.

![S3 file processing](/Volumes/de_demo/default/ev_data/S3%20File%20processing.png)