In [None]:
#Intended use: call printSchema() on a DataFrame read from a JSON file to
#get its inferred schema, then turn that text into an explicit schema
#definition without needing manual formatting,
#to speed up pipeline creation

In [1]:
#need datatypes to use for pretty print formatting
from pyspark.sql.types import *

In [2]:
#example of a printSchema from crime data 2020-2026-01-30
#pasted verbatim as a triple-quoted string; this is the sample input for the
#parser defined below. All 28 columns are reported as nullable strings.
#The leading blank line and the "root" header do not start with "|--",
#so the parser ignores them.
inferred_schema_text = """
root
 |-- area: string (nullable = true)
 |-- area_name: string (nullable = true)
 |-- crm_cd: string (nullable = true)
 |-- crm_cd_1: string (nullable = true)
 |-- crm_cd_2: string (nullable = true)
 |-- crm_cd_3: string (nullable = true)
 |-- crm_cd_4: string (nullable = true)
 |-- crm_cd_desc: string (nullable = true)
 |-- cross_street: string (nullable = true)
 |-- date_occ: string (nullable = true)
 |-- date_rptd: string (nullable = true)
 |-- dr_no: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- location: string (nullable = true)
 |-- lon: string (nullable = true)
 |-- mocodes: string (nullable = true)
 |-- part_1_2: string (nullable = true)
 |-- premis_cd: string (nullable = true)
 |-- premis_desc: string (nullable = true)
 |-- rpt_dist_no: string (nullable = true)
 |-- status: string (nullable = true)
 |-- status_desc: string (nullable = true)
 |-- time_occ: string (nullable = true)
 |-- vict_age: string (nullable = true)
 |-- vict_descent: string (nullable = true)
 |-- vict_sex: string (nullable = true)
 |-- weapon_desc: string (nullable = true)
 |-- weapon_used_cd: string (nullable = true)
"""

In [3]:
#function to parse the printSchema output into easier format to use
def inferred_schema_formatter(inferred_schema_text, type_map):
    """Parse ``printSchema()`` text into a list of ``StructField`` objects.

    Only lines that start with ``|--`` once surrounding whitespace is
    stripped are parsed. Everything else (the ``root`` header, blank lines,
    and nested-field lines, which start with ``|`` followed by indentation)
    is skipped.

    Args:
        inferred_schema_text: Multi-line text in the
            ``|-- name: type (nullable = true)`` layout.
        type_map: Mapping from lower-case type name to the object passed to
            ``StructField`` as the field's data type.

    Returns:
        list: One ``StructField(name, type, nullable)`` per parsed line, in
        input order. Empty when the text contains no field lines.

    Raises:
        ValueError: If a field line does not follow the expected layout, or
            if its type name is not a key of ``type_map``.
    """
    inferred_schema_fields = []

    for raw_line in inferred_schema_text.split('\n'):
        line = raw_line.strip()
        if not line.startswith('|--'):
            continue

        # Drop the "|-- " prefix (three marker characters plus one space).
        entry = line[4:]

        # Parse from the right-hand side: the trailing "(nullable = ...)"
        # clause and the type name have a fixed shape, whereas the column
        # name is free text that may itself contain ':' or ' ('.
        head, paren, flags = entry.rpartition(' (')
        col_name, colon, type_text = head.rpartition(':')
        if not paren or not colon or not flags.endswith(')'):
            raise ValueError(f"Malformed schema line: {raw_line!r}")

        data_type = type_text.strip().lower()
        if data_type not in type_map:
            raise ValueError(f"Unsupported type: {data_type}")
        spark_type = type_map[data_type]

        is_nullable = 'true' in flags.lower()

        inferred_schema_fields.append(StructField(col_name, spark_type, is_nullable))

    return inferred_schema_fields



In [11]:
#converts into a copy-paste format for a json structure
def pretty_print_inferred_schema(inferred_schema_fields):
    """Render parsed fields as copy-pasteable ``StructType([...])`` source text.

    Args:
        inferred_schema_fields: Iterable of objects exposing ``name``,
            ``dataType`` and ``nullable`` attributes, such as the list
            returned by ``inferred_schema_formatter``.

    Returns:
        str: A ``StructType([`` line, one indented ``StructField(...)`` line
        per field (each ending with a comma), and a closing ``])`` line,
        joined with newlines.
    """
    schema_output = ['StructType([']

    for field in inferred_schema_fields:
        # !r quotes and escapes the name, so a column containing a quote or
        # backslash still yields a valid Python string literal. Ordinary
        # names render exactly as before ('name').
        schema_output.append(
            f"    StructField({field.name!r}, {field.dataType}, {field.nullable}),"
        )

    schema_output.append("])")

    return '\n'.join(schema_output)

In [9]:
#main function to house everything together
def main(inferred_schema_text):
    """Print copy-pasteable ``StructType`` source for a ``printSchema()`` dump.

    Builds the type-name lookup, parses the text with
    ``inferred_schema_formatter``, renders it with
    ``pretty_print_inferred_schema`` and prints the result.

    Args:
        inferred_schema_text: Multi-line ``printSchema()`` output.

    Returns:
        None. The rendered schema is written to stdout.

    Raises:
        ValueError: Propagated from ``inferred_schema_formatter`` when a
            field's type name is not in the lookup below.
    """
    # Keys are the lower-case type names as they appear in printSchema()
    # text; the short SQL-style aliases (int, bigint, ...) are accepted too.
    # "long" matters most here: it is what Spark's JSON schema inference
    # reports for whole-number columns.
    type_map = {
        "string": StringType(),
        "int": IntegerType(),
        "integer": IntegerType(),
        "long": LongType(),
        "bigint": LongType(),
        "short": ShortType(),
        "smallint": ShortType(),
        "byte": ByteType(),
        "tinyint": ByteType(),
        "float": FloatType(),
        "double": DoubleType(),
        "boolean": BooleanType(),
        "binary": BinaryType(),
        "date": DateType(),
        "timestamp": TimestampType()
    }

    inferred_schema_fields = inferred_schema_formatter(inferred_schema_text, type_map)

    schema_str = pretty_print_inferred_schema(inferred_schema_fields)

    print(schema_str)

In [10]:
#example output: prints the StructType source generated from the sample schema text

main(inferred_schema_text)

StructType([
    StructField('area', StringType(), True),
    StructField('area_name', StringType(), True),
    StructField('crm_cd', StringType(), True),
    StructField('crm_cd_1', StringType(), True),
    StructField('crm_cd_2', StringType(), True),
    StructField('crm_cd_3', StringType(), True),
    StructField('crm_cd_4', StringType(), True),
    StructField('crm_cd_desc', StringType(), True),
    StructField('cross_street', StringType(), True),
    StructField('date_occ', StringType(), True),
    StructField('date_rptd', StringType(), True),
    StructField('dr_no', StringType(), True),
    StructField('lat', StringType(), True),
    StructField('location', StringType(), True),
    StructField('lon', StringType(), True),
    StructField('mocodes', StringType(), True),
    StructField('part_1_2', StringType(), True),
    StructField('premis_cd', StringType(), True),
    StructField('premis_desc', StringType(), True),
    StructField('rpt_dist_no', StringType(), True),
    StructF