In [0]:
# Load the flight CSV: column names come from the first line, column types are
# inferred from the data, and parsing is requested in 'failfast' mode.
flight_df = (
    spark.read.format('csv')
    .option('inferschema', 'true')
    .option('header', 'true')
    .option('mode', 'failfast')
    .load('/FileStore/tables/flight_data.csv')
)

In [0]:
# Preview the first ten rows of the flight data.
display(
    flight_df.limit(10)
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Hand-written schema for the flight CSV: two string columns and an integer
# count, every field nullable.
my_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('DEST_COUNTRY_NAME', StringType()),
        ('ORIGIN_COUNTRY_NAME', StringType()),
        ('count', IntegerType()),
    )
])

In [0]:
# Read the flight CSV again, this time applying the hand-written `my_schema`
# instead of inferring types.
#
# header='true' makes Spark consume the first line as the header so it never
# reaches the parser as a data row. This replaces header='false' + skipRows:
# `skipRows` is not among the CSV options open-source Spark documents, so
# outside Databricks the header line would be parsed as a record.
# No inferSchema option is needed because an explicit schema is supplied.
flight_df_schema = (
    spark.read.format('csv')
    .option('header', 'true')
    .schema(my_schema)
    .option('mode', 'permissive')
    .load('/FileStore/tables/flight_data.csv')
)

# Preview the first ten rows read with the enforced schema.
display(flight_df_schema.limit(10))

In [0]:
# Read the employee CSV in 'permissive' mode (header row for names, inferred
# column types) and print the leading rows.
employee_df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('mode', 'permissive')
    .load("/FileStore/tables/employee_file.csv")
)

employee_df.show()

In [0]:
# Same employee CSV, read in 'dropmalformed' mode so that rows Spark cannot
# parse are left out of the result; print the leading rows for comparison.
employee_df_malformed = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('mode', 'dropmalformed')
    .load("/FileStore/tables/employee_file.csv")
)

employee_df_malformed.show()

In [0]:
#run in failfast mode
# FAILFAST raises as soon as Spark meets a record it cannot parse, so an error
# is the expected outcome if the file contains malformed rows. The error is
# caught and printed so the demonstration stays visible without halting a
# "Run all" of the notebook.
try:
    employee_df_failfast = (
        spark.read.format('csv')
        .option('header', 'true')
        .option('inferSchema', 'true')
        .option('mode', 'failfast')
        .load("/FileStore/tables/employee_file.csv")
    )

    employee_df_failfast.show()
except Exception as failfast_error:
    # Print only the head of the message; the full JVM stack trace is very long.
    print(f"FAILFAST read aborted: {str(failfast_error)[:500]}")

In [0]:
#to print corrupted records
#first import the schema types

from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [0]:
# Explicit schema for the employee CSV. The trailing `_corrupt_record` string
# column is the slot Spark fills with the raw text of rows it cannot parse
# when reading in permissive mode.
emp_schema = StructType([
    StructField(name='id', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='salary', dataType=IntegerType(), nullable=True),
    StructField(name='address', dataType=StringType(), nullable=True),
    StructField(name='nominee', dataType=StringType(), nullable=True),
    StructField(name='_corrupt_record', dataType=StringType(), nullable=True),
])

In [0]:
# Permissive read with the explicit `emp_schema`, so malformed rows can be
# inspected through its `_corrupt_record` column.
employee_df_new = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('mode', 'permissive')
    .schema(emp_schema)
    .load("/FileStore/tables/employee_file.csv")
)

# truncate=False prints full cell contents instead of shortening long values.
employee_df_new.show(truncate=False)

In [0]:
# Read with `emp_schema` and a badRecordsPath instead of an explicit mode:
# the rejected records are looked up under /FileStore/tables/bad_records in
# the cells that follow.
employee_df_new = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .schema(emp_schema)
    .option('badRecordsPath', '/FileStore/tables/bad_records')
    .load("/FileStore/tables/employee_file.csv")
)

# truncate=False prints full cell contents instead of shortening long values.
employee_df_new.show(truncate=False)

In [0]:
# List what has been written under the badRecordsPath root. A path that pins
# one run's timestamp folder and part-file name only matches that single run,
# so list the root and read the run folders from the output.
display(dbutils.fs.ls('/FileStore/tables/bad_records/'))

In [0]:
# Rejected records land in <badRecordsPath>/<timestamp>/bad_records/part-*,
# so a hard-coded timestamp and part-file name only matches one past run.
# Resolve the newest run folder instead: the folder names are timestamps of
# the form yyyyMMddTHHmmss, so the lexicographic maximum is the latest run.
bad_records_root = '/FileStore/tables/bad_records'
latest_run_path = max(entry.path for entry in dbutils.fs.ls(bad_records_root))

# The rejected records are stored as JSON; load every part file of that run.
bad_data_df = spark.read.format('json').load(latest_run_path.rstrip('/') + '/bad_records/')
bad_data_df.show(truncate=False)

Transformations

In [0]:
# Fresh permissive read of the employee CSV (header row, inferred types); the
# select/expr/SQL examples below all work on this `employee_df`.
employee_df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('mode', 'permissive')
    .load("/FileStore/tables/employee_file.csv")
)

employee_df.show()

In [0]:
# how to use col
from pyspark.sql.functions import col

# col('id') refers to the column by name; adding 5 builds a derived column
# that select() evaluates for every row.
employee_df.select(
    col('id') + 5
).show()

In [0]:
# One select() call mixing four ways of referring to a column.
employee_df.select(
    'id',                    # plain column-name string
    col('name'),             # col() function
    employee_df["salary"],   # bracket lookup on the DataFrame
    employee_df.address,     # attribute access on the DataFrame
).show()

Expressions

In [0]:
from pyspark.sql.functions import expr

# expr() takes a SQL expression written as a string and turns it into a column.
employee_df.select(
    expr("id+5")
).show()

In [0]:
# Same expression as above, with the resulting column renamed to new_id.
display(
    employee_df.select(
        expr("id+5").alias('new_id')
    )
)

In [0]:
# SQL snippets inside expr(): rename two columns with `as` and join name and
# address into one string with concat().
display(
    employee_df.select(
        expr("id as employee_id"),
        expr("name as employee_name"),
        expr("concat(name,address)"),
    )
)

Spark SQL

In [0]:
# Register the DataFrame as the temp view `employee_table` (replacing any
# existing view of that name) so it can be queried with SQL.
employee_df.createOrReplaceTempView(
    'employee_table'
)

In [0]:
# Query the temp view with Spark SQL and render the result.
display(
    spark.sql("""select * from employee_table""")
)