In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


In [3]:
spark = SparkSession.builder.appName('FirstApp').getOrCreate()

In [13]:
employee_df = spark.read.format('csv')\
                .option('header', 'true')\
                .load('emp_id.csv')
employee_df.count()
employee_df.show()

+---+-------+---+----------+------+
| id|   Name|age|Experience|Salary|
+---+-------+---+----------+------+
|  1|shambhu| 33|        10| 20000|
|  2| mahesh| 34|         8| 35000|
|  3| natraj| 35|         5| 25000|
|  4| kishor| 28|         3| 30000|
|  5| vishal| 29|         5| 15000|
|  6|vaibhav| 27|         2| 18000|
|  7|vaibhav| 34|         2| 27000|
|  8|vaibhav| 28|         2| 30000|
|  9|vaibhav| 35|         2| 29000|
+---+-------+---+----------+------+



In [5]:
print(employee_df.rdd.getNumPartitions())

1


In [7]:
employee_df = employee_df.repartition(2) 
print(employee_df.rdd.getNumPartitions())

2


In [14]:
employee_df=employee_df.filter(col("Salary")>1000)\
                       .select('id','Name','age','Salary')\
                       .groupby('age').count()
employee_df.collect()

[Row(age='29', count=1),
 Row(age='34', count=2),
 Row(age='28', count=2),
 Row(age='35', count=2),
 Row(age='27', count=1),
 Row(age='33', count=1)]

In [15]:
employee_df.show()

+---+-----+
|age|count|
+---+-----+
| 29|    1|
| 34|    2|
| 28|    2|
| 35|    2|
| 27|    1|
| 33|    1|
+---+-----+



# Repartition vs coalesce

In [10]:
flight_df = spark.read.format('csv')\
                .option('header', 'true')\
                .option('inferSchema', 'true')\
                .load('2010-summary.csv')
flight_df.count()

255

In [11]:
flight_df.rdd.getNumPartitions()

1

In [12]:
partition_flight_df = flight_df.repartition(4)

In [13]:
partition_flight_df.rdd.getNumPartitions()

4

In [14]:
partition_flight_df.withColumn("partitionId", spark_partition_id()).groupBy("partitionId").count().show()
# evenly distributed data

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   63|
|          1|   64|
|          2|   64|
|          3|   64|
+-----------+-----+



In [15]:
partitioned_on_column = flight_df.repartition(300, "ORIGIN_COUNTRY_NAME")

In [16]:
partitioned_on_column.rdd.getNumPartitions()

300

In [17]:
partitioned_on_column.withColumn("partitionId", spark_partition_id()).groupby("partitionId").count().show(300)

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|    1|
|          2|    1|
|          7|    1|
|         10|    1|
|         13|    1|
|         15|    2|
|         16|    2|
|         19|    1|
|         21|    1|
|         22|    1|
|         28|    1|
|         31|    1|
|         39|    1|
|         42|    1|
|         43|    1|
|         44|    1|
|         45|    2|
|         48|    1|
|         53|    1|
|         54|    1|
|         55|    1|
|         65|    1|
|         70|    1|
|         73|    1|
|         75|    1|
|         76|    2|
|         81|    1|
|         84|    2|
|         86|    1|
|         87|    1|
|         90|    1|
|         91|    1|
|         97|    2|
|        100|    1|
|        103|    2|
|        104|    1|
|        108|    1|
|        112|    2|
|        115|    1|
|        117|    2|
|        126|    1|
|        127|    2|
|        129|    1|
|        130|    2|
|        132|    1|
|        133|    1|
|        138|    1|


In [18]:
coalesce_flight_df = flight_df.repartition(8)

In [19]:
coalesce_flight_df.withColumn("partitionId", spark_partition_id()).groupby("partitionId").count().show()

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   32|
|          1|   31|
|          2|   32|
|          3|   32|
|          4|   32|
|          5|   32|
|          6|   32|
|          7|   32|
+-----------+-----+



In [20]:
three_coalesce_df = coalesce_flight_df.coalesce(3)

In [21]:
three_coalesce_df.withColumn("partitionId", spark_partition_id()).groupby("partitionId").count().show()

# uneven distribution

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   64|
|          1|   95|
|          2|   96|
+-----------+-----+



In [22]:
three_repartition_df = flight_df.repartition(3)
three_repartition_df.withColumn("partitionId", spark_partition_id()).groupby("partitionId").count().show()


+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   85|
|          1|   85|
|          2|   85|
+-----------+-----+



In [24]:
# In coalesce we can decrease partition but not increase
coalesce_flight_df.repartition(8).rdd.getNumPartitions()

8

In [25]:
three_repartition_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(3), REPARTITION_BY_NUM, [plan_id=662]
   +- FileScan csv [DEST_COUNTRY_NAME#140,ORIGIN_COUNTRY_NAME#141,count#142] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/KISHOR/Documents/Python/pyspark/2010-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [29]:
l = [1,4,6,7,6,8,1,14,18]
for i in range (0, len(l)):
    if (l[i] + l[1+i]) == 10:
        print('hi')

hi


IndexError: list index out of range

In [4]:
df = spark.createDataFrame([6,7,3,8,2,9],"int").toDF("number")
df.show()

+------+
|number|
+------+
|     6|
|     7|
|     3|
|     8|
|     2|
|     9|
+------+



# PERMISSIVE, FAILFAST, DROPMALFORMED

In [6]:
schema = StructType([
        StructField("TYPE", StringType()),
        StructField("COUNTRY", StringType()),
        StructField("CITY", StringType()),
        StructField("ENGINES", IntegerType()),
        StructField("FIRST_FLIGHT", StringType()),
        StructField("NUMBER_BUILT", IntegerType())
    ])

In [None]:
df1 = spark.read.csv('employee_data.csv', header = True, inferSchema = True, mode = 'PERMISSIVE')
df1.show()

In [7]:
read_df = spark.read.format('csv') \
        .option("header", "true") \
        .option("mode", "PERMISSIVE") \
        .schema(schema) \
        .load("airplanes.csv")
read_df.show()

+-----------+-------+--------+-------+------------+------------+
|       TYPE|COUNTRY|    CITY|ENGINES|FIRST_FLIGHT|NUMBER_BUILT|
+-----------+-------+--------+-------+------------+------------+
|Airbus A220| Canada| Calgary|      2|  02-03-2013|         179|
|Airbus A220| Canada| Calgary|   NULL|  02-03-2013|         179|
|Airbus A220| Canada| Calgary|      2|  02-03-2013|         179|
|Airbus A320| France|    Lyon|   NULL|  10-06-1986|       10066|
|Airbus A330| France|    Lyon|   NULL|  02-01-1992|        1521|
| Boeing 737|    USA|New York|   NULL|  03-08-1967|       10636|
| Boeing 737|    USA|New York|   NULL|  03-08-1967|       10636|
| Boeing 737|    USA|New York|      2|  03-08-1967|       10636|
|Airbus A220| Canada| Calgary|      2|  02-03-2013|         179|
+-----------+-------+--------+-------+------------+------------+



In [8]:
read_df1 = spark.read.format('csv') \
        .option("header", "true") \
        .option("mode", "DROPMALFORMED") \
        .schema(schema) \
        .load("airplanes.csv")
read_df1.show()

+-----------+-------+--------+-------+------------+------------+
|       TYPE|COUNTRY|    CITY|ENGINES|FIRST_FLIGHT|NUMBER_BUILT|
+-----------+-------+--------+-------+------------+------------+
|Airbus A220| Canada| Calgary|      2|  02-03-2013|         179|
|Airbus A220| Canada| Calgary|      2|  02-03-2013|         179|
| Boeing 737|    USA|New York|      2|  03-08-1967|       10636|
|Airbus A220| Canada| Calgary|      2|  02-03-2013|         179|
+-----------+-------+--------+-------+------------+------------+



In [9]:
read_df2 = spark.read.format('csv') \
        .option("header", "true") \
        .option("mode", "FAILFAST") \
        .schema(schema) \
        .load("airplanes.csv")
read_df2.show()

Py4JJavaError: An error occurred while calling o79.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 5.0 failed 1 times, most recent failure: Lost task 0.0 in stage 5.0 (TID 10) (LAPTOP-3V2ROQ70 executor driver): org.apache.spark.SparkException: [MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION] Malformed records are detected in record parsing: [Airbus A220,Canada,Calgary,null,02-03-2013,179].
Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'. 
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInRecordParsingError(QueryExecutionErrors.scala:1610)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:79)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:456)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.NumberFormatException: For input string: "two"
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:365)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:307)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:452)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 26 more
Caused by: java.lang.NumberFormatException: For input string: "two"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:68)
	at java.base/java.lang.Integer.parseInt(Integer.java:652)
	at java.base/java.lang.Integer.parseInt(Integer.java:770)
	at scala.collection.immutable.StringLike.toInt(StringLike.scala:310)
	at scala.collection.immutable.StringLike.toInt$(StringLike.scala:310)
	at scala.collection.immutable.StringOps.toInt(StringOps.scala:33)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6$adapted(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.nullSafeDatum(UnivocityParser.scala:291)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$5(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:346)
	... 29 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3537)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:64)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: org.apache.spark.SparkException: [MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION] Malformed records are detected in record parsing: [Airbus A220,Canada,Calgary,null,02-03-2013,179].
Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'. 
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInRecordParsingError(QueryExecutionErrors.scala:1610)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:79)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:456)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	... 1 more
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.NumberFormatException: For input string: "two"
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:365)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:307)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:452)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 26 more
Caused by: java.lang.NumberFormatException: For input string: "two"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:68)
	at java.base/java.lang.Integer.parseInt(Integer.java:652)
	at java.base/java.lang.Integer.parseInt(Integer.java:770)
	at scala.collection.immutable.StringLike.toInt(StringLike.scala:310)
	at scala.collection.immutable.StringLike.toInt$(StringLike.scala:310)
	at scala.collection.immutable.StringOps.toInt(StringOps.scala:33)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6$adapted(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.nullSafeDatum(UnivocityParser.scala:291)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$5(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:346)
	... 29 more


In [14]:
myschema = StructType([
        StructField("id", IntegerType()),
        StructField("name", StringType()),
        StructField("age", StringType()),
        StructField("salary", StringType()),
        StructField("address", StringType()),
        StructField("nominee", StringType()),
        StructField("bad_record", StringType())
])

read_df = spark.read.format('csv') \
        .option("header", "true") \
        .option('inferschema', True)\
        .option("mode", "DROPMALFORMED") \
        .schema(myschema)\
        .load("employee_data.csv")
read_df.show()

+---+--------+---+------+------------+--------+----------+
| id|    name|age|salary|     address| nominee|bad_record|
+---+--------+---+------+------------+--------+----------+
|  1|  Manish| 26| 75000|       bihar|nominee1|      NULL|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|      NULL|
|  3|  Pritam| 22|150000|   Bangalore|   India|  nominee3|
|  4|Prantosh| 17|200000|     Kolkata|   India|  nominee4|
|  5|  Vikash| 31|300000|        NULL|nominee5|      NULL|
+---+--------+---+------+------------+--------+----------+



In [16]:
spark.stop()