In [0]:
spark

In [0]:
flight_df=spark.read.format("CSV")\
                  .option("header","false")\
                  .option("inferschema","false")\
                  .option("mode","FAILFAST")\
                  .load("/FileStore/tables/flight_data.csv")
flight_df.show(5)

+-----------------+-------------------+-----+
|              _c0|                _c1|  _c2|
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
flight_df_withHeader=spark.read.format("CSV")\
                  .option("header","true")\
                  .option("inferschema","false")\
                  .option("mode","FAILFAST")\
                  .load("/FileStore/tables/flight_data.csv")
flight_df_withHeader.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
flight_df_withHeader.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)



In [0]:
flight_df_withSchema=spark.read.format("CSV")\
                  .option("header","true")\
                  .option("inferschema","true")\
                  .option("mode","FAILFAST")\
                  .load("/FileStore/tables/flight_data.csv")
flight_df_withSchema.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
flight_df_withSchema.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
my_schema = StructType([
                        StructField("DEST_COUNTRY_NAME",StringType(),True),
                        StructField("ORIGIN_COUNTRY_NAME",StringType(),True),
                        StructField("cpunt",IntegerType(),True),
                        ])

In [0]:
flight_df=spark.read.format("CSV")\
                  .option("header","false")\
                  .option("skipRows",1)\
                  .option("inferschema","false")\
                  .schema(my_schema)\
                  .option("mode","PERMISSIVE")\
                  .load("/FileStore/tables/flight_data.csv")
flight_df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|cpunt|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



# skipRows: Let us skip rows starting from first record
# option("mode","PERMISSIVE"): Had to change from FAILFAST to PERMISSIVE because inferschema is false and in my_schema we have count as integer type