In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Initialise the SparkSession.
# spark.sql.files.maxPartitionBytes caps how many bytes are packed into a single
# input partition when reading files, which controls read parallelism. Without a
# cap, Spark could read very large chunks per task and risk OutOfMemory (OOM)
# errors. Note: 128m is also Spark's default for this setting.
spark = (
    SparkSession.builder
    .appName("NYC_Taxi_Analytics")
    .config("spark.sql.files.maxPartitionBytes", "128m")
    .getOrCreate()
)
# This cell originally bound the session to `Spark`, while the read cell uses the
# lowercase `spark`. Keep the old name as an alias so both spellings resolve to
# the same session, on Databricks and on a plain PySpark install alike.
Spark = spark




In [0]:
# Define the schema explicitly.
# With an explicit schema Spark does not need an extra pass over the data to
# infer column types, which improves read performance.
#
# IMPORTANT: for CSV, Spark applies a user-supplied schema BY POSITION, not by
# name, so the fields below must follow the column order of the file itself.
# The 2019 yellow-taxi CSV puts RatecodeID and store_and_fwd_flag BEFORE
# PULocationID / DOLocationID. With the pickup/drop-off IDs listed first, the
# "N"/"Y" flag would land in an IntegerType column and the rows would be treated
# as malformed.
taxi_schema = StructType([
    StructField("VendorID", IntegerType(), True),
    StructField("tpep_pickup_datetime", TimestampType(), True),
    StructField("tpep_dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", DoubleType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("rateCodeId", IntegerType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("PULocationID", IntegerType(), True),
    StructField("DOLocationID", IntegerType(), True),
    StructField("payment_type", IntegerType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("extra", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True),
    StructField("improvement_surcharge", DoubleType(), True),
    StructField("total_amount", DoubleType(), True),
    StructField("congestion_surcharge", DoubleType(), True)
])


In [0]:
# Read the January 2019 yellow-taxi trips using the explicit schema.
#  - header=true: the TLC CSV export starts with a line of column names; without
#    this option that line is treated as a data record and can end up in the count.
#  - mode=DROPMALFORMED: records that cannot be parsed into the schema are dropped
#    instead of failing the job.
taxi_df = (
    spark.read
    .schema(taxi_schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2019-01.csv.gz")
)
# count() is an action: it triggers the actual read of the file.
print(f"Initial Count : {taxi_df.count()}")

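# For df.count(), Spark does not pull all of the data to the driver (doing so could exhaust driver memory). Instead, it performs a partial aggregation:
# the input is divided into partitions (we set maxPartitionBytes to 128m above), and
# the driver instructs each executor task to count the rows in its own slice.
# After counting, executors do not send the rows back to the driver; each task sends back only the integer row count for its slice, and the partial counts are summed.
# Note: this input is a single gzip-compressed file (.csv.gz). Gzip is not a splittable format, so the file is read by one task regardless of maxPartitionBytes.
# When reading CSV, Spark must read all of the data to find the newline characters (\n) in order to count rows.
# With a Parquet file, Spark can skip the data and read the row counts from the metadata in the file footer, so the count completes in milliseconds.
# Because we are using "DROPMALFORMED" mode, Spark must physically scan the data to see whether any rows are malformed (corrupt).
# (Since Spark 2.4, CSV column pruning means only the columns a query actually requests are parsed, so a bare count() may not drop every malformed row.)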