# Instancing Spark

In [1]:
# Import the PySpark library and the `SparkSession` class

import pyspark
from pyspark.sql import SparkSession

In [2]:
# Instance a session

spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

24/03/02 13:32:55 WARN Utils: Your hostname, Desktop-Gar resolves to a loopback address: 127.0.1.1; using 172.25.243.204 instead (on interface eth0)
24/03/02 13:32:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/02 13:32:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read data

In [None]:
# Download some large data

# !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-01.csv.gz

In [3]:
# Read the data. Note that the datatypes are not inferred; everything is a string.

df = spark.read \
    .option("header", "true") \
    .csv('data/csv/green_tripdata_2020-01.csv.gz')

                                                                                

In [4]:
df.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2019-12-18 15:52:30|  2019-12-18 15:54:39|                 N|         1|         264|         264|              5|          .00|        3.5|  0.5|    0.

In [5]:
df.head(5)

[Row(VendorID='2', lpep_pickup_datetime='2019-12-18 15:52:30', lpep_dropoff_datetime='2019-12-18 15:54:39', store_and_fwd_flag='N', RatecodeID='1', PULocationID='264', DOLocationID='264', passenger_count='5', trip_distance='.00', fare_amount='3.5', extra='0.5', mta_tax='0.5', tip_amount='0.01', tolls_amount='0', ehail_fee=None, improvement_surcharge='0.3', total_amount='4.81', payment_type='1', trip_type='1', congestion_surcharge='0'),
 Row(VendorID='2', lpep_pickup_datetime='2020-01-01 00:45:58', lpep_dropoff_datetime='2020-01-01 00:56:39', store_and_fwd_flag='N', RatecodeID='5', PULocationID='66', DOLocationID='65', passenger_count='2', trip_distance='1.28', fare_amount='20', extra='0', mta_tax='0', tip_amount='4.06', tolls_amount='0', ehail_fee=None, improvement_surcharge='0.3', total_amount='24.36', payment_type='1', trip_type='2', congestion_surcharge='0'),
 Row(VendorID='2', lpep_pickup_datetime='2020-01-01 00:41:38', lpep_dropoff_datetime='2020-01-01 00:52:49', store_and_fwd_fla

In [6]:
# Read the schema

df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- trip_type: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)



# Creating a schema with Pandas

In [7]:
from pyspark.sql import types

In [8]:
green_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("lpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("lpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("ehail_fee", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("trip_type", types.IntegerType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

In [9]:
# With this new schema, we can now create a dataframe with inferred datatypes.

df = spark.read \
    .option("header", "true") \
    .schema(green_schema) \
    .csv('data/csv/green_tripdata_2020-01.csv.gz')

In [10]:
df.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2019-12-18 15:52:30|  2019-12-18 15:54:39|                 N|         1|         264|         264|              5|          0.0|        3.5|  0.5|    0.

In [11]:
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



# Partitions

In [12]:
# We will now create 24 partitions in our dataframe

df = df.repartition(24)

In [13]:
# Let's parquetize the dataframe. This will create 24 smaller parquet files.
# This operation may take a while.

df.write.csv('data/raw/green/2020/01/', mode='overwrite')

                                                                                

Check the created files

In [14]:
!ls -lh data/raw/green/2020/01/

total 51M
-rwxrwxrwx 1 root root    0 Mar  2 13:35 _SUCCESS
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00000-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00001-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00002-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00003-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00004-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00005-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00006-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00007-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-00008-4de89aaa-a710-43f9-b28d-fb5a2dc538fe-c000.csv
-rwxrwxrwx 1 root root 2.1M Mar  2 13:34 part-0

# Spark Dataframes

In [15]:
# # With this new schema, we can now create a dataframe with inferred datatypes.

spark.read.csv('data/raw/green/2020/01/')

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string]

In [16]:
# # With this new schema, we can now create a dataframe with inferred datatypes.

df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



# Functions and UDFs

In [17]:
from pyspark.sql import functions as F

In [18]:
df.show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2020-01-27 21:32:52|  2020-01-27 21:43:54|                 N|         1|          74|         213|              1|         5.93|       18.0|  0.5|    0.

                                                                                

In [19]:
# # With this new schema, we can now create a dataframe with inferred datatypes.

df \
    .withColumn('pickup_date', F.to_date(df.lpep_pickup_datetime)) \
    .withColumn('dropoff_date', F.to_date(df.lpep_dropoff_datetime)) \
    .select('pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID') \
    .show()

[Stage 11:>                                                         (0 + 1) / 1]

+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------+------------+------------+------------+
| 2020-01-12|  2020-01-12|         218|          95|
| 2020-01-08|  2020-01-08|          41|         185|
| 2020-01-26|  2020-01-26|          74|         166|
| 2020-01-24|  2020-01-24|           7|           7|
| 2020-01-04|  2020-01-04|         260|         173|
| 2020-01-06|  2020-01-06|         242|         254|
| 2020-01-26|  2020-01-26|          76|          61|
| 2020-01-16|  2020-01-16|         159|         212|
| 2020-01-27|  2020-01-27|          82|         258|
| 2020-01-15|  2020-01-15|          37|          35|
| 2020-01-17|  2020-01-17|          95|         216|
| 2020-01-17|  2020-01-17|         198|         124|
| 2020-01-07|  2020-01-07|          65|          52|
| 2020-01-12|  2020-01-12|          69|          75|
| 2020-01-23|  2020-01-23|          25|          17|
| 2020-01-10|  2020-01-10|         265|       

                                                                                

In [20]:
# # With this new schema, we can now create a dataframe with inferred datatypes.

df \
    .withColumn('pickup_date', F.to_date(df.lpep_pickup_datetime)) \
    .withColumn('dropoff_date', F.to_date(df.lpep_dropoff_datetime)) \
    .select('VendorID', 'pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID') \
    .show()

[Stage 14:>                                                         (0 + 1) / 1]

+--------+-----------+------------+------------+------------+
|VendorID|pickup_date|dropoff_date|PULocationID|DOLocationID|
+--------+-----------+------------+------------+------------+
|       2| 2020-01-04|  2020-01-04|          41|          41|
|    null| 2020-01-24|  2020-01-24|          72|          35|
|       2| 2020-01-03|  2020-01-03|          75|         238|
|       2| 2020-01-14|  2020-01-14|         145|         145|
|       2| 2020-01-08|  2020-01-08|          74|         151|
|       2| 2020-01-08|  2020-01-08|          95|         198|
|       2| 2020-01-06|  2020-01-06|           7|         223|
|       2| 2020-01-02|  2020-01-02|          82|         260|
|       2| 2020-01-06|  2020-01-06|          33|          52|
|       2| 2020-01-05|  2020-01-05|          82|          82|
|       1| 2020-01-18|  2020-01-18|          82|          95|
|       2| 2020-01-16|  2020-01-16|          75|         238|
|       2| 2020-01-01|  2020-01-01|         189|          89|
|       

                                                                                