# Spark DataFrames

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

In [2]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

24/09/25 18:15:42 WARN Utils: Your hostname, DESKTOP-CSJ1S7Q resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/25 18:15:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/09/25 18:15:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/25 18:15:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [19]:
# Read the parquet files under the January 2021 FHVHV directory.
df = (
    spark.read
    .parquet("fhvhv/2021/01/")
)

# Print the column names and types Spark picked up from the parquet files.
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nul

## Actions and Transformations

In [21]:
# Filter on the license number first and project afterwards: the predicate
# uses hvfhs_license_num, which is not one of the columns kept by select().
df.filter(df.hvfhs_license_num == "HV0003") \
    .select("pickup_datetime", "dropoff_datetime", "PULocationID", "DOLocationID") \
    .show()

+-------------------+-------------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-------------------+-------------------+------------+------------+
|2021-01-11 19:40:22|2021-01-11 20:15:49|         262|         231|
|2021-01-05 16:13:22|2021-01-05 16:27:50|          61|         181|
|2021-01-31 19:42:09|2021-01-31 19:59:52|         232|           4|
|2021-01-27 23:24:36|2021-01-27 23:26:43|          68|          68|
|2021-01-30 09:35:46|2021-01-30 09:39:42|         256|         255|
|2021-01-16 03:25:35|2021-01-16 03:34:21|          89|          91|
|2021-01-11 12:58:23|2021-01-11 13:14:19|          97|          61|
|2021-01-03 08:44:58|2021-01-03 09:04:45|          26|         178|
|2021-01-14 19:52:00|2021-01-14 20:19:00|         181|         198|
|2021-01-08 21:35:35|2021-01-08 22:06:33|          76|          91|
|2021-01-15 14:49:48|2021-01-15 15:35:23|         246|          16|
|2021-01-27 11:37:56|2021-01-27 11:53:35|       

Transformations: Are not executed immediately but only when needed -> lazy
* filter
* select
* join
* ...

Actions: Make the computations happen -> eager
* show, take, head
* write

## User defined functions

In [22]:
from pyspark.sql import functions as F

In [29]:
def stuff(base_num):
    """Build a tagged identifier string from a base number such as "B02764".

    The first character is dropped and the remainder is parsed as an integer.
    The result is "s/<num>:x" when that integer is divisible by 7 and
    "e/<num>:x" otherwise.

    Args:
        base_num: String whose characters after the first form an integer,
            or None.

    Returns:
        The tagged string, or None when base_num is None.

    Raises:
        ValueError: If the characters after the first are not a valid integer.
    """
    # The source column is nullable; pass missing values through instead of
    # failing on None[1:].
    if base_num is None:
        return None
    num = int(base_num[1:])
    if num % 7 == 0:
        return f"s/{num}:x"
    return f"e/{num}:x"

In [31]:
# Wrap the plain Python function as a Spark UDF that returns strings.
stuff_udf = F.udf(
    stuff,
    returnType=types.StringType(),
)

In [33]:
# Derive date-only columns and the UDF-based base_id, then preview five rows.
(
    df
    .withColumn("pickup_date", F.to_date(df["pickup_datetime"]))
    .withColumn("dropoff_date", F.to_date(df["dropoff_datetime"]))
    .withColumn("base_id", stuff_udf(df["dispatching_base_num"]))
    .select("pickup_date", "dropoff_date", "PULocationID", "DOLocationID", "base_id")
    .show(5)
)

+-----------+------------+------------+------------+--------+
|pickup_date|dropoff_date|PULocationID|DOLocationID| base_id|
+-----------+------------+------------+------------+--------+
| 2021-01-11|  2021-01-11|         262|         231|e/2764:x|
| 2021-01-05|  2021-01-05|          61|         181|e/2617:x|
| 2021-01-02|  2021-01-02|         100|           1|e/2510:x|
| 2021-01-31|  2021-01-31|         232|           4|e/2882:x|
| 2021-01-05|  2021-01-05|         162|           1|s/2800:x|
+-----------+------------+------------+------------+--------+
only showing top 5 rows



                                                                                