In [27]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types

In [4]:
import os

# First Look at Spark

In [3]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("test") \
    .getOrCreate()

23/06/15 15:29:07 WARN Utils: Your hostname, jose-MacBookPro resolves to a loopback address: 127.0.1.1; using 192.168.1.224 instead (on interface wlp2s0)
23/06/15 15:29:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/15 15:29:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Using pandas to derive a schema quickly

If you're dealing with a large csv, you may want to start with a chunk of it first in pandas.

Here's a bash/linux terminal command to cut things down to size, the exclamation mark should be used if you want to use the command in the context of the jupyter notebook:

```
!head -n 1001 file_name.csv > just_head.csv
```

Create a pandas dataframe with the reduced dataset.

After this, you can use the method below to create a spark dataframe from a pandas dataframe:

```
spark.createDataFrame(pandas_df)
```


### Defining a schema manually
```python
from pyspark.sql import types

# the last argument in the StructField type corresponds to whether the field is nullable
schema = types.StructType([
    types.StructField('id', types.IntegerType(), True),
    types.StructField('name', types.StringType(), True),
    types.StructField('date', types.TimestampType(), True)
])

df = spark.read \
    .option("header", "true")
    .schema(schema)
    .csv('some_csv_file.csv', 
```

In [7]:
df = spark.read.parquet("data/fhvhv_tripdata_2023-01.parquet")

                                                                                

In [8]:
df.show(5)



+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

                                                                                

In [33]:
# this will usually just be a bunch of strings
# when you're using a csv, spark doesn't infer
# the schema the way pandas does. With parquet
# however, the schema *is* read
# you can also set "inferSchema" option to "true"

df.head()

Row(hvfhs_license_num='HV0003', dispatching_base_num='B03404', originating_base_num='B03404', request_datetime=datetime.datetime(2023, 1, 1, 1, 18, 6), on_scene_datetime=datetime.datetime(2023, 1, 1, 1, 19, 24), pickup_datetime=datetime.datetime(2023, 1, 1, 1, 19, 38), dropoff_datetime=datetime.datetime(2023, 1, 1, 1, 48, 7), PULocationID=48, DOLocationID=68, trip_miles=0.94, trip_time=1709, base_passenger_fare=25.95, tolls=0.0, bcf=0.78, sales_tax=2.3, congestion_surcharge=2.75, airport_fee=0.0, tips=5.22, driver_pay=27.83, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N')

In [11]:
df.repartition(24)

DataFrame[hvfhs_license_num: string, dispatching_base_num: string, originating_base_num: string, request_datetime: timestamp, on_scene_datetime: timestamp, pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: bigint, DOLocationID: bigint, trip_miles: double, trip_time: bigint, base_passenger_fare: double, tolls: double, bcf: double, sales_tax: double, congestion_surcharge: double, airport_fee: double, tips: double, driver_pay: double, shared_request_flag: string, shared_match_flag: string, access_a_ride_flag: string, wav_request_flag: string, wav_match_flag: string]

In [12]:
df.write.parquet("data/fhv/2023/06") # mode=' overwrite'

                                                                                

# Working With DataFrames

In [13]:
df = spark.read.parquet("data/fhv/2023/06")

In [15]:
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nul

## Using SQL Functions

In [23]:
df.select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID') \
    .filter(F.col('hvfhs_license_num') == 'HV0003') \
    .withColumn('pu_date', F.to_date(F.col('pickup_datetime'))) \
    .show()

+-------------------+-------------------+------------+------------+----------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|   pu_date|
+-------------------+-------------------+------------+------------+----------+
|2023-01-01 01:19:38|2023-01-01 01:48:07|          48|          68|2023-01-01|
|2023-01-01 01:58:39|2023-01-01 02:33:08|         246|         163|2023-01-01|
|2023-01-01 01:20:27|2023-01-01 01:37:54|           9|         129|2023-01-01|
|2023-01-01 01:41:05|2023-01-01 01:48:16|         129|         129|2023-01-01|
|2023-01-01 01:52:47|2023-01-01 02:04:51|         129|          92|2023-01-01|
|2023-01-01 01:10:29|2023-01-01 01:18:22|          90|         231|2023-01-01|
|2023-01-01 01:22:10|2023-01-01 01:33:14|         125|         246|2023-01-01|
|2023-01-01 01:39:09|2023-01-01 02:03:50|          68|         231|2023-01-01|
|2023-01-01 01:14:35|2023-01-01 01:49:13|          79|          50|2023-01-01|
|2023-01-01 01:52:15|2023-01-01 02:31:11|         14

### Using UDF
UDF == user defined functions (not always the best to use unless you're pretty sure that they are well written and performant AND necessary)

In [24]:
def crazy_requirement(base_num):
    """
    {num:03x} - this zero pads the integer below to three digits
                while the x indicates that
    """
    num = int(base_num[1:])
    
    if num % 7 == 0:
        return f's/{num:03x}'
    elif num % 3 == 0:
        return f'a/{num:03x}'
    else:
        return f'e/{num:03x}'

In [25]:
crazy_requirement('B02884')

's/b44'

In [28]:
crazy_udf = F.udf(crazy_requirement, returnType=types.StringType())

In [32]:
df.select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID', 'dispatching_base_num') \
    .filter(F.col('hvfhs_license_num') == 'HV0003') \
    .withColumn('pu_date', F.to_date(F.col('pickup_datetime'))) \
    .withColumn('crazy_col', crazy_udf(F.col('dispatching_base_num'))) \
    .show()

[Stage 12:>                                                         (0 + 1) / 1]

+-------------------+-------------------+------------+------------+--------------------+----------+---------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|dispatching_base_num|   pu_date|crazy_col|
+-------------------+-------------------+------------+------------+--------------------+----------+---------+
|2023-01-01 01:19:38|2023-01-01 01:48:07|          48|          68|              B03404|2023-01-01|    e/d4c|
|2023-01-01 01:58:39|2023-01-01 02:33:08|         246|         163|              B03404|2023-01-01|    e/d4c|
|2023-01-01 01:20:27|2023-01-01 01:37:54|           9|         129|              B03404|2023-01-01|    e/d4c|
|2023-01-01 01:41:05|2023-01-01 01:48:16|         129|         129|              B03404|2023-01-01|    e/d4c|
|2023-01-01 01:52:47|2023-01-01 02:04:51|         129|          92|              B03404|2023-01-01|    e/d4c|
|2023-01-01 01:10:29|2023-01-01 01:18:22|          90|         231|              B03404|2023-01-01|    e/d4c|
|2023-01-0

                                                                                