Some useful guides for running Spark in Colab:
*   https://levelup.gitconnected.com/launch-spark-on-google-colab-and-connect-to-sparkui-342cad19b304
*   https://medium.com/@TheITspace/running-pyspark-on-google-colab-2552435972b3
*   https://colab.research.google.com/drive/1fa2G3YuXx3Isqyby5kFETqmWotFwtqlH?usp=sharing#scrollTo=-JgkMmYgS0Za

To access the Spark web UI from outside Colab, we can use ngrok: https://ngrok.com/docs/getting-started/





In [8]:
%%capture
!pip install pyspark
!pip install findspark
!pip install pyngrok

In [9]:
import findspark

# Kept ahead of the pyspark import: findspark.init() is the step that makes
# pyspark importable when it is not already on sys.path.
findspark.init()

from pyspark.sql import SparkSession

# getOrCreate() reuses an existing session when the cell is re-run.
session_builder = SparkSession.builder.appName('myColabSpark')
spark = session_builder.getOrCreate()

In [10]:
import getpass

from pyngrok import conf, ngrok

# The authtoken is typed in at run time so it is never stored in the notebook.
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken")
ngrok_config = conf.get_default()
ngrok_config.auth_token = getpass.getpass()

# Open a TLS tunnel that forwards to the address of the Spark web UI.
spark_ui_url = spark.sparkContext.uiWebUrl
tunnel = ngrok.connect(addr=spark_ui_url, bind_tls=True)
print(f" * ngrok tunnel \"{tunnel.public_url}\" -> \"{spark_ui_url}\"")

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken
··········
 * ngrok tunnel "https://d557-34-138-57-80.ngrok-free.app" -> "http://4b5d4d978470:4040"


In [16]:
import os

from pyspark import SparkFiles

file_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet'

# Fetch the file through Spark; SparkFiles.get() then resolves the local copy
# by its file name.
spark.sparkContext.addFile(file_url)

# The local name is derived from the URL so the two cannot drift apart.
# `header` is a CSV reader option and means nothing for Parquet, so it is
# no longer passed.
df = spark.read.parquet(SparkFiles.get(os.path.basename(file_url)))

# Number of trips in the loaded month.
df.count()

3833771

# Question 1

In [17]:
# Version string of the running Spark session.
spark.version

'3.5.5'

# Question 2

In [14]:
dir_to_save_parquets = 'yellow_tripdata_2024-10_parquets'

# Four partitions are written out as four part files.
df = df.repartition(4)

# mode='overwrite' lets this cell be re-run: with the default mode the write
# raises an error once the output directory already exists.
df.write.parquet(dir_to_save_parquets, mode='overwrite')

In [59]:
import os

# Sizes are reported in units of 1024 ** 2 bytes.
BYTES_PER_MB = 1024 ** 2

files = os.listdir(dir_to_save_parquets)
print('Saved files:')
print('\n'.join(files))

# Keep only the data files; names that do not end in '.parquet'
# (for example '*.crc' and '_SUCCESS') are skipped.
parquet_files = [f for f in files if f.endswith('.parquet')]
parquet_file_sizes_in_bytes = [
    (f, os.path.getsize(os.path.join(dir_to_save_parquets, f)))
    for f in parquet_files
]
parquet_file_sizes_in_MB = [(f, size / BYTES_PER_MB) for f, size in parquet_file_sizes_in_bytes]

print('\nParquets sizes:')
for file, size in parquet_file_sizes_in_MB:
    print(f"File: {file}, Size: {size:.2f} MB")

# Fall back to 0 when no parquet file was found, so the average never
# divides by zero.
total_in_bytes = sum(size for _, size in parquet_file_sizes_in_bytes)
average_in_bytes = total_in_bytes / len(parquet_files) if parquet_files else 0
# The label says MB (not Mb, which reads as megabits), matching the per-file lines.
print('\nAvg size of parquet file (MB):', average_in_bytes / BYTES_PER_MB)

Saved files:
.part-00001-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet.crc
part-00001-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet
.part-00002-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet.crc
._SUCCESS.crc
_SUCCESS
part-00003-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet
part-00002-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet
part-00000-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet
.part-00000-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet.crc
.part-00003-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet.crc

Parquets sizes:
File: part-00001-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet, Size: 23.02 MB
File: part-00003-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet, Size: 23.05 MB
File: part-00002-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet, Size: 23.06 MB
File: part-00000-fe83042f-56bf-497a-a1ff-7aed32de19bf-c000.snappy.parquet, Size: 23.04 MB

Avg size of parquet fi

# Question 3

In [107]:
# Print the column names and types of the trips DataFrame.
# NOTE(review): the recorded output lists lpep_pickup_datetime,
# time_diff_seconds and time_diff_hours, which are only added by cells further
# down — the output appears to come from an out-of-order run; re-run to confirm.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- time_diff_seconds: long (nullable = true)
 |-- time_diff_hours: double

In [108]:
from pyspark.sql.functions import col, to_timestamp

TIMESTAMP_FORMAT = "yyyy-MM-dd HH:mm:ss"

# NOTE(review): the parsed pickup time lands in a new column named
# "lpep_pickup_datetime", while the filter below reads the original
# "tpep_pickup_datetime". Confirm the name is intended before renaming it;
# other cells may reference it.
df = (
    df
    .withColumn("lpep_pickup_datetime", to_timestamp(col="tpep_pickup_datetime", format=TIMESTAMP_FORMAT))
    .withColumn("tpep_dropoff_datetime", to_timestamp(col="tpep_dropoff_datetime", format=TIMESTAMP_FORMAT))
)

# Half-open interval [2024-10-15, 2024-10-16): every pickup on the 15th and
# nothing from the 16th.
pickup_time = col("tpep_pickup_datetime")
picked_up_on_oct_15 = (pickup_time >= "2024-10-15 00:00:00") & (pickup_time < "2024-10-16 00:00:00")
filtered_df = df.filter(picked_up_on_oct_15)
# Alternative kept for reference:
# filtered_df = df.filter(col("lpep_pickup_datetime").between("2024-10-15 00:00:00", "2024-10-15 23:59:59"))

print(f"15th of October trips: {filtered_df.count()}")

15th of October trips: 128893


# Question 4

In [74]:
from pyspark.sql.functions import col, unix_timestamp, expr

# Trip duration in seconds = drop-off minus pick-up. Both values are read from
# the dataset's own tpep_* columns, so this cell does not rely on a
# differently named pickup column created by another cell.
df = df.withColumn("time_diff_seconds",
                   unix_timestamp(col("tpep_dropoff_datetime")) - unix_timestamp(col("tpep_pickup_datetime")))

# 3600 seconds per hour.
df = df.withColumn("time_diff_hours", expr("time_diff_seconds / 3600"))

# Sort by duration, longest first, and take the top row.
longest_trip_in_hours = df.orderBy(col("time_diff_hours").desc()).first()

print(f"Longest trip in hours: {longest_trip_in_hours['time_diff_hours']:.2f}")

Longest trip in hours: 162.62 


# Question 5

In [76]:
# Show the address (host and port) that the Spark context reports for its web UI.
print(f'Local spark url (with port) is: {spark.sparkContext.uiWebUrl}')

Local spark url (with port) is: http://4b5d4d978470:4040


# Question 6

In [111]:
file_zones_url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'
spark.sparkContext.addFile(file_zones_url)

# Resolve the fetched copy by its file name; the first row holds the column
# names and Spark infers the column types.
zones_csv_path = SparkFiles.get(os.path.basename(file_zones_url))
csv_reader_options = dict(header=True, inferSchema=True)
zone_df = spark.read.csv(zones_csv_path, **csv_reader_options)

zone_df.printSchema()

zone_df.show(10, truncate=False)

# Alternative kept for reference: declare the schema explicitly instead of
# inferring it.
# from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# schema = StructType([
#     StructField("LocationID", IntegerType(), True),
#     StructField("Borough", StringType(), True),
#     StructField("Zone", StringType(), True),
#     StructField("service_zone", StringType(), True)
# ])
# zone_df = spark.read.csv(SparkFiles.get('taxi_zone_lookup.csv'),
#                          header=True,
#                          schema=schema)


root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)

+----------+-------------+-----------------------+------------+
|LocationID|Borough      |Zone                   |service_zone|
+----------+-------------+-----------------------+------------+
|1         |EWR          |Newark Airport         |EWR         |
|2         |Queens       |Jamaica Bay            |Boro Zone   |
|3         |Bronx        |Allerton/Pelham Gardens|Boro Zone   |
|4         |Manhattan    |Alphabet City          |Yellow Zone |
|5         |Staten Island|Arden Heights          |Boro Zone   |
|6         |Staten Island|Arrochar/Fort Wadsworth|Boro Zone   |
|7         |Queens       |Astoria                |Boro Zone   |
|8         |Queens       |Astoria Park           |Boro Zone   |
|9         |Queens       |Auburndale             |Boro Zone   |
|10        |Queens       |Baisley Park           |Boro Zone   |


In [112]:
# Register both DataFrames as temporary views so they can be joined in SQL.
zone_df.createOrReplaceTempView("zone_table")
df.createOrReplaceTempView("yellow_tripdata")

# Count trips per pickup zone, least frequent first. The zone name is a
# secondary sort key: without it, zones with the same trip_count come back in
# an arbitrary order and the single row shown below could differ between runs.
query_str = """
    SELECT
        zones.Zone,
        COUNT(*) AS trip_count
    FROM
        yellow_tripdata AS yt
    INNER JOIN
        zone_table AS zones
    ON
        yt.PULocationID = zones.LocationID
    GROUP BY
        zones.Zone
    ORDER BY
        trip_count ASC,
        zones.Zone ASC
    LIMIT 10
"""

least_frequent_zone = spark.sql(query_str)

# Only the least frequent zone is displayed.
least_frequent_zone.show(1, truncate=False)

+---------------------------------------------+----------+
|Zone                                         |trip_count|
+---------------------------------------------+----------+
|Governor's Island/Ellis Island/Liberty Island|1         |
+---------------------------------------------+----------+
only showing top 1 row

