In [1]:
# Adjust Python path within the notebook so the project-local `scripts`
# package can be imported.
import os
import sys

# Derive the project root from the working directory instead of hard-coding a
# user-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the notebook is run from a direct sub-directory of the
# project root, which is what the "../data" and "../plots" paths used below
# already require — confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import scripts

In [2]:
import matplotlib.pyplot as plt
import seaborn as sb
import pandas
import os

In [3]:
# Create the SparkSession through the project helper in scripts.clean_base;
# `spark` is used below to read the parquet files and to build the empty
# accumulator DataFrame.
spark = scripts.clean_base.create_spark_session()

24/08/27 17:20:01 WARN Utils: Your hostname, THANGs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 100.86.89.69 instead (on interface en0)
24/08/27 17:20:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/27 17:20:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/27 17:20:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Create the top-level plot directory for the Uber dataset
# (presumably "../plots/uber/" — verify against scripts.download.make_directories).
scripts.download.make_directories(["../plots/"], ["uber"])

In [5]:
# Create one sub-directory per plot type under the Uber plot directory; the
# plotting cells below write to "../plots/uber/correlation/", ".../histogram/",
# ".../daily/" and ".../hourly/".
scripts.download.make_directories(["../plots/uber/"], ["correlation", "histogram", "daily", "hourly"])

In [6]:
# Locate every raw Uber parquet dataset, then load each one as its own Spark
# DataFrame; uber_dfs[i] is read from uber_files[i].
uber_files = scripts.clean_base.list_parquet_directories("../data/raw/uber/")
uber_dfs = [
    spark.read.parquet(parquet_path)
    for parquet_path in uber_files
]

In [7]:
# Show the discovered files. The listing is not in chronological order, so the
# monthly datasets are processed in this (arbitrary) order below.
uber_files

['../data/raw/uber/2023-08.parquet',
 '../data/raw/uber/2023-11.parquet',
 '../data/raw/uber/2023-10.parquet',
 '../data/raw/uber/2023-09.parquet',
 '../data/raw/uber/2023-06.parquet',
 '../data/raw/uber/2023-07.parquet']

In [8]:
# Extract the schema from the first monthly DataFrame; the empty accumulator
# below reuses it so that later union() calls line up column by column.
schema = uber_dfs[0].schema

# Create an empty DataFrame using the extracted schema
unioned_df = spark.createDataFrame([], schema)

unioned_df # still empty here; the sub-sample of each dataset is unioned into it in the next cell

dispatching_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_distance,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,total_amount,waiting_time,fare_per_miles


In [9]:
# Plot a correlation heatmap for every monthly dataset and collect a sub-sample
# of each month into `unioned_df`.

# Start from a fresh, empty accumulator so that re-running this cell does not
# append every monthly sample a second time.
unioned_df = spark.createDataFrame([], uber_dfs[0].schema)

# uber_dfs[i] was read from uber_files[i], so pair them directly instead of
# searching the list for each DataFrame.
for file_path, df in zip(uber_files, uber_dfs):
    # "../data/raw/uber/2023-08.parquet" -> "2023-08"
    date_part = os.path.splitext(os.path.basename(file_path))[0]
    print(date_part)
    scripts.manipulate_data.find_min_max_df(df, scripts.clean_high_volume.COLUMNS)
    # Presumably keeps a 5% sample with respect to dispatching_base_num —
    # confirm against scripts.clean_base.sampling_data. The former df.count()
    # calls around this line were dropped: each ran a full Spark job and the
    # result was discarded.
    df = scripts.clean_base.sampling_data(df, "dispatching_base_num", 0.05)
    unioned_df = unioned_df.union(df)
    scripts.plot_data.plot_correlation_heatmap(df, scripts.clean_high_volume.COLUMNS, "uber", "../plots/uber/correlation/", date_part)

2023-08




../plots/uber/correlation/2023-08.png
2023-11




../plots/uber/correlation/2023-11.png
2023-10


24/08/27 17:20:20 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


../plots/uber/correlation/2023-10.png
2023-09




../plots/uber/correlation/2023-09.png
2023-06




../plots/uber/correlation/2023-06.png
2023-07




../plots/uber/correlation/2023-07.png


In [10]:
# Number of rows in the combined sample of all monthly datasets
unioned_df.count()




4067358

In [11]:
# Plot the distribution of each continuous column of the combined sample;
# one histogram image per column is saved under ../plots/uber/histogram/.
continuous_columns = scripts.clean_high_volume.COLUMNS
for column_name in continuous_columns:
    scripts.plot_data.plot_histogram(
        unioned_df,
        column_name,
        "uber",
        "../plots/uber/histogram/",
        50,  # presumably the number of bins — confirm against plot_histogram
    )



0 104.8
../plots/uber/histogram/trip_distance_histogram.png




0 202.55
../plots/uber/histogram/trip_time_histogram.png




0 317.55
../plots/uber/histogram/base_passenger_fare_histogram.png




0.0 51.0
../plots/uber/histogram/tolls_histogram.png




0.0 19.29
../plots/uber/histogram/bcf_histogram.png




0.0 35.03
../plots/uber/histogram/sales_tax_histogram.png




0.0 15.5
../plots/uber/histogram/congestion_surcharge_histogram.png




0.0 15.0
../plots/uber/histogram/airport_fee_histogram.png




0.0 64.28
../plots/uber/histogram/tips_histogram.png




0.0 271.68
../plots/uber/histogram/driver_pay_histogram.png




0 357.19000000000005
../plots/uber/histogram/total_amount_histogram.png




0 52.88333333333333
../plots/uber/histogram/waiting_time_histogram.png




0 42.462809917355365
../plots/uber/histogram/fare_per_miles_histogram.png


Plot the average of each continuous column over time, grouped by pickup date (daily) and by pickup hour (hourly).

In [12]:
from pyspark.sql.functions import avg, hour, to_date

# Add a "pickup_date" column holding the calendar date of each pickup
# timestamp. No averaging happens here yet; the per-day averages are computed
# by the grouping step that consumes this DataFrame.
daily_avg_df = unioned_df.withColumn("pickup_date", to_date("pickup_datetime"))


In [13]:
# Daily
# For each continuous column, average it per pickup date and scatter-plot the
# daily averages.
# NOTE(review): as recorded in this cell's output, the scatter_plot call fails
# with "TypeError: scatter_plot() missing 1 required positional argument:
# 'output_path'". Five positional arguments are passed, so the signature in
# scripts.plot_data must be checked and the missing argument supplied.
for col in scripts.clean_high_volume.COLUMNS:
    # Group by date and calculate average
    average_df = daily_avg_df.groupBy("pickup_date").agg(avg(col).alias(col))
    # NOTE(review): the count() result is discarded; it only triggers a Spark job.
    average_df.count()
    scripts.plot_data.scatter_plot(average_df, "pickup_date", col, "uber", "../plots/uber/daily/")



TypeError: scatter_plot() missing 1 required positional argument: 'output_path'

In [None]:
# Hourly
# For each continuous column, average it per pickup hour and scatter-plot the
# hourly averages.
# The averages are taken over the combined sample `unioned_df`, matching the
# daily plots. The previous code read `df`, the leftover loop variable of the
# correlation cell, which only holds the sample of the last month processed.
# NOTE(review): the identical five-argument scatter_plot call in the daily cell
# raised "TypeError: ... missing 1 required positional argument: 'output_path'";
# this call needs the same correction once the signature is confirmed.
for col in scripts.clean_high_volume.COLUMNS:
    # Extract hour from timestamp and group by it
    hourly_avg_df = unioned_df.groupBy(hour(unioned_df["pickup_datetime"]).alias("hour")).agg(avg(col).alias(col))
    scripts.plot_data.scatter_plot(hourly_avg_df, "hour", col, "uber", "../plots/uber/hourly/")

Compare average values of continuous columns grouped by categorical columns

In [None]:
# Average of every continuous column for each dispatching base, printed as a
# plain-text table under a one-line header.
print("Grouped by: dispatching_base_num")
print(
    scripts.manipulate_data.group_by_and_avg(
        unioned_df,
        "dispatching_base_num",
        scripts.clean_high_volume.COLUMNS,
    )
)

Grouped by: dispatching_base_num




   dispatching_base_num  average_trip_distance  average_trip_time  \
0                B03404               5.162139        1223.020417   
1                B03556               1.460000         542.000000   
2                B03438               1.420000        1965.000000   
3                B03472               1.270000         571.000000   
4                B03408               1.420000         617.000000   
5                B03430               2.321667         830.833333   
6                B03493               4.997895        1033.263158   
7                B02764               5.563378        1278.783784   
8                B02835               3.143000         969.400000   
9                B02872              12.310000        1583.846154   
10               B02871               3.372778         930.722222   
11               B02889               5.495500        1302.250000   
12               B02887               6.094167        1350.416667   
13               B02870           



In [None]:
# Write curated dataset: persist the combined sample of all monthly datasets
# to the curated data directory via the project helper.
scripts.clean_base.write_data(unioned_df, "../data/curated/uber/uber.parquet")



dispatching_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_distance,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay
B03404,2023-08-27 10:13:23,2023-08-27 10:15:38,2023-08-27 10:16:45,2023-08-27 10:34:44,60,7,7.31,1079,18.43,6.94,0.7,2.25,0.0,0.0,0.0,19.97
B03404,2023-08-27 10:38:13,2023-08-27 10:40:35,2023-08-27 10:41:23,2023-08-27 10:59:49,249,161,3.12,1106,14.49,0.0,0.4,1.29,2.75,0.0,3.0,14.49
B03404,2023-08-27 10:01:41,2023-08-27 10:06:26,2023-08-27 10:06:57,2023-08-27 10:22:26,234,231,3.0,929,23.74,0.0,0.65,2.11,2.75,0.0,2.0,13.29
B03404,2023-08-27 10:13:13,2023-08-27 10:15:27,2023-08-27 10:15:40,2023-08-27 10:24:43,83,223,3.2,543,12.72,0.0,0.35,1.13,0.0,0.0,0.0,9.3
B03404,2023-08-27 10:24:47,2023-08-27 10:26:51,2023-08-27 10:28:44,2023-08-27 10:40:50,102,102,1.54,726,11.8,0.0,0.32,1.05,0.0,0.0,0.0,8.84
B03404,2023-08-27 10:47:16,2023-08-27 10:48:51,2023-08-27 10:50:51,2023-08-27 11:24:29,236,265,12.35,2018,51.91,20.0,1.98,0.0,0.0,0.0,0.0,39.04
B03404,2023-08-27 10:44:00,2023-08-27 10:45:38,2023-08-27 10:46:23,2023-08-27 10:53:01,41,236,1.88,398,12.72,0.0,0.35,1.13,2.75,0.0,0.0,6.21
B03404,2023-08-27 10:14:58,2023-08-27 10:19:12,2023-08-27 10:21:13,2023-08-27 10:49:12,61,256,4.78,1679,23.49,0.0,0.65,2.08,0.0,0.0,0.0,22.06
B03404,2023-08-27 10:55:49,2023-08-27 10:58:19,2023-08-27 10:58:27,2023-08-27 11:15:55,149,22,3.21,1048,17.84,0.0,0.49,1.58,0.0,0.0,0.0,14.07
B03404,2023-08-27 10:44:58,2023-08-27 10:50:20,2023-08-27 10:50:49,2023-08-27 11:31:54,72,132,9.86,2465,35.7,0.0,1.05,3.39,0.0,2.5,6.39,38.17
