In [22]:
import configparser
import os, glob
from datetime import datetime
import pandas as pd 
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StringType


In [23]:
spark = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
        ).getOrCreate()

### Read data and create tempview

In [101]:
spark.read.parquet("hvfhs-data-lake/hvl/hvl_table.parquet").createOrReplaceTempView("hvl")
spark.read.parquet("hvfhs-data-lake/weather/weather_table.parquet").createOrReplaceTempView("weather")
spark.read.parquet("hvfhs-data-lake/trip/trip_table.parquet").createOrReplaceTempView("trip")
spark.read.parquet("hvfhs-data-lake/location/location_table.parquet").createOrReplaceTempView("location")
spark.read.parquet("hvfhs-data-lake/time/datetime_table.parquet").createOrReplaceTempView("datetime")


##### Get the number of trips for each weather condition

In [51]:
spark.sql("""SELECT w.condition, COUNT(t.trip_id) as trips
FROM trip t
JOIN weather w ON t.weather = w.weather_id
GROUP BY w.condition
ORDER BY trips DESC
""").toPandas().head(20)

                                                                                

Unnamed: 0,condition,trips
0,Mostly Cloudy / Windy,9262750.0
1,Wintry Mix,9254740.0
2,Light Snow,9251317.0
3,Light Rain,9244854.0
4,Heavy Rain,9244330.0
5,Partly Cloudy,9243314.0
6,Cloudy,9241598.0
7,Mostly Cloudy,9238730.0
8,Fair,9237345.0
9,Rain,9235870.0


In [55]:
spark.sql("""
SELECT 
    weather.condition, 
    COUNT(*) as trips_count, 
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) AS trips_percentage
FROM 
    trip
    JOIN weather ON trip.weather = weather.weather_id
GROUP BY 
    weather.condition
ORDER BY 
    trips_count DESC;
""").toPandas().head(20)

23/04/15 02:06:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/15 02:06:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/15 02:06:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/15 02:06:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




23/04/15 02:06:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/15 02:06:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/15 02:06:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/15 02:06:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

Unnamed: 0,condition,trips_count,trips_percentage
0,Mostly Cloudy,5582649,30.33
1,Cloudy,5093705,27.68
2,Fair,1889220,10.26
3,Light Rain,1659050,9.01
4,Partly Cloudy,1618489,8.79
5,Mostly Cloudy / Windy,447505,2.43
6,Cloudy / Windy,434878,2.36
7,Rain,397843,2.16
8,Partly Cloudy / Windy,291107,1.58
9,Light Snow,171222,0.93


#### Get the total revenue for each weather condition:

In [61]:
spark.sql("""
SELECT w.condition, 
SUM(t.base_passenger_fare + t.tolls + t.bcf + t.sales_tax + t.congestion_surcharge + t.airport_fee + t.tips) as total_revenue
FROM trip t
JOIN weather w ON t.weather = w.weather_id
GROUP BY w.condition
ORDER BY total_revenue DESC
 """).toPandas().head(21)

                                                                                

Unnamed: 0,condition,total_revenue
0,Mostly Cloudy,150722200.0
1,Cloudy,139595700.0
2,Fair,50726250.0
3,Light Rain,49069660.0
4,Partly Cloudy,45016320.0
5,Mostly Cloudy / Windy,12130650.0
6,Cloudy / Windy,11366680.0
7,Rain,10562770.0
8,Partly Cloudy / Windy,7811550.0
9,Light Snow,4593121.0


#### Get the average trip distance and time for each weather condition:

In [64]:
spark.sql("""SELECT w.condition, AVG(t.trip_miles) as avg_miles, AVG(t.trip_time) as avg_time
FROM trip t
JOIN weather w ON t.weather = w.weather_id
GROUP BY w.condition
ORDER BY avg_time DESC
""").toPandas().head(21)

                                                                                

Unnamed: 0,condition,avg_miles,avg_time
0,Heavy Rain / Windy,4.168001,1326.25509
1,Wintry Mix,4.51677,1167.233777
2,Light Rain,4.706033,1132.237915
3,Mostly Cloudy / Windy,4.816865,1119.079988
4,Mostly Cloudy,4.852299,1116.661441
5,Cloudy,4.937213,1093.250481
6,Partly Cloudy,5.020725,1083.621903
7,Rain,4.704648,1075.78853
8,Light Rain with Thunder,4.954317,1064.518452
9,Fog,5.050995,1062.612151


#### Get the total number of taxi trips for each hour of the day, broken down by weather condition:

In [67]:
spark.sql("""
SELECT w.condition, d.hour, COUNT(t.trip_id) as num_trips
FROM trip t
JOIN weather w ON t.weather = w.weather_id
JOIN datetime d ON t.request_datetime = d.datetime_id
GROUP BY w.condition, d.hour
ORDER BY num_trips DESC
""").toPandas().head(50)

[Stage 200:>                                                       (0 + 4) / 10]

23/04/15 02:29:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:29:43 WARN RowBasedKeyValueBatch: Calling spill() on



23/04/15 02:30:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/15 02:30:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/15 02:30:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:30:38 WARN RowBasedKeyValueBatch: Calling spill() on



23/04/15 02:31:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/15 02:31:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:31:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

Unnamed: 0,condition,hour,num_trips
0,Mostly Cloudy,19,15125585
1,Mostly Cloudy,20,13613640
2,Mostly Cloudy,21,11715962
3,Cloudy,10,11223512
4,Cloudy,0,10458931
5,Mostly Cloudy,18,10271090
6,Mostly Cloudy,17,9200646
7,Mostly Cloudy,22,9097092
8,Mostly Cloudy,16,8350881
9,Cloudy,21,8337301


#### Get total number of trips in data

In [68]:
spark.sql("SELECT COUNT(trip_id) AS total_trips FROM trip").toPandas().head()

                                                                                

Unnamed: 0,total_trips
0,18479031


#### Get the total revenue generated by each base:

In [103]:
spark.sql(
    """SELECT h.affiliation, SUM(t.base_passenger_fare + t.tolls + t.bcf + t.sales_tax + t.congestion_surcharge + t.airport_fee + t.tips + t.driver_pay) AS total_revenue
FROM trip t
JOIN hvl h ON t.hvfhs_license_num = h.hv_license_number
GROUP BY h.affiliation
ORDER BY total_revenue DESC
    """
).toPandas().head()

                                                                                

Unnamed: 0,affiliation,total_revenue
0,Uber,18497890000.0
1,Lyft,399290200.0


#### Get the total number of trips per day of the week:

In [78]:
spark.sql(
"""SELECT d.dow, COUNT(*) AS total_trips
FROM trip t
JOIN datetime d ON t.pickup_datetime = d.datetime_id
GROUP BY d.dow
ORDER BY total_trips DESC
"""
).toPandas().head()

[Stage 238:>                                                        (0 + 4) / 6]

23/04/15 02:45:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:45:41 WARN RowBasedKeyValueBatch: Calling spill() on



23/04/15 02:46:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:46:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/15 02:46:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:46:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:46:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:46:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:46:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:46:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/15 02:46:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

Unnamed: 0,dow,total_trips
0,1,85836743
1,7,81474201
2,6,66846355
3,5,61141149
4,3,57153664


####  get the top 10 busiest pickup locations:

In [94]:
spark.sql(
"""SELECT t.PULocationID, l.borough, l.zone, COUNT(t.trip_id) AS total_trips
FROM trip t
JOIN location l ON t.PULocationID = l.location_id
GROUP BY t.PULocationID, l.borough, l.zone
ORDER BY total_trips DESC
LIMIT 10;
"""

).toPandas().head()

                                                                                

Unnamed: 0,PULocationID,borough,zone,total_trips
0,132,Queens,JFK Airport,346138
1,138,Queens,LaGuardia Airport,307590
2,79,Manhattan,East Village,276059
3,230,Manhattan,Times Sq/Theatre District,242050
4,61,Brooklyn,Crown Heights North,241050
