## Install Dependencies (if needed) ## 

In [5]:
## !pip install pyspark
## !pip install pandas
## !pip install plotly

## Create Spark Session ##

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import plotly.express as px

spark = SparkSession.builder \
    .appName("NYC Taxi — Analysis Only") \
    .config("spark.driver.memory", "6g") \
    .config("spark.sql.shuffle.partitions", "50") \
    .getOrCreate()

print("Spark session created.")

Spark session created.


## Load Data from Parquet ##

In [29]:
combined_df = spark.read.parquet("Cleaned_Parquet/combined/")
city_hourly = spark.read.parquet("Cleaned_Parquet/city_hourly/")
zone_hourly = spark.read.parquet("Cleaned_Parquet/zone_hourly/")
taxi_df = spark.read.parquet("Cleaned_Parquet/ taxi_clean/")
weather_df = spark.read.parquet("Cleaned_Parquet/weather_clean/")

## Question 1: How Does Hourly Taxi Demand Vary Throughout the Day? ##

In [8]:
# Q1: How does hourly taxi demand vary throughout the day?
demand_by_hour = city_hourly.groupBy('hour').agg(avg('trip_count').alias('avg_trips')).orderBy('hour')
demand_by_hour.show(24)

# Export
demand_by_hour.coalesce(1).write.mode('overwrite').csv('outputs/demand_by_hour.csv', header=True)

+----+------------------+
|hour|         avg_trips|
+----+------------------+
|   0| 5275.260791366906|
|   1|3700.6666666666665|
|   2| 2600.190476190476|
|   3|1863.6003649635036|
|   4|1433.6386861313867|
|   5|1692.2531876138432|
|   6|3846.6854545454544|
|   7| 6764.540983606557|
|   8| 8462.445454545454|
|   9| 8613.586520947176|
|  10| 8651.165154264972|
|  11| 9092.161818181818|
|  12| 9647.972826086956|
|  13| 9787.379310344828|
|  14| 10319.35390199637|
|  15| 10372.43761301989|
|  16| 9820.644283121597|
|  17|11061.562613430127|
|  18| 12122.48731884058|
|  19|11411.127504553733|
|  20|10282.623188405798|
|  21|10133.723327305606|
|  22|  9419.59891107078|
|  23| 7380.220614828209|
+----+------------------+



In [28]:
import plotly.express as px

# Convert to pandas (24 rows — safe)
df_hourly = demand_by_hour.toPandas()

fig = px.line(
    df_hourly,
    x="hour",
    y="avg_trips",
    title="Average NYC Taxi Demand by Hour of Day",
    markers=True,
    labels={"hour": "Hour of Day (0–23)", "avg_trips": "Avg Trips per Hour"},
    template="plotly_dark"
)

fig.update_layout(
    xaxis=dict(dtick=1),
    title_font_size=20,
    yaxis_title="Avg Trips",
    xaxis_title="Hour"
)

fig.show()


## Question 2: Which NYC zones have the highest pickup activity? ##

In [10]:
# Q2: Which NYC zones have the highest pickup activity?
zone_totals = combined_df.groupBy('PULocationID').agg(count('*').alias('total_trips')).orderBy(col('total_trips').desc())
zone_totals.show(20)

zone_totals.coalesce(1).write.mode('overwrite').csv('outputs/zone_totals.csv', header=True)


                                                                                

+------------+-----------+
|PULocationID|total_trips|
+------------+-----------+
|         237|    4381819|
|         161|    4150064|
|         236|    3993845|
|         162|    3646602|
|         186|    3616284|
|         230|    3465283|
|         132|    3237159|
|          48|    3192020|
|         170|    3112745|
|         142|    3035796|
|         234|    3022014|
|         163|    2704450|
|         239|    2640544|
|          79|    2558840|
|         138|    2479920|
|          68|    2431547|
|         141|    2386134|
|         164|    2325227|
|         107|    2276794|
|         100|    2051024|
+------------+-----------+
only showing top 20 rows


                                                                                

In [21]:
from pyspark.sql.functions import col, count

# Compute pickups per zone
zone_demand = (
    taxi_df
    .groupBy("PULocationID")
    .agg(count("*").alias("total_pickups"))
    .orderBy(col("total_pickups").desc())
)

zone_pd = zone_demand.toPandas()

import plotly.express as px
fig = px.bar(
    zone_pd.head(20),  # top 20 busiest zones
    x='PULocationID',
    y='total_pickups',
    title='Top 20 NYC Pickup Zones by Activity',
    text_auto=True,
    template='plotly_dark'
)
fig.update_layout(xaxis_title='Pickup Zone (ID)', yaxis_title='Total Pickups')
fig.show()


                                                                                

### Analysis ###


The results show that the highest taxi pickup activity is heavily concentrated in Manhattan, particularly in the Upper East Side and Midtown regions. The busiest zones include Upper East Side North (PULocationID 237), Midtown Center (161), and Upper East Side South (236), each recording around four million or more total pickups. Other major hotspots such as Midtown North, Murray Hill–Kips Bay, Times Square, and several Upper West Side zones also rank within the top 20. These areas are characterized by dense residential neighborhoods, major business districts, high tourism, and large transit hubs, all of which contribute to consistently strong taxi demand. Overall, the results highlight that Manhattan dominates taxi pickup activity across the city.

## Question 3: How does rain affect taxi demand / number of pickups ? ##

Does rain increase or decrease taxi demand on a per-hour basis? (Did raining hours have more pickups than non-rain hours?)

In [11]:
#using 2 groups: raining vs not raining 

from pyspark.sql import functions as F

hourly_counts = (
    combined_df
    .groupBy("pickup_hour", "hour", "is_rainy")
    .agg(F.count("*").alias("trip_count"))
)
avg_by_rain = (
    hourly_counts
    .groupBy("is_rainy")
    .agg(F.avg("trip_count").alias("avg_trips_per_hour"))
    .orderBy("is_rainy")  
)
avg_by_rain.show()

#4 groups based on precipitation amt 
combined_df_prec_levels = (
    combined_df
    .withColumn(
        "precip_level",
        F.when(F.col("rain_mm") == 0, "None")
         .when((F.col("rain_mm") > 0) & (F.col("rain_mm") <= 1), "Light")
         .when((F.col("rain_mm") > 1) & (F.col("rain_mm") <= 5), "Moderate")
         .otherwise("Heavy")
    )
)
hourly_counts_prec_levels = (
    combined_df_prec_levels
    .groupBy("pickup_hour", "hour", "precip_level")
    .agg(F.count("*").alias("trip_count"))
)
avg_by_precip = (
    hourly_counts_prec_levels
    .groupBy("precip_level")
    .agg(F.avg("trip_count").alias("avg_trips_per_hour"))
    .orderBy("precip_level")
)

avg_by_precip.show()


                                                                                

+--------+------------------+
|is_rainy|avg_trips_per_hour|
+--------+------------------+
|       0| 7694.475052521008|
|       1| 7453.512548800893|
+--------+------------------+





+------------+------------------+
|precip_level|avg_trips_per_hour|
+------------+------------------+
|       Heavy|           1962.98|
|       Light|     7394.17421875|
|    Moderate| 7606.591375770021|
|        None| 7744.544493392071|
+------------+------------------+



                                                                                

Does rain change the distribution of demand across time?

In [12]:
# 2 groups 
hourly_distribution = (
    hourly_counts
    .groupBy("hour", "is_rainy")
    .agg(F.avg("trip_count").alias("avg_trips"))
    .orderBy("hour", "is_rainy")
)

hourly_pivot = (
    hourly_distribution
    .groupBy("hour")
    .pivot("is_rainy", [0, 1])
    .agg(F.first("avg_trips"))
    .withColumnRenamed("0", "avg_trips_clear")
    .withColumnRenamed("1", "avg_trips_rain")
    .orderBy("hour")
)
hourly_pivot.show(24)

##Plot results:

import pandas
import plotly.express as px
hourly_pd = hourly_pivot.toPandas()
fig = px.line(
    hourly_pd,
    x="hour",
    y=["avg_trips_clear", "avg_trips_rain"],
    title="Hourly Taxi Demand: Rain vs Clear",
    labels={"value": "Average Trips", "variable": "Weather"},
    markers=True,
    template="plotly_dark"
)
fig.update_layout(
    xaxis=dict(dtick=1),
    legend_title_text="Condition"
)
fig.show()

                                                                                

+----+------------------+------------------+
|hour|   avg_trips_clear|    avg_trips_rain|
+----+------------------+------------------+
|   0| 5303.867219917012|5088.9324324324325|
|   1|3688.8402489626556|            3782.1|
|   2|2608.5379746835442|2545.2361111111113|
|   3| 1880.162886597938| 1736.095238095238|
|   4|1442.9248434237995|1369.1739130434783|
|   5|1694.1945031712473| 1680.171052631579|
|   6|3834.1365546218485|3927.4054054054054|
|   7| 6736.797872340426| 6929.594936708861|
|   8| 8567.191176470587| 7788.675675675676|
|   9|  8727.73206751055|7892.1866666666665|
|  10|  8685.09631147541| 8388.333333333334|
|  11|  9057.90368852459| 9361.806451612903|
|  12|  9583.44578313253|10243.055555555555|
|  13| 9778.281690140846| 9871.111111111111|
|  14|10351.713402061856|10081.560606060606|
|  15| 10422.42468619247|10053.853333333333|
|  16| 9890.517671517671| 9340.514285714286|
|  17|11196.117270788913|10291.975609756097|
|  18|12284.768085106383|11192.341463414634|
|  19|1154

                                                                                

In [13]:
## 4 groups 
import plotly.express as px

hourly_distribution_prec_levels = (
    hourly_counts_prec_levels
    .groupBy("hour", "precip_level")
    .agg(F.avg("trip_count").alias("avg_trips"))
    .orderBy("hour", "precip_level")
)
hourly_pivot_prec_levels = (
    hourly_distribution_prec_levels
    .groupBy("hour")
    .pivot("precip_level", ["None", "Light", "Moderate", "Heavy"])
    .agg(F.first("avg_trips"))
    .orderBy("hour")
)
hourly_pivot_prec_levels.show(24)

prec_levels_pd = hourly_pivot_prec_levels.toPandas()
prec_levels_pd = prec_levels_pd.rename(columns={
    "None": "No Rain",
    "Light": "Light Rain",
    "Moderate": "Moderate Rain",
    "Heavy": "Heavy Rain"
})
fig = px.line(
    prec_levels_pd,
    x="hour",
    y=["No Rain", "Light Rain", "Moderate Rain", "Heavy Rain"],
    title="Hourly Taxi Demand by Precipitation Level",
    labels={"value": "Average Trips", "variable": "Precipitation"},
    markers=True,
    template="plotly_dark"
)

fig.update_layout(xaxis=dict(dtick=1))
fig.show()

                                                                                

+----+------------------+------------------+------------------+------------------+
|hour|              None|             Light|          Moderate|             Heavy|
+----+------------------+------------------+------------------+------------------+
|   0| 5370.100840336135| 4468.411764705882| 6526.318181818182| 772.7142857142857|
|   1| 3727.356394129979|3554.7083333333335| 4340.428571428572| 507.3333333333333|
|   2|2613.9936575052852| 2303.769230769231| 3337.842105263158|              35.0|
|   3|1884.0330578512396|1609.5555555555557|2052.4444444444443|               7.0|
|   4|1445.9225941422594|1302.3958333333333| 1317.157894736842|            2314.0|
|   5|1697.7796610169491|1625.1607142857142|1840.8333333333333|1183.6666666666667|
|   6|3858.4418604651164|3981.2156862745096| 3808.086956521739|               2.0|
|   7| 6751.157782515991|           6794.15| 7357.315789473684|               2.0|
|   8| 8585.214736842105| 7907.018518518518|           7469.15|               6.0|
|   

                                                                                

### Written Analysis ##

These results show that heavy rain seems to greatly decrease taxi demand overall, and that clear weather, especially during peak hours, tends to have the highest demand compared to any amount of precipitation greater than 0. It appears that light and moderate precipitation do not affect the distribution of demand throughout the day, and thqt the peak and off-peak hours stay the same under these conditions. When there is light or moderate precipitaiton, the demand may be slightly higher in off-peak hours compared to when there is no rain. 

## Question 4: Do colder temperatures lead to higher taxi usage? ##

In [14]:
from pyspark.sql import functions as F

combined_df_temp = (
    combined_df
    .withColumn(
        "temp_level",
        F.when(F.col("temperature_c") < 0, "Very Cold")
         .when(F.col("temperature_c") < 5, "Cold")
         .when(F.col("temperature_c") < 15, "Cool")
         .otherwise("Warm")
    )
)

hourly_counts_temp = (
    combined_df_temp
    .groupBy("pickup_hour", "hour", "temp_level")
    .agg(F.count("*").alias("trip_count"))
)

avg_by_temp = (
    hourly_counts_temp
    .groupBy("temp_level")
    .agg(F.avg("trip_count").alias("avg_trips_per_hour"))
    .orderBy("avg_trips_per_hour", ascending=False)
)
avg_by_temp.show()
hourly_distribution_temp = (
    hourly_counts_temp
    .groupBy("hour", "temp_level")
    .agg(F.avg("trip_count").alias("avg_trips"))
    .orderBy("hour", "temp_level")
)
hourly_distribution_temp_pivot = (
    hourly_distribution_temp
    .groupBy("hour")
    .pivot("temp_level", ["Very Cold", "Cold", "Cool", "Warm"])
    .avg("avg_trips")
    .orderBy("hour")
)
hourly_distribution_temp_pivot.show()

                                                                                

+----------+------------------+
|temp_level|avg_trips_per_hour|
+----------+------------------+
| Very Cold| 9802.563636363637|
|      Cold| 8810.665818490246|
|      Warm| 7206.363498098859|
|      Cool| 6964.597600369174|
+----------+------------------+



                                                                                

+----+------------------+------------------+------------------+------------------+
|hour|         Very Cold|              Cold|              Cool|              Warm|
+----+------------------+------------------+------------------+------------------+
|   0|7398.2307692307695| 6432.765957446809| 4614.114130434783| 4982.581589958159|
|   1| 4924.954545454545| 4481.292929292929|3126.9840425531916| 3595.244343891403|
|   2|         3452.6875|2939.9791666666665| 2266.427807486631|2548.4418604651164|
|   3| 2389.698113207547| 2378.233644859813|1542.9304812834225|1749.2537313432836|
|   4|1892.3275862068965|1657.1214953271028|1225.6702702702703|1372.8181818181818|
|   5| 2078.064516129032|1931.1607142857142|1460.8833333333334|1645.9384615384615|
|   6| 4836.184615384615| 4324.017094017094| 3350.637931034483| 3672.185567010309|
|   7|  8703.89705882353| 7617.387931034483| 6147.005681818182| 6118.402116402116|
|   8|           11051.6| 9614.196581196582| 7457.897727272727| 7718.085561497326|
|   

In [15]:
import plotly.express as px

hourly_temp_pd = hourly_distribution_temp.toPandas()

fig = px.line(
    hourly_temp_pd,
    x="hour",
    y="avg_trips",
    color="temp_level",
    title="Taxi Demand by Hour Under Different Temperature Levels",
    markers=True,
    template="plotly_dark"
)
fig.update_layout(
    xaxis=dict(dtick=1),
    legend_title_text="Temperature Level"
)
fig.show()


                                                                                

In [25]:


combined_df.printSchema()
# Drop missing values for clarity
df_clean = combined_df.dropna(subset=['rain_mm', 'trip_count'])

# Compute correlation
corr = df_clean['rain_mm'].corr(df_clean['trips'])
print(f"Correlation between rain and taxi demand: {corr:.3f}")

# Plot: Rain vs Taxi Demand
fig = px.scatter(
    df_clean,
    x='rain_mm',
    y='trips',
    trendline='ols',
    title=f"Hourly Rain vs Taxi Demand (Correlation = {corr:.2f})",
    template='plotly_dark'
)
fig.update_layout(xaxis_title="Rain (mm)", yaxis_title="Number of Trips")
fig.show()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- pickup_hour: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- time_o

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `trip_count` cannot be resolved. Did you mean one of the following? [`VendorID`, `tpep_pickup_datetime`, `tpep_dropoff_datetime`, `passenger_count`, `trip_distance`, `RatecodeID`, `store_and_fwd_flag`, `PULocationID`, `DOLocationID`, `payment_type`, `fare_amount`, `extra`, `mta_tax`, `tip_amount`, `tolls_amount`, `improvement_surcharge`, `total_amount`, `congestion_surcharge`, `pickup_hour`, `year`, `month`, `day`, `time_only`, `temperature_c`, `precip_mm`, `rain_mm`, `cloudcover`, `cloudcover_low`, `cloudcover_mid`, `cloudcover_high`, `windspeed_kmh`, `winddirection_10m (°)`, `hour`, `day_of_week`, `is_rainy`, `is_cold`, `duration_min`]. SQLSTATE: 42703

### Written Analysis ###

Here we can very clearly see that temperature has a direct impact on taxi demand. Very cold temperatures have the highest taxi demand at all hours of the day, and contribute to an even larger peak during the typical overall peak hours. Cold temperatures have a similar but lesser effect, and cool temperatures hover close to warm temperatures or slightly higher. 

## Question 5: Are trip durations longer during bad weather? ##

In [16]:
duration_df_1 = combined_df.withColumn(
    "duration_min",
    (F.unix_timestamp("tpep_dropoff_datetime") -
     F.unix_timestamp("tpep_pickup_datetime")) / 60
)
avg_duration_1 = (
    duration_df_1
    .groupBy("is_rainy")
    .agg(F.avg("duration_min").alias("avg_duration_min"))
)
avg_duration_1.show()



+--------+------------------+
|is_rainy|  avg_duration_min|
+--------+------------------+
|       0| 17.65615726606718|
|       1|17.580494497915723|
+--------+------------------+



                                                                                

In [17]:
duration_precip = combined_df.withColumn(
    "precip_level",
    F.when(F.col("precip_mm") == 0, "0. None")
     .when(F.col("precip_mm") < 1, "1. Light (<1mm)")
     .when(F.col("precip_mm") < 3, "2. Moderate (1–3mm)")
     .otherwise("3. Heavy (>3mm)")
)
avg_duration_precip = (
    duration_precip
    .groupBy("precip_level")
    .agg(F.avg("duration_min").alias("avg_duration_min"))
    .orderBy("precip_level")
)

avg_duration_precip.show()

duration_quantiles = (
    duration_precip
    .groupBy("precip_level")
    .agg(F.expr("percentile(duration_min, 0.5)").alias("median_duration"),
         F.expr("percentile(duration_min, 0.9)").alias("p90_duration"))
)
pd_precip = avg_duration_precip.toPandas()

import plotly.express as px
fig = px.bar(
    pd_precip,
    x="precip_level",
    y="avg_duration_min",
    title="Average Trip Duration by Precipitation Level",
    text_auto=".2f",
    template="plotly_dark"
)
fig.show()


                                                                                

+-------------------+------------------+
|       precip_level|  avg_duration_min|
+-------------------+------------------+
|            0. None|17.663491331533688|
|    1. Light (<1mm)|17.676529455093064|
|2. Moderate (1–3mm)| 17.11168162955126|
|    3. Heavy (>3mm)|17.930004926887932|
+-------------------+------------------+



                                                                                

In [18]:
duration_temp = combined_df.withColumn(
    "temp_level",
    F.when(F.col("temperature_c") < 5, "Cold (<5°C)")
    .when(F.col("temperature_c") > 25, "Hot (>25°C)")
    .otherwise("Mild (5–25°C)")
)

avg_duration_temp = (
    duration_temp
    .groupBy("temp_level")
    .agg(F.avg("duration_min").alias("avg_duration_min"))
    .orderBy("temp_level")
)

avg_duration_temp.show()

temp_stats = (
    duration_temp
    .groupBy("temp_level")
    .agg(
        F.avg("duration_min").alias("avg_duration"),
        F.expr("percentile(duration_min, 0.5)").alias("median_duration")
    )
    .orderBy("temp_level")
)

pd_temp = temp_stats.toPandas()

fig = px.bar(
    pd_temp,
    x="temp_level",
    y=["avg_duration", "median_duration"],
    barmode="group",
    title="Trip Duration vs Temperature Category",
    labels={"value": "Duration (minutes)", "temp_level": "Temperature Level"},
    text_auto=".1f",
    template="plotly_dark"
)
fig.show()



                                                                                

+-------------+------------------+
|   temp_level|  avg_duration_min|
+-------------+------------------+
|  Cold (<5°C)|17.036486691265456|
|  Hot (>25°C)| 18.21428075306902|
|Mild (5–25°C)|17.898761180313365|
+-------------+------------------+



                                                                                

### Written Analysis ###

Based on these findings, it appears that temperature and precipitation both have very little to no effect on trip duration.

## Question 6: Which zones experience the highest fare surges?  ##

In [None]:
# Q6: Which zones experience the highest fare surges?
from pyspark.sql.functions import first

zone_fare_weather = combined_df.groupBy('PULocationID','is_rainy').agg(avg('fare_amount').alias('avg_fare'))
zone_fare_pivot = zone_fare_weather.groupBy('PULocationID').pivot('is_rainy', [0,1]).agg(first('avg_fare'))
# rename pivot columns if they exist
cols = zone_fare_pivot.columns
if '0' in cols:
    zone_fare_pivot = zone_fare_pivot.withColumnRenamed('0','fare_clear')
if '1' in cols:
    zone_fare_pivot = zone_fare_pivot.withColumnRenamed('1','fare_rainy')

zone_fare_pivot = zone_fare_pivot.withColumn('pct_increase', (col('fare_rainy') - col('fare_clear'))/col('fare_clear')*100)
zone_fare_pivot.orderBy(col('pct_increase').desc()).show(20)

zone_fare_pivot.coalesce(1).write.mode('overwrite').csv('outputs/zone_fare_pivot.csv', header=True)

We compared average taxi fares from each pickup zone during clear conditions versus rainy conditions. Several zones showed significant fare increases, in some cases more than 40–60% higher when it was raining. Because NYC yellow taxis do not use surge pricing, these increases are most likely driven by travel delays caused by rain, such as heavier traffic congestion, reduced road speeds, and longer trip durations. Zones near major transit choke-points or dense commercial areas appear to be the most affected.

## Question 7: What is the correlation between precipitation and city-wide taxi demand, and between temperature and city-wide taxi demand? ##

In [34]:
from pyspark.sql import functions as F
from pyspark.sql import functions as F

hourly = combined_df.groupBy("pickup_hour", "hour").agg(
    F.sum("trip_distance").alias("total_distance"),
    F.count("*").alias("trip_count"),
    F.avg("rain_mm").alias("rain_mm"),
    F.avg("temperature_c").alias("temperature_c")
)
corr_by_hour = (
    hourly.groupBy("hour")
    .agg(F.corr("rain_mm", "trip_count").alias("corr_rain_demand"))
    .orderBy("hour")
)
corr_by_hour.show(24)





+----+--------------------+
|hour|    corr_rain_demand|
+----+--------------------+
|   0| 0.02983007909720509|
|   1| 0.02247646315620474|
|   2| 0.02683976855698382|
|   3|0.026308101556556617|
|   4| 0.06757730485215487|
|   5| 0.03471554783922242|
|   6|-0.01957072841862...|
|   7|-0.00732093605595...|
|   8|-0.04211973522929...|
|   9|-0.05563681876860...|
|  10|-0.00642671691951...|
|  11|0.006722666202910375|
|  12|0.013981369001644818|
|  13|0.016970981435803757|
|  14|0.004391377247016047|
|  15|-0.01614789949997...|
|  16|-0.02768297939305...|
|  17| -0.0380344659914005|
|  18|-0.10733377880662966|
|  19| -0.1095523368443584|
|  20|-0.02523593356947828|
|  21|-0.01524990535060063|
|  22|-0.01297500884698...|
|  23|-0.00771654586846...|
+----+--------------------+



                                                                                

In [37]:
from pyspark.sql import functions as F

corr_temp_by_hour = (
    hourly.groupBy("hour")
    .agg(F.corr("temperature_c", "trip_count").alias("corr_temp_demand"))
    .orderBy("hour")
)

corr_temp_by_hour.show(24)



+----+--------------------+
|hour|    corr_temp_demand|
+----+--------------------+
|   0|-0.11741423537548301|
|   1|-0.08575170270537397|
|   2|-0.07517682390424027|
|   3|-0.09420616501441334|
|   4|-0.10947054553328917|
|   5|-0.14428858563383706|
|   6|-0.12987747136837205|
|   7|-0.17379678900954273|
|   8| -0.1790834191883583|
|   9|-0.17754143206183912|
|  10|-0.20931151244879576|
|  11| -0.2114037983390135|
|  12| -0.2173263513849757|
|  13| -0.2097945434219647|
|  14|-0.21733744058062565|
|  15| -0.2551898374284977|
|  16| -0.2441009423356753|
|  17|-0.23881922366001618|
|  18| -0.2387754610822655|
|  19|-0.21549812128190618|
|  20|  -0.196877305174403|
|  21|-0.19654063418484388|
|  22|-0.19083448350072665|
|  23|-0.14891165825658267|
+----+--------------------+



                                                                                

Analysis of hourly correlations shows that rain has only a weak and inconsistent relationship with taxi demand, with values hovering near zero across all hours, suggesting that precipitation alone does not reliably increase or decrease citywide ridership. In contrast, colder temperatures show a more consistent negative correlation with demand, indicating that taxi use tends to rise as temperatures drop, likely because walking and other outdoor travel becomes less comfortable. Overall, temperature appears to be a more reliable driver of demand than rain, while precipitation’s impact is smaller and more context-dependent.

## Question 8: What are the best predictors of hourly taxi demand ?  ##

In [35]:
# Q8: Best predictors of hourly taxi demand using RandomForest feature importance
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Prepare hourly weather averaged features
hourly_weather_avg = weather_df.groupBy('weather_time') \
    .agg(avg('temperature_c').alias('avg_temp'), avg('rain_mm').alias('avg_rain'), avg('windspeed_kmh').alias('avg_wind')) \
    .withColumnRenamed('weather_time', 'pickup_hour')

ml_df = city_hourly.join(hourly_weather_avg, on='pickup_hour', how='left').na.fill(0)
ml_df = ml_df.withColumn('hour', hour(col('pickup_hour'))).withColumn('day_of_week', dayofweek(col('pickup_hour')))

assembler = VectorAssembler(inputCols=['hour','day_of_week','avg_temp','avg_rain','avg_wind'], outputCol='features')
rf = RandomForestRegressor(featuresCol='features', labelCol='trip_count', numTrees=50)
pipeline = Pipeline(stages=[assembler, rf])

train_df, test_df = ml_df.randomSplit([0.8,0.2], seed=42)
model = pipeline.fit(train_df)
preds = model.transform(test_df)

evaluator = RegressionEvaluator(labelCol='trip_count', predictionCol='prediction', metricName='rmse')
print('RMSE:', evaluator.evaluate(preds))

rf_model = model.stages[-1]
print('Feature importances:', rf_model.featureImportances)

RMSE: 4353.552061192032
Feature importances: (5,[0,1,2,3,4],[0.8271281941112725,0.06232991700883111,0.08382161646206047,0.004940132901301945,0.02178013951653414])


The model’s feature importance scores show that time-based variables are by far the strongest predictors, with hour of the day accounting for roughly 83% of the predictive power and day of week contributing another 6%. Weather-related features played a much smaller role: temperature showed a modest effect (~8%), while rain and wind contributed very little. This indicates that demand in New York City is primarily driven by predictable daily and weekly travel patterns rather than short-term weather changes, with weather factors influencing demand only marginally compared to people’s regular commuting schedules and activity cycles.

## Question 9: Forecast next-day demand using lag features + RandomForest ##

In [36]:
# Q9: Forecast next-day demand using lag features + RandomForest
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

city_hourly = city_hourly.orderBy('pickup_hour')
w = Window.orderBy('pickup_hour')

lagged = city_hourly.withColumn('lag_1', lag('trip_count',1).over(w)).withColumn('lag_24', lag('trip_count',24).over(w)).na.fill(0)
lagged = lagged.withColumn('hour', hour(col('pickup_hour'))).withColumn('day_of_week', dayofweek(col('pickup_hour')))

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

assembler = VectorAssembler(inputCols=['lag_1','lag_24','hour','day_of_week'], outputCol='features')
ml_lag = assembler.transform(lagged).select('features','trip_count')

train, test = ml_lag.randomSplit([0.8,0.2], seed=42)
rf2 = RandomForestRegressor(featuresCol='features', labelCol='trip_count', numTrees=50)
model2 = rf2.fit(train)
preds2 = model2.transform(test)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol='trip_count', predictionCol='prediction', metricName='rmse')
print('Forecast RMSE:', evaluator.evaluate(preds2))


25/12/03 19:58:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/03 19:58:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/03 19:58:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/03 19:58:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/03 19:58:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/03 19:58:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/03 1

Forecast RMSE: 1010.6088175852046


To explore whether we could forecast taxi demand one day ahead, we created a predictive model using lag features, meaning information from previous hours. We added two key inputs: the demand from one hour earlier (lag_1) and the demand from the same hour on the previous day (lag_24), along with the hour of day and day of week. Using a Random Forest regression model, we trained the model to predict future hourly demand based on these patterns. The model achieved an RMSE of approximately 1,010, which is significantly lower than the baseline RMSE from earlier models without lagged features. This suggests that the strongest predictors of tomorrow’s demand are simply yesterday’s demand at the same time and the general hourly routine of the city, meaning New York’s taxi patterns are highly repetitive and stable across days. As a result, short-term historical demand proves to be more useful for forecasting than standalone weather or timing variables.

## Q10 : Recommendations — compute zones with largest uplift and export ##

### Rain uplift ##

In [None]:
from pyspark.sql import functions as F

# average trips per hour by zone & rain
zone_hourly = combined_df.groupBy("PULocationID", "is_rainy") \
    .agg(F.count("*").alias("trips"), F.countDistinct("pickup_hour").alias("hours"))
zone_avg = zone_hourly.withColumn("avg_trips_per_hour", F.col("trips") / F.col("hours"))
# pivot to compare rain vs clear
zone_uplift = zone_avg.groupBy("PULocationID").pivot("is_rainy", [0,1]).agg(F.first("avg_trips_per_hour"))
zone_uplift = (
    zone_uplift
    .withColumnRenamed("0", "avg_clear")
    .withColumnRenamed("1", "avg_rain")
    .withColumn("pct_uplift", (F.col("avg_rain") - F.col("avg_clear")) / F.col("avg_clear") * 100)
    .orderBy("pct_uplift", ascending=False)
)
zone_uplift.show(20)



+------------+------------------+------------------+------------------+
|PULocationID|         avg_clear|          avg_rain|        pct_uplift|
+------------+------------------+------------------+------------------+
|         221|1.0280898876404494|1.3214285714285714|28.532396565183454|
|         194|1.5776137761377613|1.8721934369602764| 18.67248278876538|
|          70| 2.907264957264957|3.3336466165413534|14.666075006714205|
|           6|1.0555555555555556|1.1702127659574468|10.862262038073908|
|         185|1.3821656050955413|1.4803921568627452|7.1067136532032285|
|          57|1.0596491228070175|             1.125| 6.167218543046362|
|         159|1.8832212563555848|1.9955703211517164|  5.96579209251014|
|         253|1.0703703703703704|1.1296296296296295| 5.536332179930783|
|          27|1.0814814814814815|1.1363636363636365| 5.074719800747206|
|         154|1.0871287128712872|          1.140625|4.9208788706739455|
|          54|1.2660508083140878|1.3283132530120483| 4.917847237

                                                                                

### Temperature Uplift (Cold vs Mild) ###

In [39]:
from pyspark.sql import functions as F

combined_binned = combined_df.withColumn(
    "temp_bin",
    F.when(F.col("temperature_c") < 5, "Cold (<5°C)")
     .when(F.col("temperature_c") < 15, "Cool (5–15°C)")
     .when(F.col("temperature_c") < 25, "Mild (15–25°C)")
     .otherwise("Hot (>25°C)")
)
zone_temp = (
    combined_binned
    .groupBy("PULocationID", "temp_bin")
    .agg(
        F.count("*").alias("trips"),
        F.countDistinct("pickup_hour").alias("hours")
    )
)
zone_temp = zone_temp.withColumn(
    "avg_trips_per_hour",
    F.col("trips") / F.col("hours")
)
zone_temp_pivot = (
    zone_temp
    .groupBy("PULocationID")
    .pivot("temp_bin", ["Cold (<5°C)", "Cool (5–15°C)", "Mild (15–25°C)", "Hot (>25°C)"])
    .agg(F.first("avg_trips_per_hour"))
)

zone_temp_results = zone_temp_pivot.withColumn(
    "pct_uplift_cold_vs_mild",
    (F.col("Cold (<5°C)") - F.col("Mild (15–25°C)")) / F.col("Mild (15–25°C)") * 100
)
zone_temp_results.orderBy("pct_uplift_cold_vs_mild", ascending=False).show(20)


[Stage 356:>                                                        (0 + 8) / 9]

+------------+------------------+------------------+------------------+------------------+-----------------------+
|PULocationID|       Cold (<5°C)|     Cool (5–15°C)|    Mild (15–25°C)|       Hot (>25°C)|pct_uplift_cold_vs_mild|
+------------+------------------+------------------+------------------+------------------+-----------------------+
|         264| 96.77285318559557|58.525851506983585|59.374805598755835| 71.49525452976704|      62.98639163481048|
|         236|392.39142461964036|290.08999280057594|275.07100743399127|251.92532188841201|      42.65095703108676|
|         262|125.86181113265023| 90.52614935538799| 89.25824033220867| 67.87510841283608|      41.00861798777051|
|         238|195.20209828823855|143.17184216670557|138.89932381667919| 139.8017094017094|      40.53495216857093|
|         140|177.59850787510362|130.61513157894737| 126.4297583081571| 129.2345890410959|      40.47207734292186|
|         141|228.11077348066297|168.25508755324185| 162.9006555723651|174.02654

                                                                                

### Analysis ###

The uplift analysis highlights that weather influences taxi demand differently across the city’s pickup zones. A small number of zones show modest increases in demand during rain (up to roughly 28% uplift), suggesting these areas may rely more on taxis when conditions discourage walking or transit use. However, temperature appears to have a much stronger and more consistent impact: several zones show a 30–60% increase in demand during colder conditions compared to mild temperatures, indicating that cold weather is a more powerful driver of taxi usage than rain. These results suggest that demand shifts caused by weather are highly location-dependent, and that temperature-related effects are substantially more significant than precipitation when planning for fleet allocation or forecasting demand across NYC.