In [1]:
from pyspark.sql import SparkSession
import importlib.util
import os

# ========== LOAD CONFIG FIRST ==========
# config.py is loaded directly from its file path:
#   <parent of the current working directory>/src/config.py
src_path = os.path.join(os.path.split(os.getcwd())[0], 'src')
config_file = os.path.join(src_path, 'config.py')

# Build a module object from the file and execute it so its names become available
spec = importlib.util.spec_from_file_location("config", config_file)
config_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config_module)

# Expose the Config class under a short name for the rest of the notebook
Config = getattr(config_module, "Config")
print("âœ“ Config loaded")

# ========== STOP EXISTING SPARK ==========
# Re-running this cell should start from a fresh session. On a new kernel the
# name `spark` does not exist yet, which surfaces as NameError.
try:
    spark.stop()
    print("âœ“ Stopped existing Spark session")
except NameError:
    print("â„¹ No existing Spark session to stop")
except Exception as exc:
    # Best-effort cleanup: report the failure instead of mislabelling it as
    # "no session", and keep going. Unlike a bare `except:`, this does not
    # swallow KeyboardInterrupt or SystemExit.
    print(f"Warning: could not stop existing Spark session cleanly: {exc}")

# ========== CREATE SPARK SESSION ==========
print(f"Creating Spark session: {Config.APP_NAME}")

# Driver/executor sizing is taken from Config; Arrow is enabled for
# Spark <-> pandas conversion, and the hadoop-aws / AWS SDK bundle packages
# are requested via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName(Config.APP_NAME)
    .config("spark.driver.memory", Config.SPARK_DRIVER_MEMORY)
    .config("spark.executor.memory", Config.SPARK_EXECUTOR_MEMORY)
    .config("spark.executor.instances", Config.SPARK_EXECUTOR_INSTANCES)
    .config("spark.executor.cores", Config.SPARK_EXECUTOR_CORES)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262")
    .getOrCreate()
)

print(f"âœ“ Spark session created successfully (version {spark.version})")

# ========== CONFIGURE HADOOP FOR MINIO ==========
print("Configuring Hadoop for MinIO...")

# S3A settings are written onto the session's Hadoop configuration in the
# order listed: endpoint and credentials from Config, then path-style access,
# SSL off, and the S3A filesystem implementation class.
hadoop_conf = spark._jsc.hadoopConfiguration()
for _s3a_key, _s3a_value in (
    ("fs.s3a.endpoint", Config.MINIO_ENDPOINT),
    ("fs.s3a.access.key", Config.MINIO_ACCESS_KEY),
    ("fs.s3a.secret.key", Config.MINIO_SECRET_KEY),
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.connection.ssl.enabled", "false"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
):
    hadoop_conf.set(_s3a_key, _s3a_value)

# Status summary (credentials are deliberately not printed)
print(f"âœ“ Spark configured for environment: {Config.ENVIRONMENT}")
print(f"âœ“ Using MinIO endpoint: {Config.MINIO_ENDPOINT}")
print(f"âœ“ Reading from bucket: {Config.S3_BUCKET_NAME}")

print("\n" + "=" * 50)
print("âœ“ Spark Session Ready")
print("=" * 50)

# Display configuration
Config.display_config()

âœ“ Config loaded
â„¹ No existing Spark session to stop
Creating Spark session: NYC Taxi EDA
âœ“ Spark session created successfully (version 3.5.0)
Configuring Hadoop for MinIO...
âœ“ Spark configured for environment: development
âœ“ Using MinIO endpoint: http://minio:9000
âœ“ Reading from bucket: nyc-taxi

âœ“ Spark Session Ready
Current Configuration:
App Name: NYC Taxi EDA
Environment: development
MinIO Endpoint: http://minio:9000
S3 Bucket: nyc-taxi
Spark Driver Memory: 3g
Spark Executor Memory: 3g
Spark Executor Instances: 3
Spark Executor Cores: 2
Log Level: INFO
Log File: eda.log


### Nearby Hotspots

In [2]:
from pyspark.sql.functions import desc

# Read the taxi zone lookup CSV: first row is the header, column types are inferred
zone_lookup = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3a://nyc-taxi/taxi_zone_lookup.csv")
)

print("=== Zone Lookup Table Schema ===")
zone_lookup.printSchema()

print("\n=== Sample Zone Data ===")
zone_lookup.show(n=20, truncate=False)

print("\n=== Boroughs Available ===")
zone_lookup.select("Borough").distinct().show(truncate=False)

# Number of zones per borough, largest first
zone_lookup.groupBy("Borough").count().sort(desc("count")).show()

=== Zone Lookup Table Schema ===
root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)


=== Sample Zone Data ===
+----------+-------------+-----------------------+------------+
|LocationID|Borough      |Zone                   |service_zone|
+----------+-------------+-----------------------+------------+
|1         |EWR          |Newark Airport         |EWR         |
|2         |Queens       |Jamaica Bay            |Boro Zone   |
|3         |Bronx        |Allerton/Pelham Gardens|Boro Zone   |
|4         |Manhattan    |Alphabet City          |Yellow Zone |
|5         |Staten Island|Arden Heights          |Boro Zone   |
|6         |Staten Island|Arrochar/Fort Wadsworth|Boro Zone   |
|7         |Queens       |Astoria                |Boro Zone   |
|8         |Queens       |Astoria Park           |Boro Zone   |
|9         |Queens       |Auburndale             |Boro Zone   |
|10  

#### Two Borough values, `Unknown` and `N/A`, are placeholders rather than real boroughs, so we filter them out.

In [4]:
from pyspark.sql.functions import col, lit

# Borough labels that should not be kept
exclude_boroughs = ["Unknown", "N/A"]

# Zones whose Borough is not one of the excluded labels
df_clean_zones = zone_lookup.where(~col("Borough").isin(exclude_boroughs))

# LocationIDs of the retained zones, collected to the driver so they can be
# used as an isin() list against the trip data (df_tips1)
valid_ids = [
    zone_row["LocationID"]
    for zone_row in df_clean_zones.select("LocationID").collect()
]

df_tips1 = spark.read.parquet("s3a://nyc-taxi/Nearby_Hotspots_DF/")

# Keep only trips whose pickup AND drop-off zones are both in the clean list
df_tips_filtered = (
    df_tips1
    .where(col("PULocationID").isin(valid_ids))
    .where(col("DOLocationID").isin(valid_ids))
)

In [5]:
# Row count of df_tips_filtered as currently defined (count() is an action and runs a Spark job)
df_tips_filtered.count()

78879710

In [50]:
# Zones per borough in the cleaned lookup table, largest first.
# Uses `desc`, imported in the zone-lookup cell.
df_clean_zones.groupBy("Borough").count().orderBy(desc("count")).show()

+-------------+-----+
|      Borough|count|
+-------------+-----+
|       Queens|   69|
|    Manhattan|   69|
|     Brooklyn|   61|
|        Bronx|   43|
|Staten Island|   20|
|          EWR|    1|
+-------------+-----+



EWR (Newark Airport) appears as its own "borough" in the lookup table, but it is physically separated from the New York City boroughs by a significant distance and a state line. Combining it with a nearby borough such as Manhattan or Queens would create a "ghost hotspot" that a driver cannot realistically reach without a long, expensive trip.

Removing EWR is therefore a better choice for the taxi driver dashboard than merging it into another borough.

In [6]:
# Drop the EWR zone(s) from the clean zone table
df_clean_zones = df_clean_zones.where(~col("Borough").isin("EWR"))

# Rebuild the list of valid LocationIDs from the narrowed zone table
valid_ids = [
    zone_row["LocationID"]
    for zone_row in df_clean_zones.select("LocationID").collect()
]

# Re-apply the pickup/drop-off filter to the already-filtered trips using the updated IDs
df_tips_filtered = (
    df_tips_filtered
    .where(col("PULocationID").isin(valid_ids))
    .where(col("DOLocationID").isin(valid_ids))
)

In [7]:
# Row count of the unfiltered trip data read from Nearby_Hotspots_DF, for comparison
df_tips1.count()

79757391

In [8]:
# Row count of df_tips_filtered after the zone filters applied so far
df_tips_filtered.count()

78664614

#### Create Borough-Based Nearby Locations

In [9]:
from pyspark.sql.functions import col, count, avg, sum as spark_sum
from pyspark.sql.window import Window

# Pickup-side view of the zone lookup: LocationID is renamed so it can be
# joined on PULocationID, and Borough/Zone get a PU_ prefix
pickup_zone_info = zone_lookup.select(
    col("LocationID").alias("PULocationID"),
    col("Borough").alias("PU_Borough"),
    col("Zone").alias("PU_Zone"),
)

# Left join keeps every trip even if its pickup zone has no lookup entry
df_with_borough = df_tips_filtered.join(pickup_zone_info, on="PULocationID", how="left")

print("=== Trip Data with Borough Info ===")
df_with_borough.select(
    "PULocationID", "PU_Borough", "PU_Zone", "pickup_hour"
).show(20, truncate=False)

# Trips whose pickup zone did not match any lookup row end up with a null PU_Borough
print("\n=== Checking for Missing Borough Mappings ===")
missing_mappings = df_with_borough.where(col("PU_Borough").isNull())
print(f"Trips with missing borough: {missing_mappings.count()}")

=== Trip Data with Borough Info ===
+------------+----------+-----------------------------+-----------+
|PULocationID|PU_Borough|PU_Zone                      |pickup_hour|
+------------+----------+-----------------------------+-----------+
|186         |Manhattan |Penn Station/Madison Sq West |0          |
|79          |Manhattan |East Village                 |0          |
|43          |Manhattan |Central Park                 |0          |
|239         |Manhattan |Upper West Side South        |0          |
|166         |Manhattan |Morningside Heights          |0          |
|107         |Manhattan |Gramercy                     |0          |
|158         |Manhattan |Meatpacking/West Village West|0          |
|75          |Manhattan |East Harlem South            |0          |
|132         |Queens    |JFK Airport                  |0          |
|140         |Manhattan |Lenox Hill East              |0          |
|90          |Manhattan |Flatiron                     |0          |
|113        

#### Calculate Pickup Probabilities by Borough

In [10]:
# Per (pickup location, borough, hour): trip count plus average fare, distance and tip percent
pickups_by_location_hour = (
    df_with_borough
    .groupBy("PULocationID", "PU_Borough", "pickup_hour")
    .agg(
        count("*").alias("location_pickups"),
        avg("fare_amount").alias("avg_fare"),
        avg("trip_distance").alias("avg_distance"),
        avg("tip_percent").alias("avg_tip_percent"),
    )
)

# Per (borough, hour): total trip count, used as the denominator below
total_pickups_by_borough_hour = (
    df_with_borough
    .groupBy("PU_Borough", "pickup_hour")
    .agg(count("*").alias("total_borough_pickups"))
)

# pickup_probability_in_borough_pct answers: "Given that a pickup happens in
# this borough at this hour, what's the probability it happens at this
# specific location?" (location count / borough count, as a percentage)
location_probabilities_in_borough = (
    pickups_by_location_hour
    .join(total_pickups_by_borough_hour, on=["PU_Borough", "pickup_hour"], how="inner")
    .withColumn(
        "pickup_probability_in_borough_pct",
        col("location_pickups") / col("total_borough_pickups") * 100,
    )
    .select(
        "PULocationID",
        "PU_Borough",
        "pickup_hour",
        "location_pickups",
        "total_borough_pickups",
        "pickup_probability_in_borough_pct",
        "avg_fare",
        "avg_distance",
        "avg_tip_percent",
    )
)

print("=== Location Probabilities within Borough ===")
location_probabilities_in_borough.orderBy(
    "pickup_probability_in_borough_pct", ascending=False
).show(20, truncate=False)

=== Location Probabilities within Borough ===
+------------+-------------+-----------+----------------+---------------------+---------------------------------+------------------+------------------+-------------------+
|PULocationID|PU_Borough   |pickup_hour|location_pickups|total_borough_pickups|pickup_probability_in_borough_pct|avg_fare          |avg_distance      |avg_tip_percent    |
+------------+-------------+-----------+----------------+---------------------+---------------------------------+------------------+------------------+-------------------+
|23          |Staten Island|6          |100             |123                  |81.30081300813008                |63.462000000000025|24.0681           |0.22987390882638217|
|214         |Staten Island|5          |114             |150                  |76.0                             |60.78333333333333 |19.520263157894735|1.609034088288205  |
|132         |Queens       |6          |90804           |119999               |75.670630588588

In [11]:
# Persist the probability table as Parquet, overwriting any existing output at this path
location_probabilities_in_borough.write.parquet(
    "s3a://nyc-taxi/location_probabilities_in_borough",
    mode="overwrite",
)

#### Create Nearby Hotspots Based on Same Borough

In [57]:
def get_nearby_hotspots_by_borough(current_location, current_hour, top_n=5):
    """
    Find top hotspots in the SAME BOROUGH as the current location.

    Args:
        current_location: LocationID of the zone the driver is currently in.
        current_hour: Value matched against the ``pickup_hour`` column.
        top_n: Maximum number of hotspot rows to return (default 5).

    Returns:
        A Spark DataFrame with at most ``top_n`` rows of the saved
        location-probability table for the same borough and hour, excluding
        ``current_location``, ordered by ``pickup_probability_in_borough_pct``
        descending. Returns ``None`` (after printing a message) when
        ``current_location`` is not present in the zone lookup table.
    """
    # Load zone lookup
    zone_lookup = spark.read.csv("s3a://nyc-taxi/taxi_zone_lookup.csv", header=True, inferSchema=True)

    # Load location probabilities
    location_probs = spark.read.parquet("s3a://nyc-taxi/location_probabilities_in_borough/")

    # Get the borough of the current location
    current_borough = (
        zone_lookup
        .filter(col("LocationID") == current_location)
        .select("Borough")
        .first()
    )

    if current_borough is None:
        print(f"Location {current_location} not found in zone lookup")
        return None

    current_borough_name = current_borough["Borough"]
    print(f"Current Borough: {current_borough_name}")

    # Apply every row filter (borough, hour, not the current zone) BEFORE
    # sorting, so that orderBy() is immediately followed by limit(). This
    # mirrors the other hotspot helpers and does not rely on sort order
    # surviving a later filter.
    same_borough_hotspots = location_probs.filter(
        (col("PU_Borough") == current_borough_name)
        & (col("pickup_hour") == current_hour)
        & (col("PULocationID") != current_location)
    )

    # Highest probability first; PULocationID breaks ties so repeated calls
    # return the same rows.
    top_hotspots = same_borough_hotspots.orderBy(
        col("pickup_probability_in_borough_pct").desc(),
        col("PULocationID").asc(),
    ).limit(top_n)

    return top_hotspots

# Test the function
print("\n=== Nearby Hotspots (Same Borough) ===")
print("Driver at Location 161, Hour 18 (6 PM)\n")

hotspots = get_nearby_hotspots_by_borough(current_location=161, current_hour=18, top_n=5)
# The function returns None for an unknown location. Compare against None
# explicitly instead of relying on the truthiness of a DataFrame object.
if hotspots is not None:
    hotspots.show(truncate=False)


=== Nearby Hotspots (Same Borough) ===
Driver at Location 161, Hour 18 (6 PM)

Current Borough: Manhattan
+------------+----------+-----------+----------------+---------------------+---------------------------------+------------------+------------------+------------------+
|PULocationID|PU_Borough|pickup_hour|location_pickups|total_borough_pickups|pickup_probability_in_borough_pct|avg_fare          |avg_distance      |avg_tip_percent   |
+------------+----------+-----------+----------------+---------------------+---------------------------------+------------------+------------------+------------------+
|237         |Manhattan |18         |306718          |5294914              |5.792690872788491                |12.772591631400843|1.713306294381158 |28.725783173835083|
|162         |Manhattan |18         |269887          |5294914              |5.097098838621364                |15.01047727382199 |2.0333778951931736|26.75317617408619 |
|236         |Manhattan |18         |265696          

In [21]:
def get_enhanced_nearby_hotspots(current_location, current_hour, top_n=5):
    """
    Same-borough hotspot lookup that also reports zone names.

    Prints the current zone/borough and hour, then returns a Spark DataFrame
    of at most ``top_n`` other zones in that borough for ``current_hour``,
    ordered by ``pickup_probability_in_borough_pct`` descending. Returns
    ``None`` (after printing a message) if ``current_location`` is not in the
    zone lookup table.
    """
    # Source tables: zone lookup CSV and the saved probability table
    zones = spark.read.csv("s3a://nyc-taxi/taxi_zone_lookup.csv", header=True, inferSchema=True)
    probs = spark.read.parquet("s3a://nyc-taxi/location_probabilities_in_borough/")

    # Lookup row for the driver's current zone
    here = zones.filter(col("LocationID") == current_location).first()
    if here is None:
        print(f"Location {current_location} not found")
        return None

    borough, zone = here["Borough"], here["Zone"]

    print(f"Current Location: {zone} ({borough})")
    print(f"Current Time: {current_hour}:00\n")

    # Candidate rows: same borough, same hour, any zone other than the current one
    in_borough = col("PU_Borough") == borough
    at_hour = col("pickup_hour") == current_hour
    elsewhere = col("PULocationID") != current_location
    candidates = probs.filter(in_borough & at_hour & elsewhere)

    # Zone names keyed by PULocationID so they can be attached to the candidates
    zone_names = zones.select(
        col("LocationID").alias("PULocationID"),
        col("Zone").alias("zone_name"),
    )
    named = candidates.join(zone_names, "PULocationID", "left")

    output_columns = [
        "PULocationID",
        "zone_name",
        "location_pickups",
        "pickup_probability_in_borough_pct",
        "avg_fare",
        "avg_distance",
        "avg_tip_percent",
    ]
    ranked = named.select(*output_columns).orderBy(
        col("pickup_probability_in_borough_pct").desc()
    )
    return ranked.limit(top_n)

# Test enhanced version
print("=== Enhanced Nearby Hotspots ===\n")
hotspots = get_enhanced_nearby_hotspots(current_location=161, current_hour=18, top_n=5)
# None signals an unknown location. Test for it explicitly instead of relying
# on the truthiness of a DataFrame object.
if hotspots is not None:
    hotspots.show(truncate=False)

=== Enhanced Nearby Hotspots ===

Current Location: Midtown Center (Manhattan)
Current Time: 18:00

+------------+---------------------+----------------+---------------------------------+------------------+------------------+------------------+
|PULocationID|zone_name            |location_pickups|pickup_probability_in_borough_pct|avg_fare          |avg_distance      |avg_tip_percent   |
+------------+---------------------+----------------+---------------------------------+------------------+------------------+------------------+
|237         |Upper East Side South|306718          |5.792690872788491                |12.772591631400843|1.713306294381158 |28.725783173835083|
|162         |Midtown East         |269887          |5.097098838621364                |15.01047727382199 |2.0333778951931736|26.75317617408619 |
|236         |Upper East Side North|265696          |5.017947411421602                |12.928201628929326|1.8291468068770318|28.730728518766   |
|163         |Midtown North   

In [25]:
# Preview the in-memory probability table (show() with default arguments)
location_probabilities_in_borough.show()

+------------+----------+-----------+----------------+---------------------+---------------------------------+------------------+------------------+-------------------+
|PULocationID|PU_Borough|pickup_hour|location_pickups|total_borough_pickups|pickup_probability_in_borough_pct|          avg_fare|      avg_distance|    avg_tip_percent|
+------------+----------+-----------+----------------+---------------------+---------------------------------+------------------+------------------+-------------------+
|          67|  Brooklyn|         13|             101|                38478|               0.2624876552835386| 31.96237623762376| 7.106435643564355|                0.0|
|          77|  Brooklyn|         13|             513|                38478|               1.3332293778262905|31.418771929824562|7.1061793372319695| 0.2094647179090035|
|         225|  Brooklyn|         13|            1066|                38478|               2.7704142626955663|28.867270168855534| 6.195712945590995| 0.4878

In [26]:
def get_normalized_probabilities(current_location, current_hour, top_n=5):
    """
    Get top N same-borough hotspots with probabilities normalized to sum to 100%.

    Args:
        current_location: LocationID of the zone the driver is currently in.
        current_hour: Value matched against the ``pickup_hour`` column.
        top_n: Maximum number of hotspots to return (default 5).

    Returns:
        A pandas DataFrame, ordered from highest to lowest probability, with
        columns 'Location ID', 'Zone Name', 'Probability (%)', 'Pickups/Hour',
        'Avg Fare ($)' and 'Avg Tip (%)', rounded to 2 decimals.
        'Probability (%)' is each row's ``pickup_probability_in_borough_pct``
        rescaled so the returned rows sum to 100. 'Pickups/Hour' carries the
        stored ``location_pickups`` value unchanged. Returns ``None`` if the
        location is not in the zone lookup table or no rows match.
    """
    # Load data
    zone_lookup = spark.read.csv("s3a://nyc-taxi/taxi_zone_lookup.csv", header=True, inferSchema=True)
    location_probs = spark.read.parquet("s3a://nyc-taxi/location_probabilities_in_borough/")

    # Get current borough
    current_zone_info = zone_lookup.filter(col("LocationID") == current_location).first()

    if current_zone_info is None:
        return None

    current_borough = current_zone_info["Borough"]

    # Candidate hotspots: same borough and hour, excluding the current zone
    candidates = location_probs.filter(
        (col("PU_Borough") == current_borough) &
        (col("pickup_hour") == current_hour) &
        (col("PULocationID") != current_location)
    )

    zone_names = zone_lookup.select(
        col("LocationID").alias("PULocationID"),
        col("Zone").alias("zone_name")
    )

    # Attach zone names first and make orderBy().limit() the final steps.
    # A join placed after orderBy().limit() is not guaranteed to keep the
    # sorted row order, and callers rank the rows by position.
    hotspots_with_names = (
        candidates
        .join(zone_names, "PULocationID", "left")
        .orderBy(col("pickup_probability_in_borough_pct").desc())
        .limit(top_n)
    )

    # Convert to pandas for normalization
    hotspots_pd = hotspots_with_names.toPandas()

    if len(hotspots_pd) == 0:
        return None

    # Normalize probabilities to sum to 100% across the returned rows
    total_prob = hotspots_pd['pickup_probability_in_borough_pct'].sum()
    hotspots_pd['normalized_probability_pct'] = (
        hotspots_pd['pickup_probability_in_borough_pct'] / total_prob * 100
    )

    # Format for display
    result = hotspots_pd[[
        'PULocationID',
        'zone_name',
        'normalized_probability_pct',
        'location_pickups',
        'avg_fare',
        'avg_tip_percent'
    ]].round(2)

    result.columns = [
        'Location ID',
        'Zone Name',
        'Probability (%)',
        'Pickups/Hour',
        'Avg Fare ($)',
        'Avg Tip (%)'
    ]

    return result

# Test normalized probabilities
print(
    "\n=== Normalized Probability Recommendations ===",
    "Driver at Location 161, Hour 18 (6 PM)\n",
    sep="\n",
)

recommendations = get_normalized_probabilities(current_location=161, current_hour=18, top_n=5)
# None means the location was unknown or no hotspots matched
if recommendations is not None:
    print(
        recommendations.to_string(index=False),
        f"\nTotal Probability: {recommendations['Probability (%)'].sum():.1f}%",
        sep="\n",
    )


=== Normalized Probability Recommendations ===
Driver at Location 161, Hour 18 (6 PM)

 Location ID             Zone Name  Probability (%)  Pickups/Hour  Avg Fare ($)  Avg Tip (%)
         237 Upper East Side South            24.47        306718         12.77        28.73
         162          Midtown East            21.53        269887         15.01        26.75
         236 Upper East Side North            21.20        265696         12.93        28.73
         163         Midtown North            16.80        210563         14.72        27.27
         142   Lincoln Square East            16.01        200691         13.21        28.45

Total Probability: 100.0%


In [27]:
def driver_dashboard_by_borough(current_location, current_hour, top_n=5):
    """
    Print a text dashboard of nearby hotspots within the driver's current borough.

    Args:
        current_location: LocationID of the zone the driver is currently in.
        current_hour: Hour of day, shown as both 24-hour and 12-hour time and
            passed on to ``get_normalized_probabilities``.
        top_n: Maximum number of hotspots to list (default 5).

    Returns:
        None. All output is printed. If ``current_location`` is not in the
        zone lookup table, a warning is printed and nothing else is shown.
    """
    # Load data
    zone_lookup = spark.read.csv("s3a://nyc-taxi/taxi_zone_lookup.csv", header=True, inferSchema=True)

    # Get current location details
    current_info = zone_lookup.filter(col("LocationID") == current_location).first()

    if current_info is None:
        print(f"âš  Location {current_location} not found in zone lookup table")
        return

    current_borough = current_info["Borough"]
    current_zone = current_info["Zone"]

    print(f"\n{'='*80}")
    print(f"DRIVER HOTSPOT RECOMMENDATIONS (Borough-Based)")
    print(f"{'='*80}")
    print(f"Your Location: {current_zone} (Zone ID: {current_location})")
    print(f"Borough: {current_borough}")
    print(f"Time: {current_hour}:00 ({current_hour % 12 or 12} {'PM' if current_hour >= 12 else 'AM'})")
    print(f"{'='*80}\n")

    # Get recommendations
    recommendations = get_normalized_probabilities(current_location, current_hour, top_n)

    if recommendations is not None and len(recommendations) > 0:
        print(f"TOP {len(recommendations)} HOTSPOTS IN {current_borough.upper()}:\n")

        # Number the rows by position rather than by DataFrame index label, so
        # the ranks are 1..N regardless of how the frame is indexed.
        for rank, (_, row) in enumerate(recommendations.iterrows(), start=1):
            print(f"{rank}. {row['Zone Name']} (ID: {int(row['Location ID'])})")
            print(f"   ðŸ“Š Probability: {row['Probability (%)']:.1f}% of finding next ride here")
            # 'Pickups/Hour' holds the pickup count accumulated over the whole
            # dataset for this hour of day, so it is reported as a historical
            # total rather than as an hourly rate.
            print(f"   ðŸš• Historical Activity: {int(row['Pickups/Hour'])} pickups recorded in this hour slot")
            print(f"   ðŸ’° Average Fare: ${row['Avg Fare ($)']:.2f}")
            print(f"   ðŸ’µ Average Tip: {row['Avg Tip (%)']:.1f}%")
            print()

        # The percentages are rescaled to sum to 100 across the listed zones,
        # so they describe relative ranking, not a share of all borough pickups.
        print(f"âœ… Probabilities are normalized across these {len(recommendations)} listed locations")
        print(f"   (relative ranking within {current_borough}, not a share of all borough pickups)")
    else:
        print(f"âš  No recommendations available for {current_borough} at this time")
        print("   Consider checking a different hour or nearby borough")

    print(f"\n{'='*80}")

# Test the complete dashboard for the same zone at an evening and a morning hour
for _test_hour in (18, 8):
    driver_dashboard_by_borough(current_location=161, current_hour=_test_hour, top_n=5)


DRIVER HOTSPOT RECOMMENDATIONS (Borough-Based)
Your Location: Midtown Center (Zone ID: 161)
Borough: Manhattan
Time: 18:00 (6 PM)

TOP 5 HOTSPOTS IN MANHATTAN:

1. Upper East Side South (ID: 237)
   ðŸ“Š Probability: 24.5% of finding next ride here
   ðŸš• Expected Activity: ~306718 pickups/hour
   ðŸ’° Average Fare: $12.77
   ðŸ’µ Average Tip: 28.7%

2. Midtown East (ID: 162)
   ðŸ“Š Probability: 21.5% of finding next ride here
   ðŸš• Expected Activity: ~269887 pickups/hour
   ðŸ’° Average Fare: $15.01
   ðŸ’µ Average Tip: 26.8%

3. Upper East Side North (ID: 236)
   ðŸ“Š Probability: 21.2% of finding next ride here
   ðŸš• Expected Activity: ~265696 pickups/hour
   ðŸ’° Average Fare: $12.93
   ðŸ’µ Average Tip: 28.7%

4. Midtown North (ID: 163)
   ðŸ“Š Probability: 16.8% of finding next ride here
   ðŸš• Expected Activity: ~210563 pickups/hour
   ðŸ’° Average Fare: $14.72
   ðŸ’µ Average Tip: 27.3%

5. Lincoln Square East (ID: 142)
   ðŸ“Š Probability: 16.0% of finding next ride he