In [2]:
#import tensorflow as tf
import os
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

In [17]:
# Seed handed to the KMeans models as `random_state` so the clustering is repeatable.
# NOTE(review): this name is the same as the stdlib `random` module and would
# shadow it if that module were ever imported in this notebook.
random = 42

# Part 1: Data Preprocessing

In [3]:
# Locations of the input CSV files, expressed relative to a shared data directory
DATA_DIR = '../data'
test_data_path = f'{DATA_DIR}/test.csv'
train_data_path = f'{DATA_DIR}/train.csv'

In [4]:
# The training file holds over 55 million rows; only the first 5 million are read.
TRAIN_ROW_LIMIT = 5_000_000

test_data = pd.read_csv(filepath_or_buffer=test_data_path)
train_data = pd.read_csv(filepath_or_buffer=train_data_path, nrows=TRAIN_ROW_LIMIT)

In [5]:
# Bounding window used to decide whether a pickup lies inside NYC.
# Both ends are inclusive (the default behaviour of Series.between).
NYC_LONGITUDE_RANGE = (-74.27, -73.68)
NYC_LATITUDE_RANGE = (40.49, 40.92)


def valid_pickup_mask(frame):
    """Return a boolean mask of rows with at least one passenger and a pickup inside the NYC window."""
    return (
        (frame['passenger_count'] > 0) &
        frame['pickup_longitude'].between(*NYC_LONGITUDE_RANGE) &
        frame['pickup_latitude'].between(*NYC_LATITUDE_RANGE)
    )


# drop every row that has a missing value in any column
test_data = test_data.dropna()
train_data = train_data.dropna()

# Training rows must also have a fare above 2, which removes negative and zero
# fares as well as anything at or below that amount.
# NOTE(review): only the pickup coordinates are range-checked; dropoff
# coordinates outside the window are kept.
# .copy() gives each filtered frame its own data, so the in-place column
# assignments made in later cells do not raise SettingWithCopyWarning.
train_data = train_data.loc[
    (train_data['fare_amount'] > 2) & valid_pickup_mask(train_data)
].copy()
test_data = test_data.loc[valid_pickup_mask(test_data)].copy()

In [6]:
# parse the pickup timestamp strings into datetime values (test frame first, then train)
for frame in (test_data, train_data):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])

In [7]:
# Derive calendar features from the pickup timestamp on both frames.
# Maps each new column name to the `.dt` accessor attribute that supplies it.
datetime_parts = {
    'hour': 'hour',
    'day': 'day',
    'weekday': 'weekday',  # 0=Monday, 6=Sunday
    'month': 'month',
}

frames = [train_data, test_data]
for df in frames:
    for column_name, accessor_attr in datetime_parts.items():
        df[column_name] = getattr(df['pickup_datetime'].dt, accessor_attr)

train_data

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,weekday,month
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.841610,40.712278,1,17,15,0,6
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,16,5,1,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.761270,-73.991242,40.750562,2,0,18,3,8
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42+00:00,-73.987130,40.733143,-73.991567,40.758092,1,4,21,5,4
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,7,9,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,2011-01-24 21:33:44.0000003,16.5,2011-01-24 21:33:44+00:00,-74.003883,40.725772,-73.969391,40.800830,1,21,24,0,1
4999996,2013-10-11 12:12:00.000000118,9.0,2013-10-11 12:12:00+00:00,-73.995105,40.739897,-73.985217,40.731950,2,12,11,4,10
4999997,2014-12-06 23:04:28.0000002,10.5,2014-12-06 23:04:28+00:00,-73.981063,40.764125,-73.979259,40.781857,2,23,6,5,12
4999998,2015-05-30 19:01:24.0000004,10.0,2015-05-30 19:01:24+00:00,-73.965401,40.759140,-73.971886,40.750870,1,19,30,5,5


In [8]:
# calculate haversine distance
import math
def haversine_vectorized(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two sets of points via the haversine formula.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius_km : float, default 6371.0
        Radius of the sphere; the default is Earth's radius in kilometers.

    Returns
    -------
    Distance(s) in the same unit as ``radius_km``, computed element-wise
    over the (broadcast) inputs.
    """
    # convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # differences in coordinates
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1

    # Haversine formula
    a = np.sin(delta_lat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) return NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c

In [9]:
# add the pickup-to-dropoff haversine distance as a column on every frame
for df in frames:
    pickup_lat = df['pickup_latitude'].values
    pickup_lon = df['pickup_longitude'].values
    dropoff_lat = df['dropoff_latitude'].values
    dropoff_lon = df['dropoff_longitude'].values
    df['h_dist (km)'] = haversine_vectorized(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

In [10]:
# group into 5 clusters (for major boroughs: Bronx, Brooklyn, Manhattan, Queens, and Staten Island)
from sklearn.cluster import KMeans

In [32]:
# Apply K-means clustering to pickup and dropoff coordinates.
# Each model is fitted on the training rows only; the test rows are then
# labelled with the same fitted centroids so that both frames end up with the
# same feature columns (as with the datetime and distance features).
PICKUP_COORD_COLUMNS = ['pickup_latitude', 'pickup_longitude']
DROPOFF_COORD_COLUMNS = ['dropoff_latitude', 'dropoff_longitude']

kmeans_pickup = KMeans(n_clusters=5, random_state=random)
train_data['pickup_cluster'] = kmeans_pickup.fit_predict(train_data[PICKUP_COORD_COLUMNS])
test_data['pickup_cluster'] = kmeans_pickup.predict(test_data[PICKUP_COORD_COLUMNS])

kmeans_dropoff = KMeans(n_clusters=5, random_state=random)
train_data['dropoff_cluster'] = kmeans_dropoff.fit_predict(train_data[DROPOFF_COORD_COLUMNS])
test_data['dropoff_cluster'] = kmeans_dropoff.predict(test_data[DROPOFF_COORD_COLUMNS])

In [34]:
# list the distinct dropoff cluster labels, in order of first appearance
pd.unique(train_data['dropoff_cluster'])

array([0, 2, 4, 3, 1])

In [35]:
import folium
from folium.plugins import MarkerCluster

# Plot pickups (blue) and dropoffs (red) with their cluster labels on an NYC map.
# Only a random sample of the training rows is drawn: creating one marker per
# row for the whole frame is far too slow and too heavy to render.
MAP_SAMPLE_SIZE = 2000


def add_cluster_markers(points, lat_col, lon_col, cluster_col, label, color, target):
    """Add one marker per row of `points` to `target`.

    `lat_col`, `lon_col` and `cluster_col` name the columns to read; `label`
    is the prefix shown in the popup text and `color` is the icon color.
    """
    for lat, lon, cluster in zip(points[lat_col], points[lon_col], points[cluster_col]):
        folium.Marker(
            [lat, lon],
            popup=f'{label} Cluster: {cluster}',
            icon=folium.Icon(color=color),
        ).add_to(target)


# seeded so the same rows are drawn on every run
map_sample = train_data.sample(n=min(MAP_SAMPLE_SIZE, len(train_data)), random_state=random)

# Create a base map centered around NYC
nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=12,
                     tiles='CartoDB positron',
                     control_scale=True)

# Add a marker cluster to group points close together
marker_cluster = MarkerCluster().add_to(nyc_map)

add_cluster_markers(map_sample, 'pickup_latitude', 'pickup_longitude',
                    'pickup_cluster', 'Pickup', 'blue', marker_cluster)
add_cluster_markers(map_sample, 'dropoff_latitude', 'dropoff_longitude',
                    'dropoff_cluster', 'Dropoff', 'red', marker_cluster)

# Fit the view to a fixed bounding box (south-west and north-east corners)
nyc_map.fit_bounds([[40.4774, -74.2591], [40.9176, -73.7004]])

# Last expression of the cell, so the notebook renders the map inline
nyc_map


ModuleNotFoundError: No module named 'folium'