In [1]:
!mkdir -p data/uber_raw
!unzip -j "*.zip" uber-raw-data-apr14.csv uber-raw-data-may14.csv uber-raw-data-jun14.csv uber-raw-data-jul14.csv uber-raw-data-aug14.csv uber-raw-data-sep14.csv -d data/uber_raw

Archive:  uber-raw-data-aug14.csv.zip
  inflating: data/uber_raw/uber-raw-data-aug14.csv  
caution: filename not matched:  uber-raw-data-apr14.csv
caution: filename not matched:  uber-raw-data-may14.csv
caution: filename not matched:  uber-raw-data-jun14.csv
caution: filename not matched:  uber-raw-data-jul14.csv
caution: filename not matched:  uber-raw-data-sep14.csv

Archive:  uber-raw-data-may14.csv.zip
  inflating: data/uber_raw/uber-raw-data-may14.csv  
caution: filename not matched:  uber-raw-data-apr14.csv
caution: filename not matched:  uber-raw-data-jun14.csv
caution: filename not matched:  uber-raw-data-jul14.csv
caution: filename not matched:  uber-raw-data-aug14.csv
caution: filename not matched:  uber-raw-data-sep14.csv

Archive:  uber-raw-data-sep14.csv.zip
  inflating: data/uber_raw/uber-raw-data-sep14.csv  
caution: filename not matched:  uber-raw-data-apr14.csv
caution: filename not matched:  uber-raw-data-may14.csv
caution: filename not matched:  uber-raw-data-jun14.c

In [2]:
import os
import pandas as pd
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.models import HoverTool

In [3]:
# Months to load; each one maps to <base_path>/uber-raw-data-<month>.csv.
months = ["apr14", "may14", "jun14"]
base_path = "data/uber_raw"

# 1) Read each monthly CSV, keeping only the four listed columns and parsing
#    "Date/Time" into datetimes at load time.
#    `infer_datetime_format` is intentionally not passed: it is deprecated since
#    pandas 2.0 (a consistent format is inferred by default) and passing it only
#    produces one warning per file.
dfs = []
for m in months:
    fname = f"uber-raw-data-{m}.csv"
    path = os.path.join(base_path, fname)
    df = pd.read_csv(
        path,
        usecols=["Date/Time", "Lat", "Lon", "Base"],
        parse_dates=["Date/Time"],
    )
    dfs.append(df)

# 2) Concatenate into one DataFrame with a fresh 0..n-1 index
full_df = pd.concat(dfs, ignore_index=True)

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


In [4]:
# Add a "timestamp" column that floors each pickup time to the start of its
# 30-minute bin, then count the pickups in every bin ("value").
# "30min" is used instead of the deprecated "30T" alias, which emits a
# FutureWarning on recent pandas versions.
full_df["timestamp"] = full_df["Date/Time"].dt.floor("30min")
df = (
    full_df.groupby("timestamp")
    .size()
    .rename("value")
    .reset_index()
)

  full_df["timestamp"] = full_df["Date/Time"].dt.floor("30T")


In [5]:
# Preview the first ten rows of the per-bin pickup counts.
df.head(10)

Unnamed: 0,timestamp,value
0,2014-04-01 00:00:00,70
1,2014-04-01 00:30:00,68
2,2014-04-01 01:00:00,39
3,2014-04-01 01:30:00,27
4,2014-04-01 02:00:00,28
5,2014-04-01 02:30:00,25
6,2014-04-01 03:00:00,39
7,2014-04-01 03:30:00,54
8,2014-04-01 04:00:00,75
9,2014-04-01 04:30:00,91


In [6]:
def overview(df: pd.DataFrame, timestamp_col: str = None) -> None:
    """Print a quick health report for a DataFrame.

    Always prints the per-column null counts and the column dtypes. When
    `timestamp_col` is given, also prints the earliest and latest value of that
    column and the difference between them.

    Args:
        df: Frame to summarise.
        timestamp_col: Name of the column whose min/max span is reported;
            the date-range section is skipped when this is None.
    """
    missing_per_column = df.isnull().sum()
    print('Null Count:\n', missing_per_column, '\n')
    print('Data Types:\n', df.dtypes)

    # Nothing more to report without a timestamp column.
    if timestamp_col is None:
        return

    stamps = df[timestamp_col]
    first_seen = stamps.min()
    last_seen = stamps.max()
    print('\nDate Range:\n\nStart:\t', first_seen)
    print('End:\t', last_seen)
    print('Days:\t', (last_seen - first_seen))

In [7]:
# Report null counts, dtypes and the covered time span of the binned counts.
overview(df, timestamp_col='timestamp')

Null Count:
 timestamp    0
value        0
dtype: int64 

Data Types:
 timestamp    datetime64[ns]
value                 int64
dtype: object

Date Range:

Start:	 2014-04-01 00:00:00
End:	 2014-06-30 23:30:00
Days:	 90 days 23:30:00


In [8]:
def _demand_curve(freq: str, period: str, xlabel: str = ""):
    """Return a Curve of the mean 30-minute pickup count per `freq` resampling bin.

    Args:
        freq: pandas offset alias passed to `resample` (e.g. 'h', 'D', 'W').
        period: Word appended to the plot title ("Hourly", "Daily", ...).
        xlabel: Label for the x axis; empty by default.
    """
    resampled = df.set_index('timestamp').resample(freq).mean()
    return hv.Curve(resampled).opts(
        opts.Curve(title=f"New York City Uber Demand {period}", xlabel=xlabel, ylabel="Demand",
                   width=700, height=300, tools=['hover'], show_grid=True))


# The data are Uber pickups, so the titles say "Uber" rather than "Taxi".
# 'h' is used instead of the deprecated 'H' hourly alias (FutureWarning on recent pandas).
Hourly = _demand_curve('h', "Hourly")
Daily = _demand_curve('D', "Daily")
Weekly = _demand_curve('W', "Weekly", xlabel="Date")

# Stack the three views in one column, each with its own axes.
(Hourly + Daily + Weekly).opts(shared_axes=False).cols(1)

  Hourly = hv.Curve(df.set_index('timestamp').resample('H').mean()).opts(


In [9]:
# Density plot of the per-bin pickup counts over the whole period.
value_density_opts = opts.Distribution(
    title="Overall Value Distribution",
    xlabel="Value",
    ylabel="Density",
    width=700,
    height=300,
    tools=['hover'],
    show_grid=True,
)
hv.Distribution(df['value']).opts(value_density_opts)

In [10]:
# Single box plot of all per-bin pickup counts (no grouping dimension),
# drawn with inverted axes.
value_box_opts = opts.BoxWhisker(
    title="Overall Value Boxplot",
    ylabel="Value",
    width=700,
    height=300,
    tools=['hover'],
    show_grid=True,
    invert_axes=True,
    outlier_radius=6,
)
hv.BoxWhisker(df, kdims=[], vdims='value').opts(value_box_opts)

# **Feature Engineering**

In [11]:
# A variety of resamples which I may or may not use.
# Each frame holds the mean of the 30-minute pickup counts per hour / day / week.
# 'h' is used instead of the deprecated 'H' hourly alias (FutureWarning on recent pandas).
_by_timestamp = df.set_index('timestamp')  # index once, resample three times
df_hourly = _by_timestamp.resample('h').mean().reset_index()
df_daily = _by_timestamp.resample('D').mean().reset_index()
df_weekly = _by_timestamp.resample('W').mean().reset_index()

  df_hourly = df.set_index('timestamp').resample('H').mean().reset_index()


In [12]:
# New features
# Add calendar, lag and rolling-mean columns to the hourly and daily frames in place.
WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

for frame in (df_hourly, df_daily):
    # day_name() returns English names regardless of the system locale, so they
    # always match WEEKDAY_ORDER (strftime('%A') would not under a non-English locale).
    frame['Weekday'] = pd.Categorical(frame['timestamp'].dt.day_name(), categories=WEEKDAY_ORDER)
    frame['Hour'] = frame['timestamp'].dt.hour
    frame['Day'] = frame['timestamp'].dt.weekday      # 0=Monday ... 6=Sunday
    frame['Month'] = frame['timestamp'].dt.month
    frame['Year'] = frame['timestamp'].dt.year
    frame['Month_day'] = frame['timestamp'].dt.day
    frame['Lag'] = frame['value'].shift(1)            # previous period's value
    frame['Rolling_Mean'] = frame['value'].rolling(7, min_periods=1).mean()
    # Drop incomplete rows (e.g. the first row, which has no Lag) from the frame
    # itself. The previous `DataFrame = DataFrame.dropna()` only rebound the loop
    # variable and left df_hourly / df_daily untouched.
    # NOTE: Lag is recomputed from the already-trimmed frame, so re-running this
    # cell without re-running the resampling cell first drops one more leading row.
    frame.dropna(inplace=True)


In [13]:
# Mean demand for every (hour-of-day, weekday) pair, pivoted to one column per weekday.
# 'value' is selected before aggregating so no other column is averaged, and
# observed=False states the current behaviour for the categorical 'Weekday' key
# explicitly, which silences the pandas FutureWarning about the changing default.
by_weekday = (
    df_hourly.groupby(['Hour', 'Weekday'], observed=False)['value']
    .mean()
    .unstack()
)

# One density per weekday, overlaid in weekday order. The options are applied to
# the whole overlay; before, operator precedence attached the title to the Sunday
# element only. The y axis of a density plot is a density, hence ylabel="Density".
plot = hv.Overlay([hv.Distribution(by_weekday[day], label=str(day)) for day in by_weekday.columns])
plot.opts(opts.Distribution(title="Demand Density by Day & Hour", width=800, height=300,
                            tools=['hover'], show_grid=True, ylabel="Density", xlabel="Demand"))

  by_weekday = df_hourly.groupby(['Hour','Weekday']).mean()['value'].unstack()


In [14]:
# Mean hourly demand per weekday. observed=False states the current behaviour for
# the categorical 'Weekday' key explicitly and silences the pandas FutureWarning.
# The data are Uber pickups, so the title says "Uber" rather than "Taxi".
hv.Bars(df_hourly[['value', 'Weekday']].groupby('Weekday', observed=False).mean()).opts(
    opts.Bars(title="New York City Uber Demand by Day", xlabel="", ylabel="Demand",
              width=700, height=300, tools=['hover'], show_grid=True))

  hv.Bars(df_hourly[['value','Weekday']].groupby('Weekday').mean()).opts(


In [15]:
# Mean demand for each hour of the day (0-23), averaged over all days.
# Titled "Uber ... by Hour": the data are Uber pickups, and the old title was
# identical to the one used for the hourly time-series plot.
hv.Curve(df_hourly[['value', 'Hour']].groupby('Hour').mean()).opts(
    opts.Curve(title="New York City Uber Demand by Hour", xlabel="Hour", ylabel="Demand",
               width=700, height=300, tools=['hover'], show_grid=True))

In [16]:
# Mean demand for every (hour-of-day, weekday) pair, pivoted to one column per weekday.
# 'value' is selected before aggregating so no other column is averaged, and
# observed=False states the current behaviour for the categorical 'Weekday' key
# explicitly, which silences the pandas FutureWarning about the changing default.
by_weekday = (
    df_hourly.groupby(['Hour', 'Weekday'], observed=False)['value']
    .mean()
    .unstack()
)

# One hour-of-day curve per weekday, overlaid in weekday order. The options are
# applied to the whole overlay; before, operator precedence attached the title to
# the Sunday curve only.
plot = hv.Overlay([hv.Curve(by_weekday[day], label=str(day)) for day in by_weekday.columns])
plot.opts(opts.Curve(title="Average Demand by Day & Hour", width=800, height=300,
                     tools=['hover'], show_grid=True, ylabel="Demand"))

  by_weekday = df_hourly.groupby(['Hour','Weekday']).mean()['value'].unstack()


In [17]:
import calendar
import datetime

def compare_weekday(df, weekday):
    """Compare the average hourly profile of one weekday with its busiest date.

    Args:
        df: Frame with 'Day', 'Hour', 'value', 'Year', 'Month' and 'Month_day' columns.
        weekday: Value matched against the 'Day' column.

    Returns:
        Tuple `(avg, best, best_date)`: the mean 'value' per hour over all
        matching rows, the mean 'value' per hour on the date with the largest
        summed 'value', and that date as a `datetime.date`. Both series are
        reindexed to hours 0-23, so hours without data are NaN.

    Raises:
        ValueError: If no row has 'Day' equal to `weekday`.
    """
    day_rows = df.loc[df['Day'] == weekday].copy()
    if day_rows.empty:
        raise ValueError(f"No data for weekday {weekday}")

    all_hours = pd.Index(range(24), name='Hour')
    hourly_mean = day_rows.groupby('Hour')['value'].mean()

    # Total per calendar date; the first largest total marks the busiest date.
    daily_totals = day_rows.groupby(['Year', 'Month', 'Month_day'])['value'].sum()
    year, month, month_day = daily_totals.idxmax()
    best_date = datetime.date(year, month, month_day)

    # Assemble a real date column so the rows of the busiest date can be selected.
    date_parts = day_rows[['Year', 'Month', 'Month_day']].rename(columns={'Month_day': 'day'})
    day_rows['date'] = pd.to_datetime(date_parts)
    on_best_date = day_rows['date'].dt.date == best_date
    best_mean = day_rows[on_best_date].groupby('Hour')['value'].mean()

    return hourly_mean.reindex(all_hours), best_mean.reindex(all_hours), best_date

# Choose the weekday to inspect (0=Monday, ..., 6=Sunday); e.g. 5 for Saturday.
weekday = 0
avg_series, best_series, best_date = compare_weekday(df_hourly, weekday)
weekday_name = calendar.day_name[weekday]

# Overlay the average profile with the busiest date's profile. All options are
# applied to the overlay in one call and the result is kept, so the title no
# longer hangs off the second curve only and nothing relies on in-place mutation.
avg_max_comparison = (
    hv.Curve(avg_series, label=f'Average {weekday_name}')
    * hv.Curve(best_series, label=f'Busiest {weekday_name}')
).opts(opts.Curve(title=f"Average {weekday_name} vs Busiest {weekday_name}",
                  width=800, height=300, tools=['hover'], show_grid=True, ylabel="Demand"))

display(avg_max_comparison)
print(f"Busiest {weekday_name} date: {best_date}")

Busiest Monday date: 2014-06-09


In [18]:
import folium
from folium.plugins import HeatMap

def plot_geojson_filtered(df, date_filter, hour_filter):
    """
    Build a Folium heatmap of the rows that fall on one date and hour.

    Despite its name, the function draws a heatmap layer, not GeoJSON. The
    input frame is neither copied as a whole nor modified.

    Args:
        df (pd.DataFrame): The input DataFrame with 'Date/Time', 'Lat' and 'Lon' columns.
        date_filter (str): A date string in 'YYYY-MM-DD' format to filter by.
        hour_filter (int): An hour (0-23) to filter by.

    Returns:
        folium.Map: A map centered on the mean location of the matching points
        with a heatmap layer, or a default map with a single "No data found
        here" marker when nothing matches.
    """
    # Parse the timestamps into a standalone Series instead of copying the frame.
    timestamps = pd.to_datetime(df['Date/Time'])
    target_date = pd.to_datetime(date_filter).date()
    in_window = (timestamps.dt.date == target_date) & (timestamps.dt.hour == hour_filter)

    # Keep only the coordinates; rows with a missing coordinate are dropped
    # because HeatMap rejects NaN values.
    points = df.loc[in_window, ['Lat', 'Lon']].dropna()

    if points.empty:
        print(f"No data points found for date: {date_filter} and hour: {hour_filter}")
        # Create a basic map even if empty
        m = folium.Map(location=[40.75, -73.95], zoom_start=12)  # Default NYC center
        folium.Marker(location=[40.75, -73.95], popup="No data found here").add_to(m)
        return m

    # Center the map on the mean location of the filtered points.
    map_center = [points['Lat'].mean(), points['Lon'].mean()]
    m = folium.Map(location=map_center, zoom_start=12)

    # HeatMap expects a list of [latitude, longitude]; build it in one
    # vectorized step rather than row by row with iterrows().
    heat_data = points.to_numpy().tolist()
    HeatMap(heat_data).add_to(m)

    return m

In [19]:
# Heatmap of pickups on 2014-04-01 during the 12:00 hour.
# Named `pickup_heatmap` rather than `map` so the builtin `map()` is not
# shadowed for the rest of the kernel session.
pickup_heatmap = plot_geojson_filtered(full_df, '2014-04-01', 12)
pickup_heatmap