# PostgreSQL database

#### This notebook will successfully connect to PostgreSQL for MSDS 420 to benchmark for the NYC Congestion Relief Zone policy analysis.

In [1]:
#install psycopg2-binary
import sys
#!{sys.executable} -m pip install psycopg2-binary

In [43]:
import pandas as pd
import matplotlib.pyplot as plt  # To create annual trend analysis plot.
import numpy as np
import psycopg2  # required to connect to the PostgreSQL database.
import time

In [52]:
!docker-compose down -v
!docker-compose -f docker-compose-postgres.yml up -d

[1A[1B[0G[?25l[+] Running 1/1
 [32m✔[0m Container finalproject  [32mRunning[0m                                         [34m0.0s [0m
[?25h

In [53]:
# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    host='localhost',        # IP address of the Postgres database server
    port=5439,               # Port number for postgres docker container
    dbname="postgres",         # Name of the database to connect to
    user="postgres",         # Enter postgres for authentication
    password="root"
)

In [56]:
# Load congestion_zone dataset
congestion_zone_csv = "/Users/kelseykwon/Final_Project/Datasets/UPDATED MTA_Congestion_Relief_Zone_Vehicle_Entries__Beginning_2025_20250823.csv"
congestion_zone_df = pd.read_csv(congestion_zone_csv)
print(f"Loading congestion_zone_df!\n")

# Create a cursor object using the established connection
# The cursor is used to execute queries and fetch data from the database
    
mycursor = conn.cursor() # Create cursor
mycursor.execute("SET TIME ZONE '+00:00'") # Change timezone

# Drop table if it exists
toll_table_name = "toll_data"
mycursor.execute(f"DROP TABLE IF EXISTS {toll_table_name}")

# Rename column names
congestion_zone_df.rename(columns={
    'Toll Date': 'toll_date',
    'Time Period': 'time_period',
    'Vehicle Class': 'vehicle_class',
    'CRZ Entries': 'crz_entries'
}, inplace=True)

    
# Create table
toll_sql_create_table = f"""
    CREATE TABLE IF NOT EXISTS "{toll_table_name}" (
        toll_date DATE,
        time_period VARCHAR(20),
        vehicle_class VARCHAR(50),
        crz_entries INT
        );
        """
    
mycursor.execute(toll_sql_create_table) # Execute table query
conn.commit() # Commit changes
print(f"Table `{toll_table_name}` created successfully!\n")

# Insert data into table
toll_insert_query = """
INSERT INTO toll_data (
    toll_date, 
    time_period,
    vehicle_class, 
    crz_entries
) VALUES (%s, %s, %s, %s)
"""

# Convert DataFrame to list of tuples
toll_data_tuples = list(congestion_zone_df[
    ['toll_date', 
     'time_period', 'vehicle_class', 
     'crz_entries']
    ].itertuples(index=False, name=None))

#time indexing
start_time = time.time()


# Batch insert
batch_size = 150000
for start in range(0, len(toll_data_tuples), batch_size):
    batch = toll_data_tuples[start:start + batch_size]
    mycursor.executemany(toll_insert_query, batch)
    conn.commit() # Commit changes
    print(f"Inserted rows {start} to {start + len(batch) - 1}")


#end inexing
end_time = time.time()
elapsed_time_1 = end_time - start_time

print(f"Data successfully ran into `{toll_table_name}`!\n")
print(f"\nTotal indexing time for `{toll_table_name}`: {elapsed_time_1} seconds")


Loading congestion_zone_df!

Table `toll_data` created successfully!

Inserted rows 0 to 149999
Inserted rows 150000 to 299999
Inserted rows 300000 to 449999
Inserted rows 450000 to 599999
Inserted rows 600000 to 749999
Inserted rows 750000 to 899999
Inserted rows 900000 to 1049999
Inserted rows 1050000 to 1199999
Inserted rows 1200000 to 1349999
Inserted rows 1350000 to 1499999
Inserted rows 1500000 to 1649999
Inserted rows 1650000 to 1799999
Inserted rows 1800000 to 1949999
Inserted rows 1950000 to 2099999
Inserted rows 2100000 to 2249999
Inserted rows 2250000 to 2322431
Data successfully ran into `toll_data`!


Total indexing time for `toll_data`: 786.4804549217224 seconds


In [57]:
# Load tunnel and bridge crossings data
tunnel_bridge_crossing_csv = "/Users/kelseykwon/Final_Project/Datasets/UPDATED MTA_Bridges_and_Tunnels_Hourly_Crossings__Beginning_2019_20250823.csv"
tunnel_bridge_crossing_df = pd.read_csv(tunnel_bridge_crossing_csv)
print(f"Loading tunnel_bridge_crossing_df!\n")

# Create a cursor object using the established connection
# The cursor is used to execute queries and fetch data from the database
    
mycursor = conn.cursor() # Create cursor
mycursor.execute("SET TIME ZONE '+00:00'") # Change timezone

# Drop table if it exists
tunnel_table_name = "tunnel_bridge_data"
mycursor.execute(f"DROP TABLE IF EXISTS {tunnel_table_name}")

# Rename column names
tunnel_bridge_crossing_df.rename(columns={
    'Date': 'date',
    'Direction': 'direction',
    'Traffic Count': 'traffic_count',
}, inplace=True)

    
# Create table
tunnel_sql_create_table = f"""
    CREATE TABLE IF NOT EXISTS "{tunnel_table_name}" (
        date DATE,
        direction VARCHAR(100),
        traffic_count INT
        );
        """
    
mycursor.execute(tunnel_sql_create_table) # Execute table query
conn.commit() # Commit changes
print(f"Table `{tunnel_table_name}` created successfully!\n")

# Insert data into table
tunnel_insert_query = """
INSERT INTO tunnel_bridge_data (
    date, 
    direction,
    traffic_count
) VALUES (%s, %s, %s)
"""

# Convert DataFrame to list of tuples
tunnel_data_tuples = list(tunnel_bridge_crossing_df[
    ['date', 
     'direction', 'traffic_count']
    ].itertuples(index=False, name=None))

#time indexing
start_time = time.time()

# Batch insert
batch_size = 150000
for start in range(0, len(tunnel_data_tuples), batch_size):
    batch = tunnel_data_tuples[start:start + batch_size]
    mycursor.executemany(tunnel_insert_query, batch)
    conn.commit() # Commit changes
    print(f"Inserted rows {start} to {start + len(batch) - 1}")
    
#end inexing
end_time = time.time()
elapsed_time_1 = end_time - start_time

print(f"Data successfully ran into `{tunnel_table_name}`!\n")
print(f"\nTotal indexing time for `{toll_table_name}`: {elapsed_time_1} seconds")

Loading tunnel_bridge_crossing_df!

Table `tunnel_bridge_data` created successfully!

Inserted rows 0 to 149999
Inserted rows 150000 to 299999
Inserted rows 300000 to 449999
Inserted rows 450000 to 599999
Inserted rows 600000 to 749999
Inserted rows 750000 to 899999
Inserted rows 900000 to 1049999
Inserted rows 1050000 to 1199999
Inserted rows 1200000 to 1349999
Inserted rows 1350000 to 1499999
Inserted rows 1500000 to 1649999
Inserted rows 1650000 to 1799999
Inserted rows 1800000 to 1949999
Inserted rows 1950000 to 2099999
Inserted rows 2100000 to 2249999
Inserted rows 2250000 to 2399999
Inserted rows 2400000 to 2549999
Inserted rows 2550000 to 2699999
Inserted rows 2700000 to 2849999
Inserted rows 2850000 to 2999999
Inserted rows 3000000 to 3149999
Inserted rows 3150000 to 3299999
Inserted rows 3300000 to 3449999
Inserted rows 3450000 to 3599999
Inserted rows 3600000 to 3749999
Inserted rows 3750000 to 3899999
Inserted rows 3900000 to 4049999
Inserted rows 4050000 to 4199999
Inserted

In [55]:
# Load daily ridership data
daily_ridership_csv = "/Users/kelseykwon/Final_Project/Datasets/UPDATED MTA_Daily_Ridership_and_Traffic__Beginning_2020_20250823_final.csv"
daily_ridership_df = pd.read_csv(daily_ridership_csv)
print(f"Loading daily_ridership_df!\n")

# Create a cursor object using the established connection
# The cursor is used to execute queries and fetch data from the database
    
mycursor = conn.cursor() # Create cursor
mycursor.execute("SET TIME ZONE '+00:00'") # Change timezone

# Drop table if it exists
ridership_table_name = "daily_ridership_data"
mycursor.execute(f"DROP TABLE IF EXISTS {ridership_table_name}")

# Rename column names
daily_ridership_df.rename(columns={
    'Date': 'date',
    'Mode': 'mode',
    'Count': 'count',
}, inplace=True)

    
# Create table
ridership_sql_create_table = f"""
    CREATE TABLE IF NOT EXISTS "{ridership_table_name}" (
        date DATE,
        mode VARCHAR(20),
        count FLOAT
        );
        """
    
mycursor.execute(ridership_sql_create_table) # Execute table query
conn.commit() # Commit changes
print(f"Table `{ridership_table_name}` created successfully!\n")

# Insert data into table
ridership_insert_query = """
INSERT INTO daily_ridership_data (
    date, 
    mode,
    count
) VALUES (%s, %s, %s)
"""

# Convert DataFrame to list of tuples
ridership_data_tuples = list(daily_ridership_df[
    ['date', 
     'mode', 'count']
    ].itertuples(index=False, name=None))


#time indexing
start_time = time.time()


# Batch insert
batch_size = 150000
for start in range(0, len(ridership_data_tuples), batch_size):
    batch = ridership_data_tuples[start:start + batch_size]
    mycursor.executemany(ridership_insert_query, batch)
    conn.commit() # Commit changes
    print(f"Inserted rows {start} to {start + len(batch) - 1}")

    
#end inexing
end_time = time.time()
elapsed_time_3 = end_time - start_time

print(f"Data successfully ran into `{ridership_table_name}`!\n")
print(f"\nTotal indexing time for `{ridership_table_name}`: {elapsed_time_3} seconds")

Loading daily_ridership_df!

Table `daily_ridership_data` created successfully!

Inserted rows 0 to 14118
Data successfully ran into `daily_ridership_data`!


Total indexing time for `daily_ridership_data`: 4.857693910598755 seconds


In [None]:
import sys
sys.path.append('/Users/kelseykwon/Final_Project')
from datasets_load_data2 import load_all_data
congestion_zone_df, tunnel_bridge_crossing_df, daily_ridership_df = load_all_data()

### DATASETS

In [13]:
print("First few rows of df1:")
print(congestion_zone_df.head())

First few rows of df1:
    toll_date               Toll Hour    Toll 10 Minute Block  Minute of Hour  \
0  08/16/2025  08/16/2025 11:00:00 PM  08/16/2025 11:50:00 PM              50   
1  08/16/2025  08/16/2025 11:00:00 PM  08/16/2025 11:50:00 PM              50   
2  08/16/2025  08/16/2025 11:00:00 PM  08/16/2025 11:50:00 PM              50   
3  08/16/2025  08/16/2025 11:00:00 PM  08/16/2025 11:50:00 PM              50   
4  08/16/2025  08/16/2025 11:00:00 PM  08/16/2025 11:50:00 PM              50   

   Hour of Day  Day of Week Int Day of Week   Toll Week time_period  \
0           23                7    Saturday  08/10/2025   Overnight   
1           23                7    Saturday  08/10/2025   Overnight   
2           23                7    Saturday  08/10/2025   Overnight   
3           23                7    Saturday  08/10/2025   Overnight   
4           23                7    Saturday  08/10/2025   Overnight   

                vehicle_class               Detection Group  \


In [14]:
print("\nFirst few rows of df2:")
print(tunnel_bridge_crossing_df.head())


First few rows of df2:
        Transit Timestamp        date  Hour  Facility ID  \
0  07/29/2025 11:00:00 PM  07/29/2025    23           30   
1  07/29/2025 11:00:00 PM  07/29/2025    23           21   
2  07/29/2025 11:00:00 PM  07/29/2025    23           21   
3  07/29/2025 11:00:00 PM  07/29/2025    23           21   
4  07/29/2025 11:00:00 PM  07/29/2025    23           30   

                         Facility                          direction  \
0     Verrazzano - Narrows Bridge         Westbound to Staten Island   
1  Robert F. Kennedy Bridge Bronx  Southbound to Manhattan or Queens   
2  Robert F. Kennedy Bridge Bronx  Southbound to Manhattan or Queens   
3  Robert F. Kennedy Bridge Bronx   Northbound to Manhattan or Bronx   
4     Verrazzano - Narrows Bridge         Westbound to Staten Island   

  Payment Method  Vehicle Class Vehicle Class Description  \
0        E-ZPass             34              4-axle truck   
1        E-ZPass             34              4-axle truck   

In [15]:
print("\nFirst few rows of df2:")
print(daily_ridership_df.head())


First few rows of df2:
         date    mode      count
0  2025-07-31     Bus  1310912.0
1  2025-07-31  Subway  3718979.0
2  2025-07-31     SIR     7004.0
3  2025-07-31     AAR    42155.0
4  2025-07-31     MNR   229384.0


On January 5, 2025, the CBD Tolling Program took effect, charging a toll for vehicles entering Manhattan’s Congestion Relief Zone (CRZ), which consists of streets and avenues at or below 60 St but does not include the Franklin D. Roosevelt (FDR) Drive and the West Side Highway/Route 9A, including the Battery Park Underpass and any surface roadway portion of the Hugh L. Carey Tunnel that connects to West Street (the West Side Highway/Route 9A). Roads not included in the tolling program are referred to as “excluded roadways”. The program is intended to relieve congestion in the most congested district in the United States.

Our analysis aims to provide an early answer to the impact of the congestion pricing across 4 overarching areas:
- Is NYC removing cars from Lower Manhattan (i.e., are crossing decreasing)?
- Is NYC adding subway riders?
- Is NYC adding revenue (i.e., how many people are crossing and paying the toll)?
- Are fewer people entering Manhattan overall?

### Question 1: Is NYC removing cars from Lower Manhattan?

To determine if NYC is removing cars from Lower Manhattan since implementing the Congestion Zone on January 5th, we'll examine the 'Tunnel and Bridge Crossings MTA: Beginning 2019' dataset. The dataset includes 11 columns:
    - 'Transit Timestamp'
    - 'Date'
    - 'Hour'
    - 'Facility ID'
    - 'Facility'
    - 'Direction'
    - 'Payment Method'
    - 'Vehicle Class'
    - 'Vehicle Class Description'
    - 'Vehicle Class Category'
    - 'Traffic Count'
    
The 'Date' column will allow the comparison of daily traffic count (i.e., 'Count') from before and after the Congestion Zone toll. We'll examine the data going back to the beginning of 2019 to compare the past 6 years of data and determine whether there was a significant increase in crossings from similar time periods in previous years. We will also filter the data to only looking at the crossings 'Direction' to Manhattan. 

In [50]:
# View dataset
tunnel_bridge_crossing_df.info

<bound method DataFrame.info of                Transit Timestamp        Date  Hour  Facility ID  \
0         07/29/2025 11:00:00 PM  07/29/2025    23           30   
1         07/29/2025 11:00:00 PM  07/29/2025    23           21   
2         07/29/2025 11:00:00 PM  07/29/2025    23           21   
3         07/29/2025 11:00:00 PM  07/29/2025    23           21   
4         07/29/2025 11:00:00 PM  07/29/2025    23           30   
...                          ...         ...   ...          ...   
11608859  01/01/2019 12:00:00 AM  01/01/2019     0           28   
11608860  01/01/2019 12:00:00 AM  01/01/2019     0           26   
11608861  01/01/2019 12:00:00 AM  01/01/2019     0           29   
11608862  01/01/2019 12:00:00 AM  01/01/2019     0           21   
11608863  01/01/2019 12:00:00 AM  01/01/2019     0           29   

                                Facility                          Direction  \
0            Verrazzano - Narrows Bridge         Westbound to Staten Island   
1    

In [17]:
tunnel_bridge_crossing_df.tail()

Unnamed: 0,Transit Timestamp,date,Hour,Facility ID,Facility,direction,Payment Method,Vehicle Class,Vehicle Class Description,Vehicle Class Category,traffic_count
11608859,01/01/2019 12:00:00 AM,01/01/2019,0,28,Hugh L. Carey Tunnel,Northbound to Manhattan,E-ZPass,1,2-axle passenger car,Car,324
11608860,01/01/2019 12:00:00 AM,01/01/2019,0,26,Cross Bay Bridge,Southbound,Tolls by Mail,4,2-axle truck,Truck,3
11608861,01/01/2019 12:00:00 AM,01/01/2019,0,29,Throgs Neck Bridge,Northbound to Bronx,Tolls by Mail,4,2-axle truck,Truck,5
11608862,01/01/2019 12:00:00 AM,01/01/2019,0,21,Robert F. Kennedy Bridge Bronx,Northbound to Manhattan or Bronx,E-ZPass,4,2-axle truck,Truck,8
11608863,01/01/2019 12:00:00 AM,01/01/2019,0,29,Throgs Neck Bridge,Southbound to Queens,E-ZPass,2,3-axle passenger car,Car,3


# Define queries

In [59]:
# Define queries
revenue_case = """
CASE 
    WHEN vehicle_class IN ('1 - Cars, Pickups and Vans') AND time_period = 'Peak' THEN 9.00
    WHEN vehicle_class IN ('1 - Cars, Pickups and Vans') AND time_period = 'Overnight' THEN 2.25
    WHEN vehicle_class IN ('5 - Motorcycles') AND time_period = 'Peak' THEN 4.50
    WHEN vehicle_class IN ('5 - Motorcycles') AND time_period = 'Overnight' THEN 1.05
    WHEN vehicle_class IN ('2 - Single-Unit Trucks', '4 - Buses') AND time_period = 'Peak' THEN 14.40
    WHEN vehicle_class IN ('2 - Single-Unit Trucks', '4 - Buses') AND time_period = 'Overnight' THEN 3.60
    WHEN vehicle_class = '3 - Multi-Unit Trucks' AND time_period = 'Peak' THEN 21.60
    WHEN vehicle_class = '3 - Multi-Unit Trucks' AND time_period = 'Overnight' THEN 5.40
    WHEN vehicle_class = 'TLC Taxi/FHV' THEN 1.125
    ELSE 0
END
"""

queries = {
    "Pre-toll Average Daily Traffic (5/1-12/31)": """
        SELECT EXTRACT(YEAR FROM date) AS year, 
               AVG(traffic_count) AS average_daily_traffic
        FROM tunnel_bridge_data
        WHERE TO_CHAR(date, 'MM-DD') BETWEEN '05-01' AND '12-31'
            AND direction ILIKE '%to Manhattan%'
        GROUP BY year
        ORDER BY year
    """,
     "Post-toll Average Daily Traffic (1/5-7/31)": """
         SELECT EXTRACT(YEAR FROM date) AS year,
               AVG(traffic_count) AS average_daily_traffic
        FROM tunnel_bridge_data
        WHERE TO_CHAR(date, 'MM-DD') BETWEEN '01-05' AND '07-31'
            AND direction ILIKE '%to Manhattan%'
        GROUP BY EXTRACT(YEAR FROM date)
        ORDER BY year
    """,
    "Average Ridership By Mode (Jan-July)": """
        SELECT EXTRACT(MONTH FROM date) AS month,
               mode,
               AVG(count) AS average_monthly_ridership
        FROM daily_ridership_data
        WHERE EXTRACT(YEAR FROM date) = 2025
        GROUP BY EXTRACT(MONTH FROM date), mode
        ORDER BY month;
    """,
    "Average Subway Yearly Ridership (Jan–Jul)": """
        SELECT EXTRACT(YEAR FROM date) AS year,
               AVG(count) AS average_subway_ridership
        FROM daily_ridership_data
        WHERE mode = 'Subway'
            AND EXTRACT(MONTH FROM date) <= 7
        GROUP BY EXTRACT(YEAR FROM date)
        ORDER BY year;
    """,
    "Average Subway Post-Toll Monthly Ridership": """
        SELECT TO_CHAR(date, 'YYYY-MM') AS month, 
               AVG(count) as average_subway_ridership
        FROM daily_ridership_data
        WHERE mode = 'Subway' 
            AND EXTRACT(YEAR FROM date) = 2025
        GROUP BY TO_CHAR(date, 'YYYY-MM')
        ORDER BY month;
    """,
    "Monthly Estimated Revenue": f"""
        SELECT TO_CHAR(toll_date, 'YYYY-MM') AS month, 
               SUM(crz_entries * ({revenue_case})) AS est_revenue
        FROM toll_data
        GROUP BY TO_CHAR(toll_date, 'YYYY-MM')
        ORDER BY month;
    """,
    "Estimated Revenue By Vehicle Class": f"""
        SELECT vehicle_class, 
               SUM(crz_entries * ({revenue_case})) AS est_revenue
        FROM toll_data
        GROUP BY vehicle_class
        ORDER BY vehicle_class, est_revenue DESC;
    """
}

# Run benchmarks
results = []

#run ingestion benchmarks
results = []
for name, query in ingestion_queries.items():
    start_time = time.time()
    exec(query)
    elapsed = time.time() - start_time
    results.append((name, round(elapsed, 4)))
    
dfs = {}
for name, query in queries.items():
    start_time = time.time()
    df = pd.read_sql_query(query, conn)
    elapsed = time.time() - start_time
    results.append((name, round(elapsed, 4)))
    dfs[name] = df

# Print benchmark results
print("PostgreSQL Benchmark Results (seconds):")
for name, elapsed in results:
    print(f"{name:45} {elapsed:.4f}")

# Access dataframes
daily_traffic_pre_tax_df = dfs["Pre-toll Average Daily Traffic (5/1-12/31)"]
daily_traffic_post_tax_df = dfs["Post-toll Average Daily Traffic (1/5-7/31)"]
average_mode_ridership_df = dfs["Average Ridership By Mode (Jan-July)"]
subway_yearly_ridership_df = dfs["Average Subway Yearly Ridership (Jan–Jul)"]
subway_monthly_ridership_df = dfs["Average Subway Post-Toll Monthly Ridership"]
monthly_revenue_df = dfs["Monthly Estimated Revenue"]
revenue_vehicle_class_df = dfs["Estimated Revenue By Vehicle Class"]

# Print
print(f'\n {daily_traffic_pre_tax_df} \n')
print(f'{daily_traffic_post_tax_df} \n')
print(f'{average_mode_ridership_df} \n')
print(f'{subway_yearly_ridership_df} \n')
print(f'{subway_monthly_ridership_df} \n')
print(f'{monthly_revenue_df} \n')
print(f'{revenue_vehicle_class_df}')

Loading tunnel_bridge_crossing_df!

Loading congestion_zone_df!

Loading daily_ridership_df!



  df = pd.read_sql_query(query, conn)


PostgreSQL Benchmark Results (seconds):
Load tunnel_bridge_crossing_df                14.4686
Load congestion_zone_df                       2.9622
Load daily_ridership_df                       0.0117
Pre-toll Average Daily Traffic (5/1-12/31)    8.1252
Post-toll Average Daily Traffic (1/5-7/31)    5.9052
Average Ridership By Mode (Jan-July)          0.0226
Average Subway Yearly Ridership (Jan–Jul)     0.0074
Average Subway Post-Toll Monthly Ridership    0.0039
Monthly Estimated Revenue                     1.3785
Estimated Revenue By Vehicle Class            0.8284

      year  average_daily_traffic
0  2019.0             199.498250
1  2020.0             153.563357
2  2021.0             199.858590
3  2022.0             207.192218
4  2023.0             211.650732
5  2024.0             213.220083
6  2025.0             216.163348 

     year  average_daily_traffic
0  2019.0             202.828601
1  2020.0             143.221976
2  2021.0             183.875041
3  2022.0             204.172