# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import numpy as np
# Select the TkAgg backend for the figures shown by the plt.show() calls below.
# NOTE(review): this call runs after `matplotlib.pyplot` has already been
# imported above; confirm the installed matplotlib honours a backend switch
# at this point (selecting the backend before importing pyplot is the safer order).
matplotlib.use('TkAgg') 

# Read the taxi trip export. VendorID is forced to string, and low_memory is
# switched off so column dtypes are inferred from the whole file rather than
# chunk by chunk (avoids mixed-dtype columns).
csv_path = '/Users/andre-chancegoddard/Downloads/TaxiDatset(1).csv'
df = pd.read_csv(csv_path, dtype={'VendorID': str}, low_memory=False)

# Parse both timestamp columns with one explicit 12-hour (AM/PM) format so
# pandas never has to guess the layout.
timestamp_format = '%m/%d/%Y %I:%M:%S %p'
for timestamp_column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], format=timestamp_format)

# Derived features used by the plots below: trip length in minutes, plus the
# pickup's hour (0-23) and weekday name.
pickup_time = df['tpep_pickup_datetime']
elapsed = df['tpep_dropoff_datetime'] - pickup_time
df['trip_duration'] = elapsed.dt.total_seconds() / 60
df['hour_of_day'] = pickup_time.dt.hour
df['day_of_week'] = pickup_time.dt.day_name()

# Discard rows whose duration is zero or negative; they are treated as
# erroneous records.
has_positive_duration = df['trip_duration'] > 0
df = df[has_positive_duration]

# Line chart of trip duration against pickup hour. The rows sharing an hour
# are aggregated by seaborn's lineplot defaults.
plt.figure(figsize=(12, 6))
hourly_ax = sns.lineplot(data=df, x='hour_of_day', y='trip_duration')
hourly_ax.set(
    title='Average Trip Duration by Hour of Day',
    xlabel='Hour of Day',
    ylabel='Average Trip Duration (minutes)',
)
hourly_ax.grid(True)
plt.show()

# Bar chart of mean trip duration per weekday, with the bars laid out
# Monday through Sunday instead of in order of appearance in the data.
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
]
plt.figure(figsize=(12, 6))
weekday_ax = sns.barplot(
    data=df,
    x='day_of_week',
    y='trip_duration',
    estimator=np.mean,
    order=weekday_order,
)
weekday_ax.set(
    title='Average Trip Duration by Day of Week',
    xlabel='Day of Week',
    ylabel='Average Trip Duration (minutes)',
)
weekday_ax.grid(True)
plt.show()

# Box plot of the congestion surcharge distribution for each pickup hour.
# The column is optional in the dataset, so report its absence instead of plotting.
if 'congestion_surcharge' not in df.columns:
    print("Congestion surcharge data not available.")
else:
    plt.figure(figsize=(12, 6))
    surcharge_ax = sns.boxplot(data=df, x='hour_of_day', y='congestion_surcharge')
    surcharge_ax.set(
        title='Distribution of Congestion Surcharge by Hour of Day',
        xlabel='Hour of Day',
        ylabel='Congestion Surcharge ($)',
    )
    surcharge_ax.grid(True)
    plt.show()

# Daily congestion heatmap: mean congestion surcharge for every
# (day of week, hour of day) combination.
#
# The surcharge column is optional in this dataset (the box plot above is
# guarded the same way), so only build the pivot when the column exists;
# pivoting on a missing column would raise a KeyError.
if 'congestion_surcharge' in df.columns:
    # Prepare data for the daily congestion heatmap.
    # NOTE: fill_value=0 renders day/hour cells with no trips as 0.0, whereas
    # a weekday that is missing entirely becomes a NaN row after the reindex.
    heatmap_data = df.pivot_table(
        values='congestion_surcharge',
        index='day_of_week',
        columns='hour_of_day',
        aggfunc='mean',
        fill_value=0,
    )
    order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    heatmap_data = heatmap_data.reindex(order)  # Order days for a standard week

    # Plotting the daily congestion heatmap
    plt.figure(figsize=(16, 8))
    sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap='coolwarm', linewidths=.5)
    plt.title('Average Congestion Surcharge by Day of Week and Hour of Day')
    plt.xlabel('Hour of Day')
    plt.ylabel('Day of Week')
    plt.show()
else:
    print("Congestion surcharge data not available; skipping heatmap.")


print("Script finished.")