In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans

In [None]:
# Input CSV locations (all relative to the notebook's working directory).
# Waste-collection inputs:
brown_bin_data_path = 'brown_bin_data.csv'
public_bins_locations_path = 'Public_Bins_Locations_MappingFCC.csv'
# Traffic input:
traffic_flow_data_path = 'SDCC_Traffic_Flow_Data_July_to_Dec_2020.csv'

In [None]:
# Read the three source CSVs into DataFrames, in the same order as before
# (traffic flow, public bin locations, brown bin collections).
traffic_flow_data, public_bins_locations, brown_bin_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        traffic_flow_data_path,
        public_bins_locations_path,
        brown_bin_data_path,
    )
)

In [None]:
# Preprocessing for Traffic Data
# Parse the 'date' column into datetimes; errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(traffic_flow_data['date'], errors='coerce')
traffic_flow_data['date'] = parsed_dates

In [None]:
# Derive time-based features used by the congestion plots.
# 'hour' comes from parsing 'start_time' (unparseable values become NaT, so
# their hour is NaN); 'day_of_week' is the weekday name of the parsed 'date'.
start_timestamps = pd.to_datetime(traffic_flow_data['start_time'], errors='coerce')
traffic_flow_data['hour'] = start_timestamps.dt.hour
traffic_flow_data['day_of_week'] = traffic_flow_data['date'].dt.day_name()

In [None]:
# Analysing the top 10 locations by number of bins.
# Each row of public_bins_locations is counted once per 'ParkName' value, so
# value_counts() gives the number of bin records per park name.
if 'ParkName' in public_bins_locations.columns:
    top_parks = public_bins_locations['ParkName'].value_counts().head(10)
    plt.figure(figsize=(12, 6))
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue`; revisit once the project's seaborn version is pinned.
    sns.barplot(x=top_parks.values, y=top_parks.index, palette='viridis')
    plt.title("Top 10 Locations by Number of Bins")
    plt.xlabel("Number of Bins")
    plt.ylabel("Park Name")
    plt.show()
else:
    # Report the skip explicitly instead of silently producing no output.
    print("Column 'ParkName' not found in public bins data. Skipping top locations plot.")

## EDA: Traffic Congestion Trends

In [None]:
# 1. Traffic Congestion by Hour of Day
# Line plot of 'cong' against the derived 'hour' column; ci=None switches off
# the confidence band around the line.
# NOTE(review): `ci` is deprecated in newer seaborn in favour of
# `errorbar=None` - switch once the minimum seaborn version is confirmed.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=traffic_flow_data, x='hour', y='cong', ci=None, ax=ax)
ax.set_title("Traffic Congestion by Hour of Day")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Congestion Level")
plt.show()

In [None]:
# 2. Traffic Congestion by Day of Week
# The category order is given explicitly so the boxes run Monday -> Sunday
# rather than in order of first appearance in the data.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=traffic_flow_data,
    x='day_of_week',
    y='cong',
    order=weekday_order,
    ax=ax,
)
ax.set_title("Traffic Congestion by Day of Week")
ax.set_xlabel("Day of Week")
ax.set_ylabel("Congestion Level")
plt.show()

In [None]:
# 3. High Congestion Areas
# Mean 'cong' per 'site', highest first, limited to the ten worst sites.
mean_congestion_by_site = traffic_flow_data.groupby('site')['cong'].mean()
top_sites = mean_congestion_by_site.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_sites.values, y=top_sites.index, palette="viridis", ax=ax)
ax.set_title("Top 10 Sites with Highest Average Congestion")
ax.set_xlabel("Average Congestion Level")
ax.set_ylabel("Site")
plt.show()

In [None]:
# Persist descriptive statistics of the traffic data.
# describe() yields the summary table; it is kept in `traffic_flow_summary`
# and written to a CSV file in the working directory.
traffic_flow_summary = traffic_flow_data.describe()
traffic_flow_summary.to_csv(path_or_buf='traffic_flow_summary.csv')
print("Traffic flow summary saved to 'traffic_flow_summary.csv'.")

## Preprocess Public Bins Locations

In [None]:
# Convert to GeoDataFrame.
# Build the point geometries in one vectorised call instead of a Python-level
# row-by-row apply; this also works when the frame is empty.
# The 'geometry' column is still written onto public_bins_locations, as before.
# NOTE(review): the CRS below assumes X is longitude and Y is latitude in
# WGS84 degrees - confirm against the data source.
public_bins_locations['geometry'] = gpd.points_from_xy(
    public_bins_locations['X'], public_bins_locations['Y']
)
bins_gdf = gpd.GeoDataFrame(public_bins_locations, geometry='geometry', crs='EPSG:4326')

In [None]:
# Reproject the bins from EPSG:4326 to EPSG:3857 (Web Mercator) so that
# later steps work with projected x/y coordinates.
bins_gdf = bins_gdf.to_crs("EPSG:3857")

In [None]:
# Show the extent of the bin coordinates in the current CRS, as a quick
# sanity check for implausible values.
bin_xs = bins_gdf.geometry.x
bin_ys = bins_gdf.geometry.y
print("X Range:", bin_xs.min(), bin_xs.max())
print("Y Range:", bin_ys.min(), bin_ys.max())

## Filter Outliers

In [None]:
# Set realistic bounds for coordinates.
# The previous filter compared every point against the data's own min/max
# with strict inequalities, which only removed whichever points happened to
# sit on the extremes (valid or not). Instead, keep points that lie within
# IQR-based fences on each axis.
# A multiplier of 3.0 targets only far-out values; lower it to filter harder.
IQR_FENCE_MULTIPLIER = 3.0


def _iqr_fences(values, multiplier=IQR_FENCE_MULTIPLIER):
    """Return (lower, upper) bounds lying `multiplier` IQRs beyond the quartiles of `values`."""
    q1, q3 = values.quantile([0.25, 0.75])
    margin = (q3 - q1) * multiplier
    return q1 - margin, q3 + margin


x_coords = bins_gdf.geometry.x
y_coords = bins_gdf.geometry.y
x_low, x_high = _iqr_fences(x_coords)
y_low, y_high = _iqr_fences(y_coords)

# between() is inclusive and evaluates to False for NaN coordinates, so
# rows without usable coordinates are dropped as well.
bins_filtered = bins_gdf[
    x_coords.between(x_low, x_high) & y_coords.between(y_low, y_high)
].copy()

print(f"Kept {len(bins_filtered)} of {len(bins_gdf)} bins after outlier filtering.")

In [None]:
# Scatter of the filtered bin locations.
# bins_filtered is derived from bins_gdf after its reprojection to EPSG:3857,
# so the axes are projected metres, not raw longitude/latitude degrees; the
# title and labels say so.
plt.figure(figsize=(12, 8))
sns.scatterplot(x=bins_filtered.geometry.x, y=bins_filtered.geometry.y)
plt.title("Filtered Public Bin Locations (Projected Coordinates, EPSG:3857)")
plt.xlabel("Easting (m, EPSG:3857)")
plt.ylabel("Northing (m, EPSG:3857)")
plt.show()

## KMeans Clustering

In [None]:
# Collect one (x, y) tuple per filtered bin, in row order, as clustering input.
coordinates_filtered = [
    (point.x, point.y) for point in bins_filtered['geometry']
]

In [None]:
# Ensuring there are enough points for clustering.
# N_CLUSTERS drives both the minimum-sample guard and the model, so the two
# can no longer drift apart.
N_CLUSTERS = 5

if len(coordinates_filtered) >= N_CLUSTERS:
    # n_init is pinned explicitly: scikit-learn changed its default between
    # releases, so leaving it implicit makes the clusters version-dependent.
    kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
    clusters_filtered = kmeans.fit_predict(coordinates_filtered)
    bins_filtered['Cluster'] = clusters_filtered

    # Visualising the clusters.
    # Coordinates are in EPSG:3857 (bins_gdf was reprojected before
    # filtering), so the axes are labelled in projected metres.
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        x=bins_filtered.geometry.x,
        y=bins_filtered.geometry.y,
        hue=bins_filtered['Cluster'],
        palette="Set2"
    )
    plt.title("KMeans Clustering of Public Bin Locations")
    plt.xlabel("Easting (m, EPSG:3857)")
    plt.ylabel("Northing (m, EPSG:3857)")
    plt.legend(title="Cluster")
    plt.show()
else:
    print("Not enough data points for clustering. Check filtering criteria.")

In [None]:
# Correlation Matrix
# Pairwise correlations between the three collection metrics in the brown
# bin data, drawn as an annotated heatmap.
correlation_columns = [
    'Brown weight collected',
    'Lift count',
    'Average weight per bin',
]
correlation_matrix = brown_bin_data.loc[:, correlation_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap of Collection Metrics")
plt.show()

## Waste Trends Over Time

In [None]:
# Weekly Trends
# Total brown-bin weight per value of the 'WEEK' column.
# NOTE(review): grouping uses 'WEEK' alone; if the data spans several years,
# the same week number from different years is summed together - confirm
# that this is intended.
weekly_waste = brown_bin_data.groupby('WEEK')['Brown weight collected'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x=weekly_waste.index, y=weekly_waste.values, ax=ax)
ax.set_title("Waste Generation Over Time by Week")
ax.set_xlabel("Week")
ax.set_ylabel("Total Brown Weight Collected")
ax.grid(True)
plt.show()

In [None]:
# Yearly Trends
# Total brown-bin weight per value of the 'YEAR' column, drawn as a line.
yearly_waste = brown_bin_data.groupby('YEAR')['Brown weight collected'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x=yearly_waste.index, y=yearly_waste.values, ax=ax)
ax.set_title("Waste Generation Over Time by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Total Brown Weight Collected")
ax.grid(True)
plt.show()