In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Load data
# File paths (relative, so they resolve against the current working directory)
brown_bin_data_path = 'brown_bin_data.csv'
# NOTE(review): this path is not referenced in any of the visible cells —
# confirm it is used further down or remove it.
public_bins_locations_path = 'Public_Bins_Locations_MappingFCC.csv'

In [None]:
# Read the brown-bin CSV into a DataFrame; every later cell in the
# brown-bin section works from this frame.
brown_bin_data = pd.read_csv(filepath_or_buffer=brown_bin_data_path)

In [None]:
# Set plotting style
# Applies matplotlib's "ggplot" style sheet to the global rcParams, so it
# affects every figure created after this cell runs.
plt.style.use("ggplot")

In [None]:
# 1. Baseline model for the brown bin data
# Name of the column the baseline tries to predict.
target = "Brown weight collected"
print("=== Baseline Model for Brown Bin Data ===")

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
train_data, test_data = train_test_split(
    brown_bin_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The baseline "model" is a single constant: the mean of the target column
# over the training rows only, so no test information leaks in.
baseline_prediction = train_data.loc[:, target].mean()

In [None]:
# Prepare the evaluation inputs: the true test values and a same-length list
# that repeats the constant baseline prediction once per test row.
test_target = test_data[target]
baseline_predictions = [baseline_prediction for _ in range(len(test_target))]

In [None]:
# Score the constant baseline with both error metrics in one pass; the tuple
# order below fixes which result is bound to `mse` and which to `mae`.
mse, mae = (
    metric(test_target, baseline_predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)

In [None]:
# Report both metrics rounded to two decimals. The trailing "\n" in the second
# message leaves a blank line after the report.
print(
    f"Baseline Mean Squared Error (MSE): {mse:.2f}",
    f"Baseline Mean Absolute Error (MAE): {mae:.2f}\n",
    sep="\n",
)

In [None]:
# Improved visualization with log transformation
# log1p computes log(1 + x), which stays defined at zero.
# NOTE(review): assumes the collected weights are non-negative — confirm.
test_target_log = np.log1p(test_target)
baseline_prediction_log = np.log1p(baseline_prediction)

# Create the figure in the same cell that draws on it. Under Jupyter's inline
# backend a figure opened in one cell is rendered and closed when that cell
# finishes, so drawing from a later cell would land on a new default-size
# figure and the requested figsize would be lost.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    test_target_log,
    bins=50,
    label='Actual (Log-Transformed)',
    color='blue',
    kde=True,
    ax=ax,
)
# The baseline is a single constant, so it is drawn as one vertical line.
ax.axvline(
    x=baseline_prediction_log,
    color='orange',
    linestyle='--',
    label='Baseline Prediction (Log-Transformed)',
)
ax.set_title("Brown Bin Data: Actual vs. Baseline Predictions (Log-Transformed)")
ax.set_xlabel("Log of Brown Weight Collected")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()

In [None]:
# 2. Baseline model for public bins clustering
print("=== Baseline Model for Public Bins Clustering ===")
# Size of the simulated clustering task: number of bins to label and number
# of distinct cluster ids they can be assigned to.
n_bins, n_clusters = 100, 5

In [None]:
# Simulate the baseline and the "true" cluster labels.
# The seed and both draws live in a single cell so that re-running the cell
# always reproduces the same two arrays, instead of the second draw depending
# on whatever state the global NumPy RNG is in when its cell is executed.
np.random.seed(42)

# Baseline: a uniformly random cluster id in [0, n_clusters) for each bin.
random_clusters = np.random.randint(0, n_clusters, size=n_bins)

# Simulated ground truth: a second, independent draw from the same stream,
# taken immediately after the baseline draw.
true_clusters = np.random.randint(0, n_clusters, size=n_bins)

In [None]:
# Score the random assignment against the simulated labels: the fraction of
# bins whose random id equals the "true" id at the same position.
# NOTE(review): cluster ids produced by a real clustering are arbitrary labels,
# so a permutation-invariant score may be more appropriate — confirm.
baseline_accuracy = accuracy_score(y_true=true_clusters, y_pred=random_clusters)
print(f"Baseline Clustering Accuracy (Random Assignment): {baseline_accuracy:.2f}")

In [None]:
# Visualising clustering baseline performance
# Both distributions are drawn on one explicit axes so they can be compared.
fig, ax = plt.subplots(figsize=(10, 6))

# Cluster ids are integers, so use discrete bins: one unit-wide bar centred on
# each id. This also gives both histograms identical bins regardless of which
# ids happen to occur in each array.
sns.histplot(random_clusters, discrete=True, label='Random Clusters', color='orange', kde=False, ax=ax)
sns.histplot(true_clusters, discrete=True, label='True Clusters', color='blue', kde=False, ax=ax)

# One tick per cluster id instead of fractional positions.
ax.set_xticks(range(n_clusters))
ax.set_title("Clustering Baseline: Random vs. True Cluster Distribution")
ax.set_xlabel("Cluster")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()