
# Task 3 — Unsupervised Learning (Pollution Clustering)

This notebook performs clustering using **Index Value** to group days into:

- Low Pollution
- Medium Pollution
- High Pollution

It includes:

- KMeans clustering  
- Cluster interpretation  
- Scatter plot of clusters  
- Cluster distribution bar chart  


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the AQI bulletin data and parse the date column so the rows can be
# ordered chronologically.
df = pd.read_csv('/mnt/data/Noida_AQIBulletins.csv')
df['date'] = pd.to_datetime(df['date'])

# KMeans rejects NaN input, so days without a recorded Index Value are
# removed before fitting instead of letting fit_predict raise.
df = df.dropna(subset=['Index Value']).sort_values('date')

# Single-feature matrix (n_samples, 1) built from Index Value.
X = df[['Index Value']].values

# KMeans with 3 clusters. n_init is pinned explicitly: its default differs
# between scikit-learn releases, which would otherwise make the cluster
# assignment depend on the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Relabel the raw KMeans ids so that labels follow mean Index Value:
# the cluster with the lowest mean becomes 0 and the highest becomes 2.
cluster_means = df.groupby('cluster')['Index Value'].mean()
cluster_means = cluster_means.sort_values()
mapping = dict(zip(cluster_means.index, range(len(cluster_means))))
df['pollution_level'] = df['cluster'].map(mapping)

# Print the mean and the observed min/max Index Value of every pollution
# level, in ascending level order.
print("Cluster Interpretation:")
levels = sorted(df['pollution_level'].unique())
for level in levels:
    values = df.loc[df['pollution_level'] == level, 'Index Value']
    mean_value = values.mean()
    lowest = values.min()
    highest = values.max()
    print(f"Cluster {level} -> Mean: {mean_value:.2f}, Range: [{lowest}, {highest}]")

# Scatter of Index Value over time; each point is coloured by its
# pollution level using the viridis colormap.
fig, ax = plt.subplots(figsize=(12, 5))
ax.scatter(
    df['date'],
    df['Index Value'],
    c=df['pollution_level'],
    cmap='viridis',
)
ax.set_xlabel('Date')
ax.set_ylabel('Index Value')
ax.set_title('Pollution Clusters Based on Index Value')
# Rotate the date tick labels on the x-axis.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Bar chart of how many days fall into each pollution level, with the bars
# ordered by level (0, 1, 2).
fig, ax = plt.subplots(figsize=(6, 4))
level_counts = df['pollution_level'].value_counts().sort_index()
level_counts.plot.bar(ax=ax)
# Labels are set after plotting so they replace any label pandas applied.
ax.set_xlabel('Cluster (0=Low, 1=Medium, 2=High)')
ax.set_ylabel('Number of Days')
ax.set_title('Distribution of Pollution Clusters')
fig.tight_layout()
plt.show()
