In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [None]:

# 1. Load Data
# Read the raw dataset from a path relative to the current working directory.
df = pd.read_csv('updated_pollution_dataset.csv')

# 2. Data Cleaning
# A negative pollutant concentration is not physically meaningful, so any
# negative PM10 or SO2 value is raised to 0; values >= 0 are left unchanged.
df[['PM10', 'SO2']] = df[['PM10', 'SO2']].clip(lower=0)


In [None]:


# 3. Separate Features and Target
# Target (y): the 'Air Quality' label column (Good, Moderate, Poor, Hazardous).
# Features (X): every remaining column (Temperature, Humidity, PM2.5, PM10, NO2, SO2, CO, etc.).
y = df['Air Quality']
X = df.drop(columns=['Air Quality'])

# Encode the text labels as integers. LabelEncoder assigns codes in sorted label
# order (e.g., Good -> 0, Moderate -> 2); `le.classes_` keeps the code -> label mapping.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)


In [None]:

# 4. Split Data (80% for training, 20% for testing)
# stratify=y_encoded keeps the class proportions the same in both splits, so a
# rare class cannot end up under-represented (or missing) in the test set.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# 5. Initialize and Train the Model
# Random Forest is chosen for its high accuracy and ability to handle non-linear data.
# random_state makes the fitted forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:

# 6. Evaluate the Model
# Score the held-out test split: overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("✅ Training Complete!")
print(f"📊 Model Accuracy: {accuracy * 100:.2f}%")
print("\n📝 Detailed Classification Report:")
# Pass `labels` explicitly so the report always has one row per encoder class.
# Without it, classification_report infers the labels from y_test/y_pred and
# raises a ValueError if a class is absent, because `target_names` no longer
# matches. zero_division=0 reports 0 (instead of warning) for such a class.
class_ids = le.transform(le.classes_)
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=le.classes_,
        zero_division=0,
    )
)


In [None]:

# 7. Save the Model and Encoder
# Persist both the fitted classifier and the label encoder: the encoder is
# needed at prediction time to turn integer class codes back into text labels.
# NOTE: joblib files are Python pickle-based, so only a Python process can load
# them; a Node.js backend would have to call a Python service to use the model.
joblib.dump(model, 'aqi_classifier_model.joblib')
joblib.dump(le, 'label_encoder.joblib')

print("\n💾 Files saved: 'aqi_classifier_model.joblib' and 'label_encoder.joblib'")

In [None]:
import joblib
import pandas as pd

# Load the saved components
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifact files
# that you produced yourself or otherwise trust.
trained_model = joblib.load('aqi_classifier_model.joblib')
encoder = joblib.load('label_encoder.joblib')

# Example: New data from a user's location
new_reading = pd.DataFrame([{
    'Temperature': 30.5,
    'Humidity': 65.0,
    'PM2.5': 45.2,
    'PM10': 55.0,
    'NO2': 25.4,
    'SO2': 10.1,
    'CO': 1.2,
    'Proximity_to_Industrial_Areas': 5.0,
    'Population_Density': 450
}])

# Apply the same cleaning that was applied before training: PM10 and SO2
# readings cannot be negative, so clip them to a minimum of 0.
new_reading[['PM10', 'SO2']] = new_reading[['PM10', 'SO2']].clip(lower=0)

# Put the columns in the order the model saw during fit instead of relying on
# the dict key order above. A model fitted on a DataFrame records that order in
# `feature_names_in_`; selecting by it also raises a clear KeyError if a
# required feature is missing from the reading.
feature_order = getattr(trained_model, 'feature_names_in_', None)
if feature_order is not None:
    new_reading = new_reading[list(feature_order)]

# Make prediction
# predict returns integer class codes; the encoder maps them back to text labels.
prediction_idx = trained_model.predict(new_reading)
category = encoder.inverse_transform(prediction_idx)[0]

print(f"The Air Quality is: {category}")