In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# --- 1. OFFICIAL AQI CALCULATION (CPCB Standards) ---
# Logic for piecewise linear interpolation

def get_pm25_si(x):
    """Convert a PM2.5 concentration into its AQI sub-index.

    The mapping is piecewise linear over the breakpoints 30, 60, 90, 120 and
    250. Readings above 250 are extrapolated linearly from a sub-index of 400.

    Args:
        x: PM2.5 concentration.

    Returns:
        The sub-index for ``x``. NaN input yields NaN, because NaN fails every
        comparison and falls through to the final formula.
    """
    # The first band starts at the origin, so it has no offset term.
    if x <= 30:
        return x * 50 / 30

    # (upper bound, lower bound, sub-index at lower bound, sub-index span)
    bands = (
        (60, 30, 50, 50),
        (90, 60, 100, 100),
        (120, 90, 200, 100),
        (250, 120, 300, 100),
    )
    for upper, lower, base, span in bands:
        if x <= upper:
            return base + (x - lower) * span / (upper - lower)

    # Open-ended top band.
    return 400 + (x - 250) * 100 / 130

def get_pm10_si(x):
    """Convert a PM10 concentration into its AQI sub-index.

    Up to 100 the sub-index equals the concentration itself; above that the
    mapping is piecewise linear over the breakpoints 250, 350 and 430, with
    linear extrapolation from a sub-index of 400 beyond 430.

    Args:
        x: PM10 concentration.

    Returns:
        The sub-index for ``x`` (``x`` itself, unchanged, when ``x <= 100``).
        NaN input yields NaN.
    """
    # Identity region: both low bands map the concentration onto itself.
    if x <= 100:
        return x

    # (upper bound, lower bound, sub-index at lower bound, sub-index span)
    bands = (
        (250, 100, 100, 100),
        (350, 250, 200, 100),
        (430, 350, 300, 100),
    )
    for upper, lower, base, span in bands:
        if x <= upper:
            return base + (x - lower) * span / (upper - lower)

    # Open-ended top band.
    return 400 + (x - 430) * 100 / 80

def get_no2_si(x):
    """Convert an NO2 concentration into its AQI sub-index.

    The mapping is piecewise linear over the breakpoints 40, 80, 180, 280 and
    400. Readings above 400 are extrapolated linearly from a sub-index of 400.

    Args:
        x: NO2 concentration.

    Returns:
        The sub-index for ``x``. NaN input yields NaN.
    """
    # The first band starts at the origin, so it has no offset term.
    if x <= 40:
        return x * 50 / 40

    # (upper bound, lower bound, sub-index at lower bound, sub-index span)
    bands = (
        (80, 40, 50, 50),
        (180, 80, 100, 100),
        (280, 180, 200, 100),
        (400, 280, 300, 100),
    )
    for upper, lower, base, span in bands:
        if x <= upper:
            return base + (x - lower) * span / (upper - lower)

    # Open-ended top band.
    return 400 + (x - 400) * 100 / 120

def get_so2_si(x):
    """Convert an SO2 concentration into its AQI sub-index.

    The mapping is piecewise linear over the breakpoints 40, 80, 380, 800 and
    1600. Readings above 1600 are extrapolated linearly from a sub-index of 400.

    Args:
        x: SO2 concentration.

    Returns:
        The sub-index for ``x``. NaN input yields NaN.
    """
    # The first band starts at the origin, so it has no offset term.
    if x <= 40:
        return x * 50 / 40

    # (upper bound, lower bound, sub-index at lower bound, sub-index span)
    bands = (
        (80, 40, 50, 50),
        (380, 80, 100, 100),
        (800, 380, 200, 100),
        (1600, 800, 300, 100),
    )
    for upper, lower, base, span in bands:
        if x <= upper:
            return base + (x - lower) * span / (upper - lower)

    # Open-ended top band.
    return 400 + (x - 1600) * 100 / 800

def get_co_si(x):
    """Convert a CO concentration into its AQI sub-index.

    The mapping is piecewise linear over the breakpoints 1, 2, 10, 17 and 34.
    Readings above 34 are extrapolated linearly from a sub-index of 400.

    Args:
        x: CO concentration.

    Returns:
        The sub-index for ``x``. NaN input yields NaN.
    """
    # The first band starts at the origin, so it has no offset term.
    if x <= 1:
        return x * 50 / 1

    # (upper bound, lower bound, sub-index at lower bound, sub-index span)
    bands = (
        (2, 1, 50, 50),
        (10, 2, 100, 100),
        (17, 10, 200, 100),
        (34, 17, 300, 100),
    )
    for upper, lower, base, span in bands:
        if x <= upper:
            return base + (x - lower) * span / (upper - lower)

    # Open-ended top band.
    return 400 + (x - 34) * 100 / 17

def get_o3_si(x):
    """Convert an O3 concentration into its AQI sub-index.

    The mapping is piecewise linear over the breakpoints 50, 100, 168, 208 and
    748. Readings above 748 are extrapolated linearly from a sub-index of 400.

    Args:
        x: O3 concentration.

    Returns:
        The sub-index for ``x``. NaN input yields NaN.
    """
    # The first band starts at the origin, so it has no offset term.
    if x <= 50:
        return x * 50 / 50

    # (upper bound, lower bound, sub-index at lower bound, sub-index span)
    bands = (
        (100, 50, 50, 50),
        (168, 100, 100, 100),
        (208, 168, 200, 100),
        (748, 208, 300, 100),
    )
    for upper, lower, base, span in bands:
        if x <= upper:
            return base + (x - lower) * span / (upper - lower)

    # Open-ended top band.
    return 400 + (x - 748) * 100 / 540

def get_nh3_si(x):
    """Convert an NH3 concentration into its AQI sub-index.

    The mapping is piecewise linear over the breakpoints 200, 400, 800, 1200
    and 1800. Readings above 1800 are extrapolated linearly from a sub-index
    of 400.

    Args:
        x: NH3 concentration.

    Returns:
        The sub-index for ``x``. NaN input yields NaN.
    """
    # The first band starts at the origin, so it has no offset term.
    if x <= 200:
        return x * 50 / 200

    # (upper bound, lower bound, sub-index at lower bound, sub-index span)
    bands = (
        (400, 200, 50, 50),
        (800, 400, 100, 100),
        (1200, 800, 200, 100),
        (1800, 1200, 300, 100),
    )
    for upper, lower, base, span in bands:
        if x <= upper:
            return base + (x - lower) * span / (upper - lower)

    # Open-ended top band.
    return 400 + (x - 1800) * 100 / 600

# --- 2. AQI CATEGORY (BUCKET) LOGIC ---
# Standard health implication ranges

def get_aqi_bucket(x):
    """Return the category label for an AQI value.

    Args:
        x: AQI value.

    Returns:
        One of "Good", "Satisfactory", "Moderate", "Poor", "Very Poor" or
        "Severe". Each label applies up to and including its upper bound;
        anything that matches no bound (values above 400, or NaN) is "Severe".
    """
    # Inclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (50, "Good"),
        (100, "Satisfactory"),
        (200, "Moderate"),
        (300, "Poor"),
        (400, "Very Poor"),
    )
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    return "Severe"

# --- 3. TRAINING & VISUALIZATION SYSTEM ---

def run_prediction_system(csv_path, model_path=None, scaler_path=None):
    """Train a Random Forest AQI regressor from a CSV of pollutant readings.

    The target AQI is derived from the data itself: a sub-index is computed for
    PM2.5, PM10, NO2, SO2, CO, O3 and NH3, and the row's AQI is the maximum of
    those sub-indices. Rows missing any feature or the AQI are dropped, the
    rest are split 80/20, the features are standardised, and a
    RandomForestRegressor is fitted. Metrics are printed, and a learning curve
    and a feature-importance chart are shown.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the input
            data. Must contain the columns PM2.5, PM10, NO, NO2, NOx, NH3, CO,
            SO2 and O3.
        model_path: Optional path; when given, the fitted model is written
            there with ``joblib.dump``.
        scaler_path: Optional path; when given, the fitted scaler is written
            there with ``joblib.dump``.

    Returns:
        A ``(model, scaler)`` tuple: the fitted RandomForestRegressor and the
        StandardScaler that must be applied to any sample before predicting.

    Raises:
        KeyError: If a required pollutant column is missing from the CSV.
        ValueError: If no complete rows remain after dropping missing values.
    """
    # Load and prepare data
    df = pd.read_csv(csv_path)

    # Feature columns expected in the CSV (also the model's input order).
    pollutants = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3']

    # Fail early with one message naming every missing column.
    missing = [col for col in pollutants if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required pollutant columns: {missing}")

    # Sub-index column -> (source column, conversion function).
    # NO and NOx are model features only; they have no sub-index.
    sub_indices = {
        'si_pm25': ('PM2.5', get_pm25_si),
        'si_pm10': ('PM10', get_pm10_si),
        'si_no2': ('NO2', get_no2_si),
        'si_so2': ('SO2', get_so2_si),
        'si_co': ('CO', get_co_si),
        'si_o3': ('O3', get_o3_si),
        'si_nh3': ('NH3', get_nh3_si),
    }
    for si_col, (source_col, to_sub_index) in sub_indices.items():
        df[si_col] = df[source_col].apply(to_sub_index)

    # The AQI is the worst (largest) sub-index of the row.
    df['AQI'] = df[list(sub_indices)].max(axis=1)

    # Clean and split
    df_clean = df.dropna(subset=pollutants + ['AQI'])
    if df_clean.empty:
        raise ValueError(
            "No complete rows left after dropping missing pollutant values; "
            "cannot train the model."
        )
    X = df_clean[pollutants]
    y = df_clean['AQI']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features; the scaler is fitted on the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)
    rf_model.fit(X_train_scaled, y_train)

    # Persist the artefacts before plotting, so they exist even if a plot
    # window is never closed.
    if model_path is not None or scaler_path is not None:
        import joblib

        if model_path is not None:
            joblib.dump(rf_model, model_path)
        if scaler_path is not None:
            joblib.dump(scaler, scaler_path)

    # Evaluation metrics
    preds = rf_model.predict(X_test_scaled)
    print("--- Accuracy Metrics ---")
    print(f"R2 Score: {r2_score(y_test, preds):.2f}")
    print(f"Mean Absolute Error: {mean_absolute_error(y_test, preds):.2f}")
    # For a regressor the out-of-bag score is an R2 estimate, not an accuracy.
    print(f"OOB R2 Score: {rf_model.oob_score_:.2f}")

    # --- VISUALIZATIONS ---
    # 1. Learning Curve (Train vs Val Error)
    train_sizes, train_scores, test_scores = learning_curve(
        rf_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_absolute_error'
    )
    plt.figure(figsize=(10, 5))
    # Scores are negated MAE, so flip the sign to plot the error itself.
    plt.plot(train_sizes, -np.mean(train_scores, axis=1), label='Train Error')
    plt.plot(train_sizes, -np.mean(test_scores, axis=1), label='Validation Error')
    plt.title('Random Forest Learning Curve')
    plt.xlabel('Training Samples')
    plt.ylabel('MAE')
    plt.legend()
    plt.show()

    # 2. Feature Importance
    plt.figure(figsize=(10, 5))
    importances = pd.Series(rf_model.feature_importances_, index=pollutants)
    importances.sort_values().plot(kind='barh')
    plt.title('Which Pollutant Drives the AQI?')
    plt.show()

    return rf_model, scaler

# --- 4. PREDICTION FUNCTION FOR NEW SAMPLES ---

def predict_aqi(model, scaler, pm25, pm10, no, no2, nox, nh3, co, so2, o3):
    """Predict the AQI and its category for one set of pollutant readings.

    Args:
        model: Fitted regressor exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``; the one the model was
            trained with.
        pm25, pm10, no, no2, nox, nh3, co, so2, o3: Pollutant concentrations,
            passed on in exactly this order.

    Returns:
        A ``(aqi_prediction, bucket)`` tuple, where ``bucket`` is whatever
        ``get_aqi_bucket`` returns for the prediction.
    """
    readings = [[pm25, pm10, no, no2, nox, nh3, co, so2, o3]]

    # A scaler fitted on a DataFrame remembers its column names and warns when
    # later given a bare array. Reuse those names when they are available.
    # Assumes the fitted column order matches the argument order above.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is not None and len(fitted_names) == len(readings[0]):
        sample = pd.DataFrame(readings, columns=list(fitted_names))
    else:
        sample = np.array(readings)

    scaled_sample = scaler.transform(sample)
    aqi_prediction = model.predict(scaled_sample)[0]
    bucket = get_aqi_bucket(aqi_prediction)
    return aqi_prediction, bucket

# --- EXAMPLE EXECUTION ---
# trained_model, my_scaler = run_prediction_system('your_data.csv')
# aqi, health = predict_aqi(trained_model, my_scaler, 60.5, 98, 2.35, 30.8, 18.25, 8.5, 0.1, 11.85, 126.4)
# print(f"Predicted AQI: {aqi:.2f} ({health})")

In [None]:
import streamlit as st
import joblib
import numpy as np
import pandas as pd

# Load the trained model and its scaler from the working directory.
# NOTE: joblib files are pickles, so only load artefacts you produced yourself
# or otherwise trust.
try:
    model = joblib.load('aqi_rf_model.pkl')
    scaler = joblib.load('aqi_scaler.pkl')
except FileNotFoundError:
    st.error("Error: 'aqi_rf_model.pkl' or 'aqi_scaler.pkl' not found. Please run your training script first.")
    # Without the artefacts `model` and `scaler` are undefined, so halt this
    # script run instead of failing with a NameError on "Predict AQI".
    st.stop()

def get_aqi_bucket(x):
    """Return the category label and display colour for an AQI value.

    Args:
        x: AQI value.

    Returns:
        A ``(label, hex_colour)`` tuple. Each category applies up to and
        including its upper bound; anything that matches no bound (values
        above 400, or NaN) is ``("Severe", "#7e0023")``.
    """
    # (inclusive upper bound, label, colour), checked in ascending order.
    categories = (
        (50, "Good", "#00e400"),
        (100, "Satisfactory", "#ffff00"),
        (200, "Moderate", "#ff7e00"),
        (300, "Poor", "#ff0000"),
        (400, "Very Poor", "#8f3f97"),
    )
    for limit, label, colour in categories:
        if x <= limit:
            return label, colour
    return "Severe", "#7e0023"

# --- Streamlit page: collect pollutant readings and show the predicted AQI ---
st.set_page_config(page_title="AQI Prediction App", page_icon="🌍")
st.title("🌍 Real-Time Air Quality Predictor")
st.markdown("Enter the pollutant concentrations to predict the Air Quality Index (AQI).")

# Nine pollutant inputs laid out in three columns.
col1, col2, col3 = st.columns(3)

with col1:
    pm25 = st.number_input("PM2.5 (µg/m³)", min_value=0.0, step=1.0, value=60.0)
    pm10 = st.number_input("PM10 (µg/m³)", min_value=0.0, step=1.0, value=100.0)
    no = st.number_input("NO (µg/m³)", min_value=0.0, step=0.1, value=2.5)

with col2:
    no2 = st.number_input("NO2 (µg/m³)", min_value=0.0, step=0.1, value=30.0)
    nox = st.number_input("NOx (µg/m³)", min_value=0.0, step=0.1, value=18.0)
    nh3 = st.number_input("NH3 (µg/m³)", min_value=0.0, step=0.1, value=8.5)

with col3:
    co = st.number_input("CO (mg/m³)", min_value=0.0, step=0.01, value=0.1)
    so2 = st.number_input("SO2 (µg/m³)", min_value=0.0, step=0.1, value=12.0)
    o3 = st.number_input("O3 (µg/m³)", min_value=0.0, step=1.0, value=125.0)

if st.button("Predict AQI"):
    # 1. Assemble the readings as a single-row sample.
    # Assumes the loaded artefacts were trained with features in this order.
    feature_values = [[pm25, pm10, no, no2, nox, nh3, co, so2, o3]]

    # A scaler fitted on a DataFrame remembers its column names and warns when
    # later given a bare array. Reuse those names when they are available.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is not None and len(fitted_names) == len(feature_values[0]):
        input_data = pd.DataFrame(feature_values, columns=list(fitted_names))
    else:
        input_data = np.array(feature_values)

    # 2. Apply the loaded scaler
    input_scaled = scaler.transform(input_data)

    # 3. Predict using Random Forest
    prediction = model.predict(input_scaled)[0]

    # 4. Get category and color
    bucket_name, color = get_aqi_bucket(prediction)

    # 5. Display results
    st.markdown("---")
    st.subheader(f"Predicted AQI: **{prediction:.2f}**")
    st.markdown(f"**Health Category:** <span style='color:{color}; font-weight:bold;'>{bucket_name}</span>", unsafe_allow_html=True)

    if bucket_name in ("Severe", "Very Poor"):
        st.warning("⚠️ High pollution levels detected. Stay indoors and use air purifiers.")