In [None]:
import numpy as np
import pandas as pd
import os, math, warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
data = pd.read_csv('water_quality_data.csv')
data.head

<bound method NDFrame.head of                     date  temperature_C  turbidity_NTU        pH   DO_mg_L  \
0    2010-01-01 00:00:00      22.993185       5.516143  7.520389  7.906427   
1    2010-02-01 00:00:00      24.104013       7.048402  7.394924  7.910389   
2    2010-03-01 00:00:00      25.775834       7.640319  7.321058  7.860950   
3    2010-04-01 00:00:00      26.215549       6.792833  7.186963  7.355597   
4    2010-05-01 00:00:00      22.320496       2.673462  7.065185  8.019599   
..                   ...            ...            ...       ...       ...   
895  2084-08-01 00:00:00      16.508522       7.440319  7.114732  9.831446   
896  2084-09-01 00:00:00      14.800850       8.047401  7.108811  9.410436   
897  2084-10-01 00:00:00      15.788144       4.782564  6.986962  9.955817   
898  2084-11-01 00:00:00      18.846331       4.806334  7.240330  8.922875   
899  2084-12-01 00:00:00      18.901694       3.330760  7.398031  8.683586   

     BOD_mg_L   COD_mg_L  nitrate

In [None]:
# =============== Synthetic Dataset ===================
def seasonal(n, period=12, amplitude=1.0, phase=0.0):
    t = np.arange(n)
    return amplitude * np.sin(2*np.pi*(t/period) + phase)

def trend(n, slope=0.0, start=0.0):
    return start + slope*np.arange(n)

def generate_data(n_months=900):
    np.random.seed(42)
    dates = pd.date_range("2010-01-01", periods=n_months, freq="MS")

    temp = 20 + seasonal(n_months, 12, 5, 0.5) + np.random.normal(0, 1.2, n_months)
    turbidity = np.clip(5 + 3*seasonal(n_months, 6) + np.random.normal(0, 1.4, n_months) + 0.02*trend(n_months, 0.05), 0, None)
    ph = np.clip(7.2 + 0.2*seasonal(n_months, 12, 1.0, 1.0) + np.random.normal(0, 0.08, n_months), 6.2, 8.8)
    do = np.clip(8.5 - 0.25*(temp-20) + np.random.normal(0, 0.4, n_months), 2.5, 14.0)
    bod = np.clip(2.5 + 0.5*seasonal(n_months, 12, 1.0, 2.0) + np.random.normal(0, 0.4, n_months) + 0.02*turbidity, 0.5, None)
    cod = np.clip(10 + 2.2*seasonal(n_months, 12, 1.0, 2.5) + np.random.normal(0, 1.6, n_months) + 1.5*bod, 2, None)
    nitrate = np.clip(2.0 + 0.8*seasonal(n_months, 12, 1.0, 0.2) + np.random.normal(0, 0.3, n_months) + 0.03*turbidity, 0, None)
    phosphate = np.clip(0.4 + 0.15*seasonal(n_months, 12, 1.0, 0.8) + np.random.normal(0, 0.05, n_months) + 0.01*turbidity, 0, None)
    tds = np.clip(150 + 40*seasonal(n_months, 12, 1.0, 1.4) + np.random.normal(0, 20, n_months) + 3.0*temp, 50, None)
    conductivity = np.clip(300 + 1.5*tds + np.random.normal(0, 30, n_months), 100, None)
    fecal_coliform = np.clip(30 + 8*seasonal(n_months, 12, 1.0, 2.1) + np.random.normal(0, 6, n_months) + 1.5*turbidity, 0, None)

    df = pd.DataFrame({
        "date": dates,
        "temperature_C": temp,
        "turbidity_NTU": turbidity,
        "pH": ph,
        "DO_mg_L": do,
        "BOD_mg_L": bod,
        "COD_mg_L": cod,
        "nitrate_mg_L": nitrate,
        "phosphate_mg_L": phosphate,
        "TDS_mg_L": tds,
        "conductivity_uS_cm": conductivity,
        "fecal_coliform_CFU_100mL": fecal_coliform
    })

    wqi = (
        12 +
        8*np.clip(1 - np.abs(df["pH"]-7.0)/1.5, 0, 1) +
        15*np.clip(df["DO_mg_L"]/12, 0, 1) -
        8*np.tanh(df["turbidity_NTU"]/15) -
        10*np.tanh(df["BOD_mg_L"]/6) -
        10*np.tanh(df["COD_mg_L"]/40) -
        6*np.tanh(df["nitrate_mg_L"]/6) -
        6*np.tanh(df["phosphate_mg_L"]/0.8) -
        8*np.tanh(df["fecal_coliform_CFU_100mL"]/150) -
        5*np.tanh(df["TDS_mg_L"]/600)
    )
    wqi = np.clip(wqi + np.random.normal(0, 2.0, n_months), 0, 100)
    df["WQI"] = wqi
    return df

In [None]:
data.columns

Index(['date', 'temperature_C', 'turbidity_NTU', 'pH', 'DO_mg_L', 'BOD_mg_L',
       'COD_mg_L', 'nitrate_mg_L', 'phosphate_mg_L', 'TDS_mg_L',
       'conductivity_uS_cm', 'fecal_coliform_CFU_100mL', 'WQI'],
      dtype='object')

In [None]:
df = generate_data()
os.makedirs("artifacts", exist_ok=True)
df.to_csv("artifacts/water_quality_synthetic.csv", index=False)
df.to_excel("artifacts/water_quality_synthetic.xlsx", index=False)

# =============== Train/Test Split (Modified to use only 3 features) ===================
# MODIFIED: Use only TDS_mg_L, pH, turbidity_NTU as features
features = ["TDS_mg_L", "pH", "turbidity_NTU"]
X = df[features].values
y = df["WQI"].values

In [None]:
print(f"Using features: {features}")
print(f"Dataset shape: {X.shape}")
print(f"Feature statistics:")
for i, feature in enumerate(features):
    print(f"  {feature}: min={X[:, i].min():.2f}, max={X[:, i].max():.2f}, mean={X[:, i].mean():.2f}")


Using features: ['TDS_mg_L', 'pH', 'turbidity_NTU']
Dataset shape: (900, 3)
Feature statistics:
  TDS_mg_L: min=104.66, max=315.78, mean=210.44
  pH: min=6.78, max=7.65, mean=7.20
  turbidity_NTU: min=0.00, max=12.78, mean=5.56


In [None]:
split_idx = int(0.8*len(df))
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


Train set: 720 samples
Test set: 180 samples


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

# =============== Random Forest ===================
print("\n" + "="*50)
print("TRAINING MODELS WITH 3 FEATURES")
print("="*50)

rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

def evaluate(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{name}: MAE={mae:.3f}, RMSE={rmse:.3f}, R²={r2:.3f}")

evaluate("Random Forest", y_test, rf_pred)

# Feature importance for Random Forest
feature_importance = rf.feature_importances_
print(f"\nRandom Forest Feature Importance:")
for i, feature in enumerate(features):
    print(f"  {feature}: {feature_importance[i]:.3f}")



TRAINING MODELS WITH 3 FEATURES
Random Forest: MAE=2.004, RMSE=2.483, R²=0.673

Random Forest Feature Importance:
  TDS_mg_L: 0.331
  pH: 0.404
  turbidity_NTU: 0.264


In [None]:
# =============== Model Comparison and Visualization ===================
print("\n" + "="*50)
print("MODEL COMPARISON SUMMARY")
print("="*50)

# Create a simple prediction function for new data
def predict_wqi(tds, ph, turbidity, model_type='rf'):
    """
    Predict WQI for new data points
    
    Parameters:
    - tds: Total Dissolved Solids (mg/L)
    - ph: pH level
    - turbidity: Turbidity (NTU)
    - model_type: 'rf' for Random Forest, 'xgb' for XGBoost
    """
    new_data = np.array([[tds, ph, turbidity]])
    return rf.predict(new_data)[0]
    

# Example predictions
print("\nExample WQI Predictions:")
print("-" * 30)

test_cases = [
    {"name": "Good Quality", "tds": 150, "ph": 7.0, "turbidity": 2.0},
    {"name": "Moderate Quality", "tds": 300, "ph": 6.5, "turbidity": 5.0},
    {"name": "Poor Quality", "tds": 500, "ph": 6.0, "turbidity": 10.0},
    {"name": "Very Poor Quality", "tds": 800, "ph": 5.5, "turbidity": 15.0}
]

for case in test_cases:
    rf_pred = predict_wqi(case["tds"], case["ph"], case["turbidity"], 'rf')
    print(f"{case['name']:15} (TDS={case['tds']}, pH={case['ph']}, Turb={case['turbidity']}): WQI = {rf_pred:.1f}")


MODEL COMPARISON SUMMARY

Example WQI Predictions:
------------------------------
Good Quality    (TDS=150, pH=7.0, Turb=2.0): WQI = 16.5
Moderate Quality (TDS=300, pH=6.5, Turb=5.0): WQI = 12.3
Poor Quality    (TDS=500, pH=6.0, Turb=10.0): WQI = 9.7
Very Poor Quality (TDS=800, pH=5.5, Turb=15.0): WQI = 9.9


In [None]:
import pickle
import numpy as np
import os
import pandas as pd

# ==================== WATER QUALITY PREDICTOR ====================

class WaterQualityPredictor:
    def __init__(self, model_path="artifacts/"):
        self.model_path = model_path
        self.model = None
        self.features = ["TDS_mg_L", "pH", "turbidity_NTU"]
        self.load_model()

    def load_model(self):
        """Load the Random Forest model from pickle"""
        rf_path = os.path.join(self.model_path, "wqi_model_rf.pkl")
        if os.path.exists(rf_path):
            with open(rf_path, "rb") as f:
                self.model = pickle.load(f)
            print("✅ Random Forest model loaded successfully")
        else:
            raise FileNotFoundError("❌ Random Forest model not found in artifacts/")

    def predict(self, tds, ph, turbidity):
        """Predict WQI value"""
        if self.model is None:
            raise ValueError("Model not loaded!")
        input_data = np.array([[tds, ph, turbidity]])
        prediction = self.model.predict(input_data)[0]
        return max(0, min(100, prediction))  # keep 0–100

    def get_water_quality_category(self, wqi_score):
        """Convert WQI to quality category"""
        if wqi_score >= 90:
            return "Excellent", "Safe for all purposes"
        elif wqi_score >= 70:
            return "Good", "Safe for drinking with minor treatment"
        elif wqi_score >= 50:
            return "Fair", "Suitable for irrigation and industrial use"
        elif wqi_score >= 25:
            return "Poor", "Suitable only for irrigation"
        else:
            return "Very Poor", "Unsuitable for any use - requires extensive treatment"


# ==================== HEALTH SYSTEM INTEGRATION ====================

def integrate_with_health_system():
    predictor = WaterQualityPredictor()

    # Simulated sensor data
    sensor_data = {
        "tds": 450,
        "ph": 6.3,
        "turbidity": 9.5,
        "timestamp": pd.Timestamp.now().isoformat()
    }

    # Predict WQI and health risk
    wqi = predictor.predict(sensor_data["tds"], sensor_data["ph"], sensor_data["turbidity"])
    category, _ = predictor.get_water_quality_category(wqi)

    if wqi < 50:
        risk = "HIGH"
        alert = "🚨 WATER QUALITY ALERT: High risk of waterborne diseases!"
        actions = [
            "Issue boil water advisory",
            "Distribute water purification tablets",
            "Increase health monitoring",
            "Test alternative water sources",
            "Chlorinate the water thoroughly",
            "Use point-of-use water filters before drinking",
            "Provide ORS and basic medication for common waterborne diseases"
        ]
    elif wqi < 70:
        risk = "MODERATE"
        alert = "⚠️ Water quality below recommended standards"
        actions = [
            "Advise basic water treatment",
            "Monitor health reports closely"
        ]
    else:
        risk = "LOW"
        alert = "✅ Water quality within acceptable range"
        actions = ["Continue routine monitoring"]

    return {
        "wqi_score": round(wqi, 1),
        "quality_category": category,
        "health_risk_level": risk,
        "alert_message": alert,
        "recommended_actions": actions,
        "timestamp": sensor_data["timestamp"]
    }


# ==================== MAIN ====================

if __name__ == "__main__":
    result = integrate_with_health_system()

    print("\n📋 Integration Result for Health System")
    print("=" * 60)
    print(f"🔢 WQI Score: {result['wqi_score']} ({result['quality_category']})")
    print(f"⚠️  Health Risk: {result['health_risk_level']}")
    print(f"📢 Alert: {result['alert_message']}")
    print("📝 Recommended Actions:")
    for action in result["recommended_actions"]:
        print(f"   • {action}")


✅ Random Forest model loaded successfully

📋 Integration Result for Health System
🔢 WQI Score: 9.8 (Very Poor)
⚠️  Health Risk: HIGH
📢 Alert: 🚨 WATER QUALITY ALERT: High risk of waterborne diseases!
📝 Recommended Actions:
   • Issue boil water advisory
   • Distribute water purification tablets
   • Increase health monitoring
   • Test alternative water sources
   • Chlorinate the water thoroughly
   • Use point-of-use water filters before drinking
   • Provide ORS and basic medication for common waterborne diseases


In [2]:
import sqlite3

# Connect to database (creates water_data.db if not exists)
conn = sqlite3.connect('water_data.db')
cursor = conn.cursor()

# Create table for storing sensor readings
cursor.execute('''
CREATE TABLE IF NOT EXISTS water_readings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    tds REAL,
    ph REAL,
    turbidity REAL)
''')

conn.commit()
conn.close()

print("✅ Database and table created successfully.")


✅ Database and table created successfully.


In [5]:
import sqlite3

def save_sensor_reading(tds, ph, turbidity):
    conn = sqlite3.connect('water_data.db')
    cursor = conn.cursor()
    
    cursor.execute('''
        INSERT INTO water_readings (tds, ph, turbidity)
        VALUES (?, ?, ?)
    ''', (tds, ph, turbidity))
    
    conn.commit()
    conn.close()
    
    print("✅ Sensor data saved successfully.")
