In [12]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [13]:
# Location of the cleaned dataset. A raw string already keeps backslashes
# literal, so they must not be doubled as well (doing both yields "\\\\"
# separators in the actual path).
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to the project directory so the notebook runs elsewhere.
DATA_PATH = r"C:\Users\katta\Chandapur_Lake_Water_Quality_Prediction\outputs\cleaned_chandapur_lake_water_quality.csv"

df = pd.read_csv(DATA_PATH)

In [17]:
# ===============================
# MODEL EXPERIMENTS - CLEAN CODE
# ===============================
# Trains two regressors on the numeric columns of `df` to predict
# TARGET_COLUMN and reports RMSE and R2 on a held-out split:
#   1. LinearRegression on standardized features
#   2. RandomForestRegressor on the unscaled features
# Requires `np` (numpy) and `LinearRegression` from the import cell.

TEST_SIZE = 0.2      # fraction of rows held out for evaluation
RANDOM_STATE = 42    # shared seed for the split and the random forest


def evaluate_regressor(model, train_features, test_features, train_target, test_target):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_features: Feature matrix used for fitting.
        test_features: Feature matrix used for prediction.
        train_target: Target values matching ``train_features``.
        test_target: Target values matching ``test_features``.

    Returns:
        Tuple ``(predictions, rmse, r2)`` computed on the test split.
    """
    model.fit(train_features, train_target)
    predictions = model.predict(test_features)
    rmse = np.sqrt(mean_squared_error(test_target, predictions))
    r2 = r2_score(test_target, predictions)
    return predictions, rmse, r2


print("Dataset shape:", df.shape)
print("Columns:\n", df.columns)

# -------------------------------
# Select the target column
# -------------------------------
TARGET_COLUMN = "BOD (mg/L)"

# Fail early with a readable message instead of a bare KeyError from drop().
if TARGET_COLUMN not in df.columns:
    raise KeyError(f"Target column {TARGET_COLUMN!r} not found in the dataset")

# -------------------------------
# Separate X and y
# -------------------------------
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# -------------------------------
# Handle non-numeric columns
# -------------------------------
# Non-numeric columns are dropped rather than encoded.
X = X.select_dtypes(include=[np.number])

print("X shape:", X.shape)
print("y shape:", y.shape)

# -------------------------------
# Train-test split
# -------------------------------
# NOTE(review): the recorded output shows only 23 rows, so the hold-out set
# is very small and the metrics below will be noisy; consider cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# -------------------------------
# Scaling
# -------------------------------
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------------
# Model 1: Linear Regression
# -------------------------------
lr = LinearRegression()
lr_preds, lr_rmse, lr_r2 = evaluate_regressor(
    lr, X_train_scaled, X_test_scaled, y_train, y_test
)

print("Linear Regression RMSE:", lr_rmse)
print("Linear Regression R2:", lr_r2)

# -------------------------------
# Model 2: Random Forest
# -------------------------------
# The forest is trained on the unscaled features.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=RANDOM_STATE
)
rf_preds, rf_rmse, rf_r2 = evaluate_regressor(
    rf, X_train, X_test, y_train, y_test
)

print("Random Forest RMSE:", rf_rmse)
print("Random Forest R2:", rf_r2)

Dataset shape: (23, 46)
Columns:
 Index(['Sampling Month', 'Name of Monitoring location', 'Use based class',
       'Temperature', 'Dissolved O2 (mg/L)', 'pH',
       'Conductivity (micro;mho/cm)', 'BOD (mg/L)', 'Nitrate (N) (mg/L)',
       'Nitrite (N) (mg/L)', 'Fecal coliform (MPN/100ml)',
       'Total coliform (MPN/100ml)', 'Fecal streptococci (MPN/100ml)',
       'Carbonate (CO3)', 'Bicarbonate (HCO3)', 'Turbidity (mg/L)',
       'Phenaphthalein Alkalinity (mg/L)', 'Total Alkalinity (mg/L)',
       'Chlorides (mg/L)', 'COD (mg/L)', 'Total Kjedhal Nitogen (mg/L)',
       'Ammonical-N (mg/L)', 'Total Hardness (mg/L)', 'Ca as CaCo3 (mg/L)',
       'Ca', 'Mg as CaCo3 (mg/L)', 'Mg', 'Sulphate (mg/L)', 'Sodium (mg/l)',
       'Total Dissolved Solids (mg/L)', 'Total Suspended Solids (mg/L)',
       'Phosphate (mg/L)', 'Boron (mg/L)', 'Potassium (mg/L)',
       'Fluoride (mg/L)', 'Sodium percentage', 'SAR', 'Ortho Phosphate (mg/L)',
       'Cadmium (mg/L', 'Copper (mg/L)', 'Lead (mg/L)', 