In [None]:
pip install pandas openpyxl




In [None]:
import pandas as pd

# --- Configuration ---
# Source workbook. The path points into Google Drive, so Drive has to be
# mounted at /content/drive before the load step can find the file.
INPUT_FILE = "/content/drive/MyDrive/cur_water_data_labeled.xlsx"
# Destination workbook. This is a bare filename, so it is written relative to
# the current working directory (not to the Drive folder above).
OUTPUT_FILE = "cur_water_data_with_month_numbers.xlsx"
# Column holding the sampling date from which Year/Month/Season are derived.
DATE_COLUMN = "sample_date"

def get_season(month_number):
    """
    Map a calendar month number to the season label used in this dataset.

    Months 3-6 map to "Summer" and months 7-10 map to "Monsoon". Every other
    value (in practice 11, 12, 1 and 2) falls through to "Winter".
    """
    summer_months = (3, 4, 5, 6)
    monsoon_months = (7, 8, 9, 10)

    if month_number in summer_months:
        return "Summer"
    if month_number in monsoon_months:
        return "Monsoon"
    # Anything that is not Summer or Monsoon is treated as Winter.
    return "Winter"

# --- Main Script ---
try:
    # 1. Load the dataset from the configured path. Reading via INPUT_FILE
    #    (instead of a second hard-coded copy of the path) guarantees that the
    #    file being read is the one named in the log and error messages.
    print(f"Reading data from '{INPUT_FILE}'...")
    df = pd.read_excel(INPUT_FILE, engine='openpyxl')
    print("✅ Data loaded successfully.")

    # 2. Parse the date column. Values that cannot be parsed become NaT and
    #    their rows are removed, because no Year/Month/Season can be derived.
    parsed_dates = pd.to_datetime(df[DATE_COLUMN], errors='coerce')
    has_valid_date = parsed_dates.notna()
    n_dropped = int((~has_valid_date).sum())
    if n_dropped:
        # Report the loss instead of shrinking the dataset silently.
        print(f"⚠️ Dropped {n_dropped} row(s) whose '{DATE_COLUMN}' could not be parsed.")
    df = df.loc[has_valid_date].copy()
    parsed_dates = parsed_dates.loc[has_valid_date]
    print(f"✅ Converted '{DATE_COLUMN}' to a usable date format.")

    # 3. Create the new 'Year', 'Month', and 'Season' columns.
    #    The parsed dates are kept in a separate Series, so no temporary
    #    column has to be added to (and later removed from) the DataFrame.
    df['Year'] = parsed_dates.dt.year
    df['Month'] = parsed_dates.dt.month  # Month number (1-12)
    df['Season'] = df['Month'].apply(get_season)  # Season is derived from the month number

    print("✅ New 'Year', 'Month' (as a number), and 'Season' columns created.")

    # 4. Show a preview of the new columns
    print("\n--- Preview of the new data ---")
    print(df[[DATE_COLUMN, 'Year', 'Month', 'Season']].head())

    # 5. Save the updated DataFrame to a new Excel file
    df.to_excel(OUTPUT_FILE, index=False)
    print(f"\n✅ Success! New file saved as '{OUTPUT_FILE}'")

except FileNotFoundError:
    # INPUT_FILE is an absolute Google Drive path, so the usual causes are an
    # unmounted Drive or a wrong path - not a file missing from "this folder".
    print(f"❌ Error: Input file '{INPUT_FILE}' not found. Please check the path and make sure Google Drive is mounted.")
except Exception as e:
    # Best-effort reporting so the notebook keeps running after a failure here.
    print(f"An error occurred: {e}")

Reading data from '/content/drive/MyDrive/cur_water_data_labeled.xlsx'...
❌ Error: Input file '/content/drive/MyDrive/cur_water_data_labeled.xlsx' not found. Please make sure it's in the same folder.


In [None]:
!pip uninstall flask-ngrok -y
!pip install pyngrok

[0mCollecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [None]:
pip install pandas openpyxl



In [None]:
import pandas as pd
import re

# --- Configuration ---
# ⬇️ UPDATE THIS with the path to your Excel file ⬇️
# NOTE(review): the month-number cell earlier in this notebook saves its
# workbook under a bare filename (current working directory), whereas this
# path points into Google Drive - confirm the file has been copied there and
# that Drive is mounted before running this cell.
INPUT_FILE = "/content/drive/MyDrive/cur_water_data_with_month_numbers.xlsx"
# Name for the new, cleaned output file
# NOTE(review): the modelling cells read 'cur_water_data_cleaned.csv'; this
# name differs - confirm which file they are expected to consume.
OUTPUT_FILE = "water_data_cleaned.csv"

try:
    # 1. Load the dataset from the Excel file
    print(f"--- Loading Data from '{INPUT_FILE}' ---")
    df = pd.read_excel(INPUT_FILE, engine='openpyxl')
    print("✅ Data loaded successfully.")

    # --- 2. Clean the Data ---
    print("\n--- Cleaning Data ---")

    # List of columns that might contain non-numeric text like '(BDL)'
    # Add any other columns you need to clean to this list
    cols_to_clean = [
        'bod', 'cod', 'nitrate', 'FecalColiform',
        # Adding other potential columns just in case names vary
        'B.O.D. (mg/L)', 'C.O.D. (mg/L)', 'Nitrate (mg/L)', 'Fecal Coliform (MPN/100 ml)'
    ]

    # First number in the text: optional sign, integer or decimal part, and an
    # optional exponent. The sign and exponent matter because every value is
    # stringified first: a float such as 0.00001 is rendered as '1e-05', and a
    # digits-only pattern would turn it into 1.0 (and -2.5 into 2.5).
    number_pattern = r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)'

    cleaned_cols_found = []
    for col in cols_to_clean:
        if col not in df.columns:
            continue
        cleaned_cols_found.append(col)
        # Stringify to cope with mixed numeric/text cells; missing values
        # become 'nan', match nothing, and therefore stay NaN.
        as_text = df[col].astype(str)
        first_number = as_text.str.extract(number_pattern, expand=False)
        # Convert the extracted text back to a numeric dtype
        df[col] = pd.to_numeric(first_number, errors='coerce')

    if cleaned_cols_found:
        print(f"✅ Cleaned the following columns by removing text like '(BDL)': {cleaned_cols_found}")
    else:
        print("⚠️ No columns with names matching the cleaning list were found.")

    # --- 3. Show a Preview ---
    print("\n--- Preview of Cleaned Data ---")
    # Display the first 5 rows of the cleaned columns to verify the result
    if cleaned_cols_found:
        print(df[cleaned_cols_found].head())
    else:
        print("No columns were cleaned.")

    # --- 4. Save the Cleaned Data ---
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\n✅ Success! Cleaned data has been saved to '{OUTPUT_FILE}'")

except FileNotFoundError:
    print(f"❌ Error: Input file '{INPUT_FILE}' not found. Please check the name and path.")
except Exception as e:
    # Best-effort reporting so the notebook keeps running after a failure here.
    print(f"An error occurred: {e}")

--- Loading Data from '/content/drive/MyDrive/cur_water_data_with_month_numbers.xlsx' ---
❌ Error: Input file '/content/drive/MyDrive/cur_water_data_with_month_numbers.xlsx' not found. Please check the name and path.


In [None]:
pip install pandas openpyxl scikit-learn



**MODEL TRAINING**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# ==============================================================================
# 1. LOAD AND CLEAN THE DATA
# ==============================================================================
print("--- 1. Loading and Cleaning Data ---")

TARGET_COLUMN = 'wqi'
# Text columns that are one-hot encoded in step 2. They must be excluded from
# the numeric clean-up below: pulling "the first number" out of a station or
# season name yields NaN for every row, after which get_dummies produces no
# columns at all and the feature silently disappears from the model.
CATEGORICAL_COLUMNS = ['Station Name', 'Season']
# Columns whose text must be left untouched by the numeric clean-up.
TEXT_COLUMNS_TO_KEEP = set(CATEGORICAL_COLUMNS) | {'sample_date'}
# First number in a string: optional sign, integer/decimal part, optional exponent.
NUMBER_PATTERN = r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)'

# Use the filename provided from your upload
file_path = "/content/drive/MyDrive/cur_water_data_cleaned.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"❌ Error: File '{file_path}' not found.")
    # Re-raise so that only this cell stops; exit() is meant for scripts and
    # acts on the whole interpreter/kernel rather than on the current cell.
    raise
print(f"✅ Data loaded successfully with {len(df)} rows.")

# --- Clean the text columns that are supposed to be numeric ---
# This ensures columns like 'bod', 'cod', etc., are purely numeric
for col in df.columns:
    if col in TEXT_COLUMNS_TO_KEEP or pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Extracts the first number found in a string, handles cases like '0.3 (BDL)'
    first_number = df[col].astype(str).str.extract(NUMBER_PATTERN, expand=False)
    df[col] = pd.to_numeric(first_number, errors='coerce')

# --- Handle Missing Values using Median Imputation ---
# Assign the result back instead of calling fillna(inplace=True) on df[column]:
# the chained in-place form operates on a temporary object and triggers the
# pandas warning about behaviour changing in pandas 3.0.
for column in df.select_dtypes(include=np.number).columns:
    df[column] = df[column].fillna(df[column].median())
print("✅ Data cleaned and missing values handled.")

# ==============================================================================
# 2. PREPARE DATA FOR THE MODEL
# ==============================================================================
print("\n--- 2. Preparing Data ---")

# --- Define Target (y) and Initial Features (X) ---
# Drop the target and columns directly derived from it to prevent data leakage
# Also drop the original date string, as we have Year and Month
X = df.drop(columns=[TARGET_COLUMN, 'wqi_index', 'sample_date'], errors='ignore')
y = df[TARGET_COLUMN]

# --- Handle Categorical Features using One-Hot Encoding ---
# This converts text columns like 'Station Name' and 'Season' into a numeric format.
# Only columns actually present are encoded, so a missing column does not
# abort the cell with a KeyError.
categorical_present = [col for col in CATEGORICAL_COLUMNS if col in X.columns]
if categorical_present:
    # dtype=float keeps the indicator columns numeric for the scaler below.
    X = pd.get_dummies(X, columns=categorical_present, drop_first=True, dtype=float)

print(f"🎯 Target variable: '{TARGET_COLUMN}'")
print(f"📖 Final features being used ({len(X.columns)}): {X.columns.tolist()}")

# --- Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\n✅ Data split into training and testing sets.")

# --- Scale the Features ---
# The scaler is fitted on the training split only and then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Features scaled successfully.")

# ==============================================================================
# 3. TRAIN THE PREDICTIVE MODEL
# ==============================================================================
print("\n--- 3. Training the Model ---")

# We will use a RandomForestRegressor, which is excellent for this type of problem
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model on the prepared training data
model.fit(X_train_scaled, y_train)
print("✅ Model trained successfully!")

# ==============================================================================
# 4. EVALUATE THE MODEL'S PERFORMANCE
# ==============================================================================
print("\n--- 4. Evaluating Model Performance ---")

# Make predictions on the unseen test data
y_pred = model.predict(X_test_scaled)

# --- Print Performance Metrics ---
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.2f}")

# ==============================================================================
# 5. UNDERSTAND WHAT DRIVES THE PREDICTION (FEATURE IMPORTANCE)
# ==============================================================================
print("\n--- 5. Feature Importance ---")

# Create a DataFrame to see which features were most important to the model.
# X.columns is in the same order as the columns of the scaled training matrix.
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("Top features influencing the prediction:")
print(importance_df.head(10)) # Display top 10 features

--- 1. Loading and Cleaning Data ---
✅ Data loaded successfully with 900 rows.
✅ Data cleaned and missing values handled.

--- 2. Preparing Data ---
🎯 Target variable: 'wqi'
📖 Final features being used (10): ['latitude', 'longitude', 'pH', 'dissolvedoxygen', 'bod', 'cod', 'nitrate', 'FecalColiform', 'Year', 'Month']

✅ Data split into training and testing sets.
✅ Features scaled successfully.

--- 3. Training the Model ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)


✅ Model trained successfully!

--- 4. Evaluating Model Performance ---
Mean Absolute Error (MAE): 1.18
R-squared (R²): 0.98

--- 5. Feature Importance ---
Top features influencing the prediction:
           Feature  Importance
3  dissolvedoxygen    0.487083
4              bod    0.269537
5              cod    0.152543
7    FecalColiform    0.060365
2               pH    0.024013
1        longitude    0.003263
6          nitrate    0.001057
9            Month    0.000911
0         latitude    0.000645
8             Year    0.000583


In [None]:
# Mount Google Drive at /content/drive so that the cells reading from
# '/content/drive/MyDrive/...' can find their input files.
# NOTE(review): several cells above this one already read from that location,
# so on a fresh runtime this cell has to be executed before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**XGBoost model used for the final project**

In [None]:
pip install pandas scikit-learn xgboost lightgbm



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import joblib


# ==============================================================================
# 1. LOAD AND PREPARE THE DATA
# ==============================================================================
print("--- 1. Loading and Preparing Data for XGBoost ---")
# Use the cleaned data file you provided
# NOTE(review): this is a path relative to the working directory, while the
# other modelling cells read the same filename from Google Drive - confirm
# both refer to the same data.
file_path = "cur_water_data_cleaned.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"❌ Error: File '{file_path}' not found.")
    # Re-raise so that only this cell stops; exit() is meant for scripts and
    # acts on the whole interpreter/kernel rather than on the current cell.
    raise
print(f"✅ Data loaded successfully with {len(df)} rows.")

# --- Define Target (y) and Features (X) ---
TARGET_COLUMN = 'wqi'
# For this model, we'll focus on the core numerical measurements
features_to_use = [
    'pH', 'dissolvedoxygen', 'bod', 'cod', 'nitrate', 'FecalColiform',
    'Year', 'Month', 'latitude', 'longitude'
]
X = df[features_to_use].copy() # Create a copy to avoid SettingWithCopyWarning
y = df[TARGET_COLUMN]

print(f"🎯 Target variable: '{TARGET_COLUMN}'")
print(f"📖 Features ({len(X.columns)}) being used: {X.columns.tolist()}")

# --- Clean the selected features that are not numeric yet ---
# Columns that pandas already parsed as numbers are left alone. Round-tripping
# them through text would strip minus signs (e.g. from coordinates) and mangle
# values rendered in scientific notation ('1e-05' -> 1).
# Pattern: optional sign, integer/decimal part, optional exponent.
NUMBER_PATTERN = r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)'
for col in features_to_use:
    if pd.api.types.is_numeric_dtype(X[col]):
        continue
    # Extracts the first number found in a string, handles cases like '0.3 (BDL)'
    first_number = X[col].astype(str).str.extract(NUMBER_PATTERN, expand=False)
    X[col] = pd.to_numeric(first_number, errors='coerce')

# --- Handle Missing Values using Median Imputation ---
# Assign the result back instead of calling fillna(inplace=True) on X[column]:
# the chained in-place form operates on a temporary object and triggers the
# pandas warning about behaviour changing in pandas 3.0.
for column in X.select_dtypes(include=np.number).columns:
    X[column] = X[column].fillna(X[column].median())
print("✅ Data cleaned and missing values handled.")


# --- Split and Scale Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Persist the unscaled training features for later use outside this cell.
joblib.dump(X_train, "X_train_wqi.pkl")
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Data prepared, split, and scaled successfully.")

# ==============================================================================
# 2. TRAIN AND EVALUATE XGBOOST MODEL
# ==============================================================================
print("\n--- 2. Training XGBoost Model ---")
# Initialize and train the XGBRegressor
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train_scaled, y_train)
print("✅ XGBoost model trained successfully!")

# --- Evaluate Performance ---
print("\n--- XGBoost Model Performance ---")
y_pred_xgb = xgb_model.predict(X_test_scaled)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"Mean Absolute Error (MAE): {mae_xgb:.2f}")
print(f"R-squared (R²): {r2_xgb:.2f}")

# --- Feature Importance ---
# X.columns is in the same order as the columns of the scaled training matrix.
xgb_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop features (XGBoost):")
print(xgb_importance_df.head())

--- 1. Loading and Preparing Data for XGBoost ---
✅ Data loaded successfully with 900 rows.
🎯 Target variable: 'wqi'
📖 Features (10) being used: ['pH', 'dissolvedoxygen', 'bod', 'cod', 'nitrate', 'FecalColiform', 'Year', 'Month', 'latitude', 'longitude']
✅ Data cleaned and missing values handled.
✅ Data prepared, split, and scaled successfully.

--- 2. Training XGBoost Model ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(median_value, inplace=True)


✅ XGBoost model trained successfully!

--- XGBoost Model Performance ---
Mean Absolute Error (MAE): 0.95
R-squared (R²): 0.99

Top features (XGBoost):
           Feature  Importance
1  dissolvedoxygen    0.480543
2              bod    0.406799
5    FecalColiform    0.078269
3              cod    0.019331
0               pH    0.011481


In [None]:
import pickle

# Package everything needed at prediction time into one object: the trained
# regressor, the scaler fitted alongside it, and the ordered feature names.
model_bundle = dict(
    model=xgb_model,
    scaler=scaler,
    features=features_to_use,
)

# Serialize the bundle to disk as a single pickle file.
with open("xgboost_model.pkl", "wb") as model_file:
    pickle.dump(model_bundle, model_file)

**LightGBM model (not used for the final model)**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# ==============================================================================
# 1. LOAD AND PREPARE THE DATA
# ==============================================================================
print("--- 1. Loading and Preparing Data for LightGBM ---")
# Use the cleaned data file you provided
file_path = "/content/drive/MyDrive/cur_water_data_cleaned.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"❌ Error: File '{file_path}' not found.")
    # Re-raise so that only this cell stops; exit() is meant for scripts and
    # acts on the whole interpreter/kernel rather than on the current cell.
    raise
print(f"✅ Data loaded successfully with {len(df)} rows.")

# --- Define Target (y) and Features (X) ---
TARGET_COLUMN = 'wqi'
# For this model, we'll focus on the core numerical measurements
features_to_use = [
    'pH', 'dissolvedoxygen', 'bod', 'cod', 'nitrate', 'FecalColiform',
    'Year', 'Month', 'latitude', 'longitude'
]
X = df[features_to_use].copy() # Create a copy to avoid SettingWithCopyWarning
y = df[TARGET_COLUMN]

print(f"🎯 Target variable: '{TARGET_COLUMN}'")
print(f"📖 Features ({len(X.columns)}) being used: {X.columns.tolist()}")

# --- Clean the selected features that are not numeric yet ---
# Columns that pandas already parsed as numbers are left alone. Round-tripping
# them through text would strip minus signs (e.g. from coordinates) and mangle
# values rendered in scientific notation ('1e-05' -> 1).
# Pattern: optional sign, integer/decimal part, optional exponent.
NUMBER_PATTERN = r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)'
for col in features_to_use:
    if pd.api.types.is_numeric_dtype(X[col]):
        continue
    # Extracts the first number found in a string, handles cases like '0.3 (BDL)'
    first_number = X[col].astype(str).str.extract(NUMBER_PATTERN, expand=False)
    X[col] = pd.to_numeric(first_number, errors='coerce')

# --- Handle Missing Values using Median Imputation ---
# Assign the result back instead of calling fillna(inplace=True) on X[column]:
# the chained in-place form operates on a temporary object and triggers the
# pandas warning about behaviour changing in pandas 3.0.
for column in X.select_dtypes(include=np.number).columns:
    X[column] = X[column].fillna(X[column].median())
print("✅ Data cleaned and missing values handled.")


# --- Split and Scale Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Data prepared, split, and scaled successfully.")

# ==============================================================================
# 2. TRAIN AND EVALUATE LIGHTGBM MODEL
# ==============================================================================
print("\n--- 2. Training LightGBM Model ---")
# Initialize and train the LGBMRegressor
lgbm_model = lgb.LGBMRegressor(random_state=42)
lgbm_model.fit(X_train_scaled, y_train)
print("✅ LightGBM model trained successfully!")

# --- Evaluate Performance ---
print("\n--- LightGBM Model Performance ---")
y_pred_lgbm = lgbm_model.predict(X_test_scaled)
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)

print(f"Mean Absolute Error (MAE): {mae_lgbm:.2f}")
print(f"R-squared (R²): {r2_lgbm:.2f}")

# --- Feature Importance ---
# X.columns is in the same order as the columns of the scaled training matrix.
lgbm_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': lgbm_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop features (LightGBM):")
print(lgbm_importance_df.head())

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np

# --- 1. Load the Dataset ---
# The cleaned CSV is read from Google Drive, so Drive must be mounted first.
data_path = '/content/drive/MyDrive/cur_water_data_cleaned.csv'
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: '{data_path}' not found. Please check the file path.")
    # Re-raise so that only this cell stops; exit() is meant for scripts and
    # acts on the whole interpreter/kernel rather than on the current cell.
    raise
print("Dataset loaded successfully!")

# --- 2. Define Features and Target ---
# Select the numerical columns to be used as features for the model
numerical_features = [
    'latitude', 'longitude', 'pH', 'dissolvedoxygen',
    'bod', 'cod', 'nitrate', 'FecalColiform', 'Year', 'Month'
]
# Define the target variable we want to predict
target = 'wqi'

# --- 3. Data Cleaning and Preprocessing ---
# Convert each feature (and the target) to numeric, turning non-numeric values
# into NaN, then fill the gaps with the column mean.
# The filled Series is assigned back instead of using fillna(inplace=True) on
# df[col]: the chained in-place form works on a temporary object and is
# flagged by pandas as something that will stop working in pandas 3.0.
# NOTE(review): missing target values are mean-imputed as in the original
# cell; dropping those rows may be preferable - confirm with the data owner.
for col in numerical_features + [target]:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].mean())

print("Data cleaning and preprocessing complete.")

# --- 4. Prepare Data for Modeling ---
# Assign the features to X and the target to y
X = df[numerical_features]
y = df[target]

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 5. Train the RandomForestRegressor Model ---
# Initialize the model with 100 trees for a good balance of performance and speed
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)
print("Model training complete.")

# --- 6. Evaluate the Model ---
# Make predictions on the unseen test data
y_pred = model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the performance of the model
print("\n--- Model Performance ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.3f}")
print("--------------------------")


# --- 7. Save the Trained Model ---
# Save the model to a file for later use
model_filename = 'random_forest_regressor_model.joblib'
joblib.dump(model, model_filename)
print(f"\nModel saved successfully as '{model_filename}'")


# --- Example of How to Load and Use the Model for a New Prediction ---
print("\n--- Prediction Example ---")
# Load the saved model
loaded_model = joblib.load(model_filename)

# Create a new data sample for prediction (example values).
# The model was fitted on a DataFrame, so the sample is given the same column
# names; the order of values must match the 'numerical_features' list.
new_sample = pd.DataFrame(
    [[19.07, 72.87, 7.5, 5.0, 10.0, 150.0, 1.5, 400.0, 2024, 9]],
    columns=numerical_features,
)
new_prediction = loaded_model.predict(new_sample)

print(f"Predicted WQI for the new sample: {new_prediction[0]:.2f}")
print("--------------------------")

**SARIMA forecasting**


In [None]:
# This command fixes the environment by reinstalling necessary libraries.
# The verbose flag will show you the progress so you know it's not stuck.

!pip uninstall -y pmdarima numpy statsmodels scipy
!pip install --verbose pmdarima

In [None]:
# Import all necessary libraries
import pandas as pd
import pmdarima as pm
import matplotlib.pyplot as plt

print("Libraries imported successfully!")

# --- 1. Load and Prepare the Data ---
try:
    df = pd.read_csv('/content/drive/MyDrive/cur_water_data_cleaned.csv')
except FileNotFoundError:
    print("Error: 'cur_water_data_cleaned.csv' not found. Please upload the file again.")
    # Nothing below can run without the data; stop here instead of failing
    # later with a NameError on 'monthly_wqi'.
    raise

df['sample_date'] = pd.to_datetime(df['sample_date'])
# Build a regular month-start series of the mean WQI per calendar month.
# resample('MS').mean() averages every sample taken within a month; plain
# asfreq('MS') would keep only observations dated exactly on the 1st and
# forward-fill everything else. Months without any sample are forward-filled.
monthly_wqi = (
    df.dropna(subset=['sample_date'])
      .set_index('sample_date')['wqi']
      .sort_index()
      .resample('MS')
      .mean()
      .ffill()
)
print("Data prepared for time-series forecasting.")
print("Latest WQI data point is for:", monthly_wqi.index[-1].strftime('%B %Y'))

# --- 2. Find the Best Model and Train It (Faster Version) ---
print("\nFinding the best SARIMA model... You will now see the progress below:")

# trace=True prints each candidate model as the stepwise search evaluates it.
# The search space (max_p, max_q) is kept small to speed the search up.
sarima_model = pm.auto_arima(monthly_wqi,
                             start_p=1, start_q=1,
                             test='adf',
                             max_p=2, max_q=2, # Reduced search space
                             m=12,             # 12 periods per seasonal cycle (monthly data)
                             d=None,
                             seasonal=True,
                             start_P=0,
                             D=1,
                             trace=True,
                             error_action='ignore',
                             suppress_warnings=True,
                             stepwise=True)

print("\nBest SARIMA model found and trained.")
print(sarima_model.summary())

# --- 3. Forecast and Visualize ---
n_periods = 3
forecast, conf_int = sarima_model.predict(n_periods=n_periods, return_conf_int=True)
forecast_dates = pd.date_range(start=monthly_wqi.index[-1] + pd.DateOffset(months=1),
                               periods=n_periods,
                               freq='MS')
# list(...) takes the forecast values positionally. If 'forecast' is already
# a pandas Series, passing it straight to pd.Series(..., index=...) would
# re-align it by label and could yield NaN where the labels differ.
forecast_series = pd.Series(list(forecast), index=forecast_dates)

print("\n--- Water Quality Index (WQI) Forecast ---")
for date, value in forecast_series.items():
    print(f"{date.strftime('%B %Y')}: {value:.2f}")
print("------------------------------------------")

plt.figure(figsize=(15, 7))
plt.plot(monthly_wqi.index, monthly_wqi, label='Historical WQI')
plt.plot(forecast_series.index, forecast_series, label='Forecasted WQI', color='red', marker='o')
plt.fill_between(forecast_series.index, conf_int[:, 0], conf_int[:, 1], color='red', alpha=0.1, label='95% Confidence Interval')
plt.title('WQI Forecast for the Next 3 Months', fontsize=16)
plt.xlabel('Date')
plt.ylabel('Average Water Quality Index (WQI)')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()