In [11]:
# %%
import os
import math
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests
from dotenv import load_dotenv

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# GPU?
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)


Using device: cpu


In [12]:
from dotenv import load_dotenv
import os

# Load the .env file from the project root
load_dotenv(dotenv_path="AI/.env")  # adjust path if notebook is in /notebooks

OPENWEATHER_API_KEY = os.getenv("OPENWEATHER_API_KEY")
print("API key loaded:", OPENWEATHER_API_KEY)


API key loaded: a4003d7f23905ff2209adeff29fc386f


In [13]:
# %%
# === CONFIG ===
CSV_PATH = "AI/synthetic_dataset (1).csv"   # path to your CSV
WINDOW = 14           
BATCH_SIZE = 64
LR = 1e-3
EPOCHS = 80
PATIENCE = 8         

FEATURE_COLS = ["temp", "humidity", "rainfall", "wqi", "diarrhea", "cholera", "typhoid"]
TARGET_COLS = ["diarrhea", "cholera", "typhoid"]
MODEL_OUT = "cnn_lstm_best.pth"
SEED = 42

# 🔑 Load API key from .env file
load_dotenv()
OPENWEATHER_API_KEY = os.getenv("OPENWEATHER_API_KEY")
print("OPENWEATHER_API_KEY loaded:", OPENWEATHER_API_KEY is not None)

torch.manual_seed(SEED)
np.random.seed(SEED)


OPENWEATHER_API_KEY loaded: True


In [14]:
# %%
df = pd.read_csv(CSV_PATH, parse_dates=["Date"], dayfirst=True, infer_datetime_format=True)
df = df.sort_values("Date").reset_index(drop=True)

df[FEATURE_COLS] = df[FEATURE_COLS].apply(pd.to_numeric, errors='coerce').fillna(0.0)
df[TARGET_COLS] = df[TARGET_COLS].astype(int)

# Add difference features
for col in ["temp", "humidity", "rainfall"]:
    df[f"{col}_diff"] = df[col].diff().fillna(0)

FEATURE_COLS_EXT = FEATURE_COLS + [f"{col}_diff" for col in ["temp", "humidity", "rainfall"]]

print("Dataset preview:")
print(df.head())
print("\nFeature columns:", FEATURE_COLS_EXT)


Dataset preview:
         Date       temp   humidity  rainfall        wqi  diarrhea  cholera  \
0  2024-01-01  19.461478  71.675727       0.0  51.414393        62       86   
1  2024-01-02  22.791228  70.747452       0.0  51.734313        60       93   
2  2024-01-03  19.489235  72.009741       0.0  49.689902        56       90   
3  2024-01-04  22.852806  69.314989       0.0  48.437165        72       81   
4  2024-01-05  22.272993  69.781259       0.0  47.146171        72      106   

   typhoid  total_cases  temp_diff  humidity_diff  rainfall_diff  
0       41          189   0.000000       0.000000            0.0  
1       33          186   3.329750      -0.928276            0.0  
2       40          186  -3.301993       1.262289            0.0  
3       51          204   3.363571      -2.694752            0.0  
4       46          224  -0.579813       0.466270            0.0  

Feature columns: ['temp', 'humidity', 'rainfall', 'wqi', 'diarrhea', 'cholera', 'typhoid', 'temp_diff', '

  df = pd.read_csv(CSV_PATH, parse_dates=["Date"], dayfirst=True, infer_datetime_format=True)


In [15]:
# %%
class SeqDataset(Dataset):
    def __init__(self, df, feature_cols, target_cols, window=WINDOW, scaler_x=None, scaler_y=None):
        self.df = df.reset_index(drop=True)
        self.feature_cols = feature_cols
        self.target_cols = target_cols
        self.window = window

        X = self.df[self.feature_cols].values.astype(np.float32)
        Y = self.df[self.target_cols].values.astype(np.float32)

        self.scaler_x = MinMaxScaler() if scaler_x is None else scaler_x
        self.scaler_y = MinMaxScaler() if scaler_y is None else scaler_y
        if scaler_x is None:
            self.scaler_x.fit(X)
        if scaler_y is None:
            self.scaler_y.fit(Y)

        Xs = self.scaler_x.transform(X)
        Ys = self.scaler_y.transform(Y)

        self.samples = []
        for i in range(window, len(self.df)):
            x_win = Xs[i-window:i]
            y_target = Ys[i]
            self.samples.append((x_win.astype(np.float32), y_target.astype(np.float32)))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x, y = self.samples[idx]
        return x, y


In [16]:
# %%
n = len(df)
train_end = int(0.7 * n)
val_end = int(0.85 * n)

df_train = df.iloc[:train_end].reset_index(drop=True)
df_val = df.iloc[train_end - WINDOW:val_end].reset_index(drop=True)
df_test = df.iloc[val_end - WINDOW:].reset_index(drop=True)

train_ds = SeqDataset(df_train, FEATURE_COLS_EXT, TARGET_COLS, window=WINDOW)
val_ds = SeqDataset(df_val, FEATURE_COLS_EXT, TARGET_COLS, window=WINDOW,
                    scaler_x=train_ds.scaler_x, scaler_y=train_ds.scaler_y)
test_ds = SeqDataset(df_test, FEATURE_COLS_EXT, TARGET_COLS, window=WINDOW,
                     scaler_x=train_ds.scaler_x, scaler_y=train_ds.scaler_y)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

print("Dataset sizes:", len(train_ds), len(val_ds), len(test_ds))


Dataset sizes: 8386 1800 1800


In [17]:
# %%
class CNNLSTM(nn.Module):
    def __init__(self, n_features, cnn_channels=64, lstm_hidden=128, lstm_layers=1, out_dim=3, dropout=0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(n_features, cnn_channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(cnn_channels)

        self.conv2 = nn.Conv1d(cnn_channels, cnn_channels*2, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(cnn_channels*2)

        self.conv3 = nn.Conv1d(cnn_channels*2, cnn_channels*4, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(cnn_channels*4)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        self.lstm = nn.LSTM(input_size=cnn_channels*4, hidden_size=lstm_hidden,
                            num_layers=lstm_layers, batch_first=True)

        self.fc = nn.Sequential(
            nn.Linear(lstm_hidden, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, out_dim)
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)   # (batch, features, seq)
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = x.permute(0, 2, 1)   # (batch, seq, features)
        x, _ = self.lstm(x)
        out = x[:, -1, :]
        return self.fc(out)

model = CNNLSTM(n_features=len(FEATURE_COLS_EXT), out_dim=len(TARGET_COLS)).to(DEVICE)
print(model)


CNNLSTM(
  (conv1): Conv1d(10, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(256, 128, batch_first=True)
  (fc): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=3, bias=True)
  )
)


In [18]:
# %%
def train_model_full_epochs(model, train_loader, val_loader, epochs=EPOCHS, lr=LR):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=4)
    best_val = 1e9

    for epoch in range(1, epochs+1):
        model.train()
        train_losses = []
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            pred = model(X)
            loss = criterion(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for Xv, yv in val_loader:
                Xv, yv = Xv.to(DEVICE), yv.to(DEVICE)
                pv = model(Xv)
                val_losses.append(criterion(pv, yv).item())

        train_loss, val_loss = np.mean(train_losses), np.mean(val_losses)
        scheduler.step(val_loss)
        print(f"Epoch {epoch}: Train {train_loss:.6f}, Val {val_loss:.6f}")

        # Save model if validation improves
        if val_loss < best_val - 1e-6:
            best_val = val_loss
            torch.save(model.state_dict(), MODEL_OUT)
            print(f"  Saved best model at epoch {epoch}")

    model.load_state_dict(torch.load(MODEL_OUT, map_location=DEVICE))
    return model

# Train model for all epochs
model = train_model_full_epochs(model, train_loader, val_loader)


Epoch 1: Train 0.028542, Val 0.015706
  Saved best model at epoch 1
Epoch 2: Train 0.019213, Val 0.015464
  Saved best model at epoch 2
Epoch 3: Train 0.018341, Val 0.015226
  Saved best model at epoch 3
Epoch 4: Train 0.017434, Val 0.015107
  Saved best model at epoch 4
Epoch 5: Train 0.016957, Val 0.015176
Epoch 6: Train 0.016498, Val 0.015020
  Saved best model at epoch 6
Epoch 7: Train 0.016255, Val 0.015282
Epoch 8: Train 0.015885, Val 0.015050
Epoch 9: Train 0.015311, Val 0.015055
Epoch 10: Train 0.015218, Val 0.015719
Epoch 11: Train 0.014849, Val 0.015197
Epoch 12: Train 0.014347, Val 0.015226
Epoch 13: Train 0.014075, Val 0.015185
Epoch 14: Train 0.014071, Val 0.015245
Epoch 15: Train 0.013688, Val 0.015812
Epoch 16: Train 0.013435, Val 0.015470
Epoch 17: Train 0.012874, Val 0.015568
Epoch 18: Train 0.012399, Val 0.015834
Epoch 19: Train 0.012008, Val 0.015880
Epoch 20: Train 0.011823, Val 0.016169
Epoch 21: Train 0.011516, Val 0.016230
Epoch 22: Train 0.010885, Val 0.016838
E

In [19]:
# %%
def classify_risk(disease, value):
    thresholds = {
        "diarrhea": {"low": 30, "medium": 60},
        "cholera": {"low": 50, "medium": 80},
        "typhoid": {"low": 20, "medium": 40}
    }
    
    if value <= thresholds[disease]["low"]:
        return "LOW ✅ — normal levels"
    elif value <= thresholds[disease]["medium"]:
        return "MEDIUM ⚠️ — monitor closely"
    else:
        return "HIGH 🚨 — take preventive measures"



In [20]:
# %%
def predict_next_day(model, df_recent, lat, lon):
    """Use live weather + model to predict next-day diseases"""
    try:
        weather_data = requests.get(
            f"https://api.openweathermap.org/data/2.5/weather?lat={lat}&lon={lon}&appid={OPENWEATHER_API_KEY}&units=metric"
        ).json()
        if "main" not in weather_data:
            raise ValueError("Weather API returned no 'main'")
        live_weather = {
            'temp': round(weather_data['main']['temp'], 2),
            'humidity': round(weather_data['main']['humidity'], 2),
            'rainfall': round(weather_data.get('rain', {}).get('1h', 0), 2)
        }
        print("Weather API raw response:", weather_data)
    except Exception as e:
        print("⚠️ Weather API failed:", e)
        live_weather = {
            'temp': df_recent['temp'].iloc[-1],
            'humidity': df_recent['humidity'].iloc[-1],
            'rainfall': df_recent['rainfall'].iloc[-1]
        }

    df_next = df_recent.copy()
    df_next['temp_diff'] = df_next['temp'].diff().fillna(0)
    df_next['humidity_diff'] = df_next['humidity'].diff().fillna(0)
    df_next['rainfall_diff'] = df_next['rainfall'].diff().fillna(0)

    df_next.iloc[-1, df_next.columns.get_loc('temp')] = live_weather['temp']
    df_next.iloc[-1, df_next.columns.get_loc('humidity')] = live_weather['humidity']
    df_next.iloc[-1, df_next.columns.get_loc('rainfall')] = live_weather['rainfall']

    df_next.iloc[-1, df_next.columns.get_loc('temp_diff')] = df_next['temp'].iloc[-1] - df_next['temp'].iloc[-2]
    df_next.iloc[-1, df_next.columns.get_loc('humidity_diff')] = df_next['humidity'].iloc[-1] - df_next['humidity'].iloc[-2]
    df_next.iloc[-1, df_next.columns.get_loc('rainfall_diff')] = df_next['rainfall'].iloc[-1] - df_next['rainfall'].iloc[-2]

    X_next = df_next[FEATURE_COLS_EXT].values.astype(np.float32)
    X_next_scaled = train_ds.scaler_x.transform(X_next)
    X_next_tensor = torch.tensor(X_next_scaled).unsqueeze(0).to(DEVICE)

    model.eval()
    with torch.no_grad():
        pred_scaled = model(X_next_tensor).cpu().numpy()

    pred = train_ds.scaler_y.inverse_transform(pred_scaled)[0]
    # Round predictions to integer
    pred_int = [int(round(p)) for p in pred]
    result = {col: pred_int[i] for i, col in enumerate(TARGET_COLS)}
    result.update(live_weather)
    return result


In [21]:
# %%
recent_df = df.iloc[-WINDOW:].copy()
next_day_pred = predict_next_day(model, recent_df, lat=13.6576 , lon=78.2624)

print("🌍 Next-day Disease Forecast with Risk Alerts")
for disease in ["diarrhea", "cholera", "typhoid"]:
    value = next_day_pred[disease]
    alert = classify_risk(disease, value)
    print(f"{disease.capitalize()}: {value} cases → {alert}")

print(f"\nWeather Conditions: 🌡️ {next_day_pred['temp']}°C, 💧 {next_day_pred['humidity']}%, 🌧️ {next_day_pred['rainfall']}mm")


Weather API raw response: {'coord': {'lon': 78.2624, 'lat': 13.6576}, 'weather': [{'id': 804, 'main': 'Clouds', 'description': 'overcast clouds', 'icon': '04n'}], 'base': 'stations', 'main': {'temp': 22.44, 'feels_like': 22.93, 'temp_min': 22.44, 'temp_max': 22.44, 'pressure': 1010, 'humidity': 84, 'sea_level': 1010, 'grnd_level': 930}, 'visibility': 10000, 'wind': {'speed': 5.14, 'deg': 283, 'gust': 12.49}, 'clouds': {'all': 100}, 'dt': 1758907385, 'sys': {'country': 'IN', 'sunrise': 1758846965, 'sunset': 1758890433}, 'timezone': 19800, 'id': 1264621, 'name': 'Madanapalle', 'cod': 200}
🌍 Next-day Disease Forecast with Risk Alerts
Diarrhea: 68 cases → HIGH 🚨 — take preventive measures
Cholera: 96 cases → HIGH 🚨 — take preventive measures
Typhoid: 45 cases → HIGH 🚨 — take preventive measures

Weather Conditions: 🌡️ 22.44°C, 💧 84%, 🌧️ 0mm
