In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split


In [2]:
# Load the raw CSV inputs. All paths are relative to the notebook's working directory.
flu_ili, flu_clinicallab, flu_publichealthlab, vaccination = map(
    pd.read_csv,
    (
        'data/flu_surveillance/flu-ili-byregion-fluseason.csv',
        'data/flu_surveillance/flu-clinicallab-byregion-fluseason.csv',
        'data/flu_surveillance/flu-publichealthlab-byregion-fluseason.csv',
        'data/vaccination/hcp_flu_vaccination_by_hospital_and_county_2017-18_season_072419.csv',
    ),
)
# The first four lines of the weather file are skipped before parsing
# (presumably a metadata preamble above the header row -- confirm against the file).
weather = pd.read_csv('data/weather/avg_temp_0920.csv', skiprows=4)



In [3]:
# Preprocessing: keep only the columns that the following cells work with.
# NOTE(review): every name is rebound to its trimmed version, so re-running this
# cell without re-running the load cell fails for `weather` (its 'Date'/'Value'
# columns no longer exist after the rename below).
flu_ili = flu_ili[
    ['date_code', 'Total_ILI', 'Total_Patients_Seen', 'Percent_ILI']
]
flu_clinicallab = flu_clinicallab[
    ['date_code', 'Number_Positive', 'Percent_Positive']
]
flu_publichealthlab = flu_publichealthlab[
    ['date_code', 'Count']
]
# NOTE(review): `vaccination` is trimmed here but does not appear in any of the
# later cells reviewed -- confirm whether it is still needed.
vaccination = vaccination[
    ['Influenza_Season', 'County', 'HCP_Percent_Vaccinated']
]
# Align the weather column names with the flu tables so they can share a join key.
weather = weather[['Date', 'Value']]
weather = weather.rename(columns={'Date': 'date_code', 'Value': 'Avg_Temp'})



In [4]:
# Preview the first rows of the trimmed weather table to check the renamed columns.
weather.head()

Unnamed: 0,date_code,Avg_Temp
0,200901,46.9
1,200902,45.2
2,200903,49.2
3,200904,54.3
4,200905,66.7


In [5]:
# Merge datasets: left-join the lab and weather tables onto the ILI table,
# using `date_code` as the only key.
# NOTE(review): `date_code` is the sole join key. If it repeats within the flu
# tables (e.g. one row per region), each left join multiplies rows; the ~16.9M
# rows reported by the split cell suggest this is happening -- confirm whether
# a second key (such as a region column) should be part of the join.
# NOTE(review): the weather preview shows codes 200901, 200902, ... while the
# merged preview shows 200140. If the flu codes are year+week and the weather
# codes are year+month, this join pairs unrelated periods -- confirm.
merged_data = (
    flu_ili
    .merge(flu_clinicallab, on='date_code', how='left')
    .merge(flu_publichealthlab, on='date_code', how='left')
    .merge(weather, on='date_code', how='left')
)



In [6]:
# Fill gaps in the temperature column: carry the last observation forward, then
# back-fill whatever is still missing at the very start of the frame.
# The result is assigned back to the column. Calling fillna(..., inplace=True)
# on a column selection is chained assignment, which does not reliably update
# the parent frame, and the `method=` argument of fillna is deprecated.
merged_data['Avg_Temp'] = merged_data['Avg_Temp'].ffill().bfill()

# Weekly median temperature, keyed by the digits of `date_code` after the
# first four characters (treated as the week number).
week_of_year = (
    merged_data['date_code'].astype(str).str[4:].astype(int).rename('week')
)
weekly_median_temp = merged_data['Avg_Temp'].groupby(week_of_year).median()

# Vectorized fallback: replace any remaining null with the median of its week.
# NOTE(review): because this runs after ffill/bfill it cannot change any value
# (a null can only survive those fills when the whole column is null, in which
# case the medians are null too). If seasonal imputation was the intent, it has
# to run before the fills.
merged_data['Avg_Temp'] = merged_data['Avg_Temp'].fillna(
    week_of_year.map(weekly_median_temp)
)

# Remaining missing values per column. Only Avg_Temp has been imputed so far;
# the other columns may still report nulls.
print(merged_data.isnull().sum())


date_code                    0
Total_ILI                    0
Total_Patients_Seen          0
Percent_ILI                 29
Number_Positive           2508
Percent_Positive       2520096
Count                     2508
Avg_Temp                     0
dtype: int64


In [12]:


# Median imputation for influenza surveillance features: each column's nulls
# are replaced by that column's median.
flu_features = ['Total_ILI', 'Total_Patients_Seen', 'Percent_ILI']
merged_data[flu_features] = merged_data[flu_features].fillna(merged_data[flu_features].median())

# Missing indicators for lab-related data, followed by zero-filling.
lab_features = ['Number_Positive', 'Percent_Positive', 'Count']
for feature in lab_features:
    indicator = f'{feature}_missing'
    # Only create the indicator once. After the first run the nulls are already
    # filled, so recomputing it on a re-run would overwrite it with zeros.
    if indicator not in merged_data.columns:
        merged_data[indicator] = merged_data[feature].isnull().astype(int)
    # Assign back instead of fillna(..., inplace=True) on a column selection,
    # which is chained assignment and may not update `merged_data`.
    merged_data[feature] = merged_data[feature].fillna(0)  # Assume missing means no reports

# Final missing-value count per column.
print(merged_data.isnull().sum())




date_code                   0
Total_ILI                   0
Total_Patients_Seen         0
Percent_ILI                 0
Number_Positive             0
Percent_Positive            0
Count                       0
Avg_Temp                    0
Number_Positive_missing     0
Percent_Positive_missing    0
Count_missing               0
dtype: int64


In [8]:
# Inspect the first rows of the merged frame after imputation.
merged_data.head()

Unnamed: 0,date_code,Total_ILI,Total_Patients_Seen,Percent_ILI,Number_Positive,Percent_Positive,Count,Avg_Temp,Number_Positive_missing,Percent_Positive_missing,Count_missing
0,200140,3,135,2.22,0.0,0.0,0.0,46.9,1,1,1
1,200140,25,1211,2.06,0.0,0.0,0.0,46.9,1,1,1
2,200140,0,0,1.65,0.0,0.0,0.0,46.9,1,1,1
3,200140,2,336,0.6,0.0,0.0,0.0,46.9,1,1,1
4,200140,20,529,3.78,0.0,0.0,0.0,46.9,1,1,1


In [9]:
# Standardize every column except the first one (`date_code`).
# NOTE(review): later cells treat the LAST column of this array as the
# regression target. Per the merged preview that column is `Count_missing`, a
# missing-value indicator -- confirm this is the intended target.
# NOTE(review): the scaler is fitted on all rows before the train/val/test
# split, so held-out rows influence the scaling statistics -- confirm this is
# acceptable.
columns_to_scale = merged_data.iloc[:, 1:]
scaler = StandardScaler()
scaled_features = scaler.fit(columns_to_scale).transform(columns_to_scale)



In [10]:
# convert to PyTorch Dataset
class FluDataset(Dataset):
    """Row-wise ``(features, target)`` view over a 2-D numeric table.

    Args:
        data: A 2-D array-like (rows x columns) or tensor. It is copied into a
            ``float32`` tensor stored as ``self.data``.
        target_index: Column holding the target value. Defaults to ``-1`` (the
            last column), which matches the original behaviour. Negative
            indices count from the end.

    Raises:
        ValueError: If ``data`` is not 2-D or has fewer than two columns.
        IndexError: If ``target_index`` is outside the column range.
    """

    def __init__(self, data, target_index=-1):
        if isinstance(data, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct warning;
            # clone explicitly to keep the same "always copy" semantics.
            table = data.detach().clone().to(dtype=torch.float32)
        else:
            table = torch.tensor(data, dtype=torch.float32)

        # Fail at construction time rather than on the first item access.
        if table.dim() != 2:
            raise ValueError(
                f"FluDataset expects 2-D data (rows x columns), got {table.dim()}-D input"
            )
        n_columns = table.shape[1]
        if n_columns < 2:
            raise ValueError(
                "FluDataset needs at least two columns (one feature and one target)"
            )
        if not -n_columns <= target_index < n_columns:
            raise IndexError(
                f"target_index {target_index} is out of range for {n_columns} columns"
            )

        self.data = table
        # Normalised to a non-negative position so the slicing below is simple.
        self.target_index = target_index % n_columns

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        rows = self.data[idx]
        t = self.target_index
        # Features are every column except the target, in their original order.
        features = torch.cat((rows[..., :t], rows[..., t + 1:]), dim=-1)
        return features, rows[..., t]



In [11]:
# Convert the scaled NumPy array to a float32 tensor. Each row is one sample;
# the training and evaluation cells slice features and target out of each batch.
scaled_tensor = torch.tensor(scaled_features, dtype=torch.float32)

# Split into train (70%), validation (15%) and test (remainder) sets.
n_rows = len(scaled_tensor)
train_size = int(0.7 * n_rows)
val_size = int(0.15 * n_rows)
test_size = n_rows - train_size - val_size  # Ensure all data is used

# Seed the split so the same rows land in each subset on every run; without a
# generator the held-out sets change each time the cell is executed.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)
# NOTE(review): rows are assigned at random. If near-identical rows (e.g. ones
# sharing a date_code) fall into different subsets, validation and test scores
# may be optimistic -- confirm whether a time-based split is more appropriate.
train_data, val_data, test_data = random_split(
    scaled_tensor,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Create DataLoaders; only the training loader reshuffles each epoch.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

print(f"Train dataset size: {len(train_loader.dataset)}")
print(f"Validation dataset size: {len(val_loader.dataset)}")
print(f"Test dataset size: {len(test_loader.dataset)}")


Train dataset size: 11840564
Validation dataset size: 2537263
Test dataset size: 2537265


In [13]:
class FluPredictorNN(nn.Module):
    """Feed-forward regressor: input_size -> 128 -> 64 -> 32 -> 1.

    The first two hidden layers are each followed by batch normalization, ReLU
    and dropout (p=0.3); the third hidden layer uses ReLU only. The final layer
    emits a single value per sample.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()

        def regularized_block(n_in, n_out):
            # Linear layer followed by batch norm, ReLU and dropout.
            return [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]

        # Layers are created in the same order they are applied, so parameter
        # initialisation order and state_dict keys (model.0, model.1, ...) are
        # stable.
        stack = regularized_block(input_size, 128)
        stack += regularized_block(128, 64)
        stack += [nn.Linear(64, 32), nn.ReLU()]
        stack.append(nn.Linear(32, 1))  # Output layer (regression task)

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run a batch of feature rows through the network."""
        output = self.model(x)
        return output


In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One column of each row is held back as the target, so the network sees one
# fewer input than the tensor has columns.
model = FluPredictorNN(input_size=scaled_tensor.shape[1] - 1)
model = model.to(device)

# Mean squared error objective for the regression output.
criterion = nn.MSELoss()

# Adam with a small L2 penalty (weight decay).
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-5,
)


In [None]:
from tqdm import tqdm  # progress bar for training

# File that receives the best-so-far weights during training.
checkpoint_path = "best_model.pth"

# early stopping params
patience = 5
best_val_loss = np.inf
patience_counter = 0
# Tracks whether THIS run wrote a checkpoint, so a stale file left behind by
# an earlier run is never loaded by mistake.
checkpoint_saved = False

num_epochs = 50
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_samples = 0

    for batch in tqdm(train_loader):
        # Move the batch to the device once, then slice: the last column is
        # the target, everything before it is the feature vector.
        batch = batch.to(device)
        features, target = batch[:, :-1], batch[:, -1:]

        optimizer.zero_grad()
        predictions = model(features)
        loss = criterion(predictions, target)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # short final batch does not skew the epoch average.
        train_loss += loss.item() * batch.size(0)
        train_samples += batch.size(0)

    # average training loss per sample
    train_loss /= train_samples
    train_losses.append(train_loss)

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    val_samples = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            features, target = batch[:, :-1], batch[:, -1:]
            predictions = model(features)
            loss = criterion(predictions, target)
            val_loss += loss.item() * batch.size(0)
            val_samples += batch.size(0)

    val_loss /= val_samples
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Early stopping check: checkpoint on improvement, otherwise count the
    # epoch towards `patience` and stop once the budget is used up.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), checkpoint_path)  # Save best model
        checkpoint_saved = True
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Restore the best weights seen during this run. map_location keeps the load
# working when the active device differs from the one used for saving.
if checkpoint_saved:
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    print("No checkpoint was written during this run; keeping the current model weights.")


100%|██████████████████████████████████| 185009/185009 [07:12<00:00, 428.18it/s]


Epoch 1/50, Train Loss: 0.0433, Val Loss: 13.1460


100%|██████████████████████████████████| 185009/185009 [04:37<00:00, 666.61it/s]


Epoch 2/50, Train Loss: 0.0203, Val Loss: 19.3748


100%|██████████████████████████████████| 185009/185009 [07:34<00:00, 406.71it/s]


Epoch 3/50, Train Loss: 0.0106, Val Loss: 2.1961


100%|██████████████████████████████████| 185009/185009 [07:58<00:00, 386.52it/s]


Epoch 4/50, Train Loss: 0.0105, Val Loss: 2.3330


100%|██████████████████████████████████| 185009/185009 [04:46<00:00, 644.71it/s]


Epoch 5/50, Train Loss: 0.0244, Val Loss: 3.4853


100%|██████████████████████████████████| 185009/185009 [04:47<00:00, 643.01it/s]


Epoch 6/50, Train Loss: 0.0283, Val Loss: 0.0144


100%|██████████████████████████████████| 185009/185009 [04:46<00:00, 646.29it/s]


Epoch 7/50, Train Loss: 0.0226, Val Loss: 0.1849


100%|██████████████████████████████████| 185009/185009 [04:55<00:00, 625.61it/s]


Epoch 8/50, Train Loss: 0.0293, Val Loss: 0.0246


100%|██████████████████████████████████| 185009/185009 [04:51<00:00, 634.49it/s]


Epoch 9/50, Train Loss: 0.0104, Val Loss: 0.0020


100%|██████████████████████████████████| 185009/185009 [04:50<00:00, 636.76it/s]


Epoch 10/50, Train Loss: 0.0140, Val Loss: 0.0197


100%|██████████████████████████████████| 185009/185009 [04:51<00:00, 633.90it/s]


Epoch 11/50, Train Loss: 0.0097, Val Loss: 0.0797


100%|██████████████████████████████████| 185009/185009 [05:05<00:00, 604.93it/s]


Epoch 12/50, Train Loss: 0.0117, Val Loss: 0.0013


100%|██████████████████████████████████| 185009/185009 [19:27<00:00, 158.52it/s]


Epoch 13/50, Train Loss: 0.0136, Val Loss: 0.1777


100%|██████████████████████████████████| 185009/185009 [04:47<00:00, 642.92it/s]


Epoch 14/50, Train Loss: 0.0143, Val Loss: 0.0752


100%|██████████████████████████████████| 185009/185009 [04:18<00:00, 715.71it/s]


Epoch 15/50, Train Loss: 0.0146, Val Loss: 0.0261


100%|██████████████████████████████████| 185009/185009 [04:58<00:00, 620.03it/s]


Epoch 16/50, Train Loss: 0.0213, Val Loss: 0.0116


 96%|████████████████████████████████▋ | 177944/185009 [04:51<00:11, 594.35it/s]

In [None]:
# Evaluate the model on the held-out test set.
model.eval()
test_loss = 0.0
test_samples = 0
with torch.no_grad():
    for batch in test_loader:
        # Move the batch to the device once; the last column is the target.
        batch = batch.to(device)
        features, target = batch[:, :-1], batch[:, -1:]
        predictions = model(features)
        loss = criterion(predictions, target)
        # Weight the batch mean by the batch size so the result is the
        # per-sample loss even when the final batch is smaller.
        test_loss += loss.item() * batch.size(0)
        test_samples += batch.size(0)

test_loss /= test_samples
print(f"Final Test Loss: {test_loss:.4f}")
