In [1]:
import sqlite3 as sql #to read the database
import pandas as pd #for dataframes
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
con = sql.connect('../data_collection/campus_data.db')
cur = con.cursor()
df = pd.read_sql_query("SELECT * FROM historical_air_quality_data", con)
df

Unnamed: 0,date,pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,sulphur_dioxide,ozone,carbon_dioxide,ammonia,aerosol_optical_depth,...,mugwort_pollen,olive_pollen,ragweed_pollen,us_aqi,us_aqi_pm2_5,us_aqi_pm10,us_aqi_nitrogen_dioxide,us_aqi_ozone,us_aqi_sulphur_dioxide,us_aqi_carbon_monoxide
0,2022-08-03 01:00:00-05:00,,,,,,,,,,...,,,,,,,,,,
1,2022-08-03 02:00:00-05:00,,,,,,,,,,...,,,,,,,,,,
2,2022-08-03 03:00:00-05:00,,,,,,,,,,...,,,,,,,,,,
3,2022-08-03 04:00:00-05:00,,,,,,,,,,...,,,,,,,,,,
4,2022-08-03 05:00:00-05:00,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28603,2025-11-06 19:00:00-06:00,3.5,3.4,175.0,6.9,11.8,67.0,448.0,,0.09,...,,,,34.380798,17.013887,3.996213,3.398345,34.380798,6.434024,1.595411
28604,2025-11-06 20:00:00-06:00,3.5,3.5,167.0,6.7,13.5,66.0,446.0,,0.10,...,,,,34.612705,17.048611,4.007576,3.299842,34.612705,7.360960,1.613527
28605,2025-11-06 21:00:00-06:00,3.5,3.4,162.0,6.4,14.2,66.0,445.0,,0.11,...,,,,34.264843,17.100693,4.018940,3.152088,34.264843,7.742640,1.629227
28606,2025-11-06 22:00:00-06:00,3.5,3.5,161.0,6.1,13.4,66.0,445.0,,0.14,...,,,,33.627087,17.135418,4.026516,3.004334,33.627087,7.306434,1.637681


In [3]:
df_cleaned = df.dropna(axis = 1, how = "all") #removing columns with only null values
df_dataset = df_cleaned.dropna() #removing rows that have any null values

In [4]:
df_dataset.columns

Index(['date', 'pm10', 'pm2_5', 'carbon_monoxide', 'nitrogen_dioxide',
       'sulphur_dioxide', 'ozone', 'carbon_dioxide', 'aerosol_optical_depth',
       'methane', 'dust', 'uv_index', 'uv_index_clear_sky', 'us_aqi',
       'us_aqi_pm2_5', 'us_aqi_pm10', 'us_aqi_nitrogen_dioxide',
       'us_aqi_ozone', 'us_aqi_sulphur_dioxide', 'us_aqi_carbon_monoxide'],
      dtype='object')

In [5]:
# Custom Dataset for time series data
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# LSTM Model
class lstm_model(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, dropout=0.2):
        super(lstm_model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1)
        )
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])
        return out

In [6]:
#Data Preparation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

#picking the x- and y- variables
FEATURES = ['pm10', 'carbon_monoxide', 'nitrogen_dioxide', 'sulphur_dioxide', 'ozone', 'carbon_dioxide']
RESPONSE = 'pm2_5'
sequence_length = 24 #the length of the sequence because it's analyzing data from the last 24 hours
X = df_dataset[FEATURES].values
y = df_dataset[RESPONSE].values


# Scale the data using StandardScaler (better for neural networks)
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Create sequences for LSTM
X_sequences, y_sequences = [], []
for i in range(len(X_scaled) - sequence_length):
    X_sequences.append(X_scaled[i:i+sequence_length])
    y_sequences.append(y_scaled[i+sequence_length])

X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)


# Split into training and testing datasets (shuffle=False for time series)
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, shuffle=False)


# Create datasets and dataloaders
train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [9]:
model = lstm_model(input_size=len(FEATURES))
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
# Train the model
train_losses = []
epochs = 50
test_losses = []

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    train_loss /= len(train_loader)
    train_losses.append(train_loss)
    
    # testing
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            loss = loss_fn(outputs, y_batch)
            test_loss += loss.item()
    
    test_loss /= len(test_loader)
    test_losses.append(test_loss)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Testing Loss: {test_loss:.4f}')

Epoch 10/50, Training Loss: 0.0994, Testing Loss: 0.0388
Epoch 20/50, Training Loss: 0.0908, Testing Loss: 0.0407
Epoch 30/50, Training Loss: 0.0858, Testing Loss: 0.0379
Epoch 40/50, Training Loss: 0.0811, Testing Loss: 0.0473
Epoch 50/50, Training Loss: 0.0767, Testing Loss: 0.0517


In [11]:
#Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score
model.eval()
y_pred = []
y_real = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        y_pred.extend(outputs.numpy())
        y_real.extend(y_batch.numpy())

# Inverse transform to get original scale
predictions = scaler_y.inverse_transform(np.array(y_pred))
actuals = scaler_y.inverse_transform(np.array(y_real))

# Calculate metrics
rmse = np.sqrt(mean_squared_error(actuals, predictions))
r2 = r2_score(actuals, predictions)


print(f"RMSE: {rmse:.4f}")
print(f"R-squared: {r2:.4f}")

RMSE: 1.3644
R-squared: 0.8430


In [12]:
#saving the model
torch.save(model.state_dict(), "model1_pm25.pth")