In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the raw training table and order its rows by ascending `timestamp`.
df = pd.read_csv('/train_data/train_data.csv').sort_values(by=['timestamp'])


In [None]:
# Imputation: in every column from the sixth onward, entries <= 0 are treated
# as invalid readings and replaced by the mean of that column's valid entries.
# NOTE(review): the `<= 0` test also rewrites exact zeros and genuinely
# negative readings -- confirm that is intended for every affected column.
for column in df.columns[5:]:
    invalid = df[column] <= 0
    if not invalid.any():
        continue  # nothing to repair in this column

    # Average only the valid (positive, non-missing) readings; including the
    # invalid placeholders would bias the fill value towards them.
    valid_values = df.loc[~invalid, column].dropna()
    if valid_values.empty:
        # No trustworthy value to derive a mean from; leave the column as is
        # rather than filling it with NaN.
        continue

    column_mean = valid_values.mean()
    df[column] = np.where(invalid, column_mean, df[column])


In [None]:
# Min-max normalise each target column independently (default range [0, 1]).
# One scaler is fitted per column and kept in `scalers`, so the fitted range of
# every column stays available afterwards, e.g. to scale new data the same way
# or to inverse-transform predictions.
# NOTE(review): the scalers are fitted on the full table, i.e. before any
# train/test split -- confirm this is acceptable for the evaluation.
target_columns = ['temp', 'WDSD', 'pm2.5']
scalers = {}
for column in target_columns:
    column_scaler = MinMaxScaler()
    scaled = column_scaler.fit_transform(df[column].values.reshape(-1, 1))
    df[column] = scaled.ravel()  # back to 1-D before storing it as a column
    scalers[column] = column_scaler

# Backward compatibility: `scaler` used to end up fitted on the last target column.
scaler = scalers[target_columns[-1]]

In [None]:
class PM25Dataset(Dataset):
    """Sliding-window dataset over a single 2-D array of rows.

    Item ``i`` is the pair ``(sequence, label)`` where

    * ``sequence`` is rows ``i`` to ``i + sequence_length - 1`` with the last
      column removed, and
    * ``label`` is the last column of the final row of that window.

    NOTE(review): the label is taken from the window's own last row, while
    ``__len__`` leaves the final possible window unused. If the label is meant
    to be the row *after* the window, ``__getitem__`` is off by one; otherwise
    ``__len__`` is. Confirm the intended target before changing either.

    Args:
        data: 2-D indexable array (``data[a:b, :-1]`` and ``data[r, -1]`` must
            work, e.g. a NumPy array); rows are assumed to be in time order.
        sequence_length: number of consecutive rows per window.
    """

    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        # Clamp at zero: a negative result would make ``len()`` raise
        # ValueError (and break ConcatDataset) when the input is shorter than
        # one window.
        return max(0, len(self.data) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(sequence, label)`` for window ``index``.

        Negative indices count from the end. Raises ``IndexError`` when the
        index is out of range, which also lets plain iteration terminate in
        agreement with ``len()``.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f'index out of range for dataset of length {size}')

        stop = index + self.sequence_length
        sequence = self.data[index:stop, :-1]
        label = self.data[stop - 1, -1]
        return sequence, label

In [None]:
# Build one sliding-window dataset per device, pool them, and split the pooled
# windows 80/20 into training and test loaders.
sequence_length = 4
SPLIT_SEED = 42  # fixed seed so the train/test split is reproducible across runs

datasets = []
for device_id, group in df.groupby('deviceId'):
    # A device needs more rows than the window length to yield any sample;
    # shorter groups would give the dataset a negative length and crash
    # ConcatDataset, so skip them.
    if len(group) <= sequence_length:
        continue
    group = group.drop(columns=['deviceId', 'timestamp', 'Unnamed: 0'])  # Drop non-feature columns
    group_values = group.values  # 2-D array: one row per reading of this device
    datasets.append(PM25Dataset(group_values, sequence_length))

if not datasets:
    raise ValueError('No device has more rows than sequence_length; cannot build any window.')

dataset = ConcatDataset(datasets)
train_size = int(0.8 * len(dataset))  # Let's use 80% of the data for training
test_size = len(dataset) - train_size

# NOTE(review): consecutive windows share rows, so a random split places
# overlapping windows in both subsets; a per-device chronological split would
# avoid that leakage -- consider it if the test score is meant to reflect
# unseen time ranges.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

In [None]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
class PM25Predictor(nn.Module):
    """Stacked LSTM followed by a linear layer on the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer's hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of values produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, output_size)."""
        # nn.LSTM starts from zero hidden/cell states when none are passed, and
        # creates them with the input's device and dtype. Building them by hand
        # cost an extra allocation and copy per call, and failed for
        # non-float32 models.
        out, _ = self.lstm(x)
        # Only the last time step's hidden output feeds the regression head.
        return self.fc(out[:, -1, :])

# Hyperparameters -- optimisation schedule
num_epochs = 250
learning_rate = 0.0001

# Hyperparameters -- network shape
input_size = 5   # features per time step (original note: PM2.5, temperature, wind speed, longitude, latitude)
hidden_size = 30
num_layers = 8
output_size = 1  # one predicted value per sequence (original note: next PM2.5)

# Build the network on the selected device, then the MSE loss and the Adam optimizer.
model = PM25Predictor(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop: one optimiser step per mini-batch; every 10th epoch the
# sample-weighted mean loss of that epoch is printed.
for epoch in range(num_epochs):
    # Put the model back into training mode in case an evaluation cell
    # (model.eval()) was run before this cell is re-executed.
    model.train()
    epoch_loss_sum = 0.0
    epoch_samples = 0

    for sequences, labels in train_loader:
        # Move to the compute device and cast to float32 in a single call.
        sequences = sequences.to(device=device, dtype=torch.float32)  # (batch, sequence_length, features)
        labels = labels.to(device=device, dtype=torch.float32).view(-1, 1)  # column vector to match the model output

        outputs = model(sequences)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a sample-weighted sum so the reported value covers the
        # whole epoch instead of just the final (possibly smaller) batch.
        n_batch = labels.size(0)
        epoch_loss_sum += loss.item() * n_batch
        epoch_samples += n_batch

    if (epoch + 1) % 10 == 0 and epoch_samples > 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss_sum / epoch_samples}')

In [None]:
# Serialise the whole model object (not just its state_dict) to ./model.pt.
# NOTE(review): loading a fully pickled module requires the PM25Predictor class
# to be importable at load time -- confirm the consumer of this file has it.
torch.save(obj=model, f='./model.pt')

In [None]:
# Evaluate on the test loader: sample-weighted mean MSE, plus the share of
# predictions that fall within 20 % of their label.
model.eval()  # Set the model to evaluation mode
total_loss = []    # per-batch MSE values
batch_sizes = []   # matching sample counts, used as averaging weights
total_predictions, correct_predictions = 0, 0
with torch.no_grad():  # Disable gradient tracking
    for sequences, labels in test_loader:
        sequences = sequences.to(device=device, dtype=torch.float32)
        labels = labels.to(device=device, dtype=torch.float32)

        outputs = model(sequences)
        labels = labels.view(outputs.shape)

        total_loss.append(criterion(outputs, labels).item())
        batch_sizes.append(labels.size(0))

        # Same criterion as |output / label - 1| <= 0.2, written without the
        # division so that a zero label cannot produce inf/NaN.
        within_tolerance = (outputs - labels).abs() <= 0.2 * labels.abs()
        total_predictions += labels.size(0)
        correct_predictions += within_tolerance.sum().item()

if total_predictions > 0:
    # Weight each batch by its size; the last batch is usually smaller.
    avg_mse = np.average(total_loss, weights=batch_sizes)
    print(f'avg mse : {avg_mse}')
    print('Test Accuracy of the model on the test set: {} %'.format((correct_predictions / total_predictions) * 100))
else:
    print('Test loader produced no samples; nothing to evaluate.')