In [28]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [29]:
df = pd.read_csv('/kaggle/input/training-data/data.csv')
df = df.sort_values(by=['deviceId', 'timestamp'])


In [30]:
# imputation
for column in df.columns[5:]:
  if np.any(df[column] <= 0):  # Check if column contains negative values
      column_mean = df[column].mean()  # Calculate column mean
      df[column] = np.where(df[column] <= 0, column_mean, df[column])  # Replace negative values with mean


In [31]:
# normalize
scaler = MinMaxScaler()
target_columns = ['temp','WDSD','pm2.5']  # Select only numeric columns
for column in target_columns:
    df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1))

In [32]:
class PM25Dataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

In [33]:
# creating training and testing data
datasets = []
for device_id, group in df.groupby('deviceId'):
    group = group.drop(columns=['deviceId', 'timestamp', 'Unnamed: 0'])  # Drop non-feature columns
    group_values = group.values # 24 * 5
    dataset = PM25Dataset(group_values)
    datasets.append(dataset)

dataset = ConcatDataset(datasets)
train_size = int(0.8 * len(dataset))  # Let's use 80% of the data for training

test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

In [34]:
# init device to store data
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [35]:
# Define the Linear Regression model
class LinearRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        self.fc = nn.Linear(input_size, output_size)
    
    def forward(self, x):
        out = self.fc(x)
        return out

# Hyperparameters
input_size = 5 # PM2.5, temperature, wind speed, longitude, latitude
hidden_size = 30
num_layers = 8
output_size = 1 # Next PM2.5
num_epochs = 250
learning_rate = 0.0001


# Initialize the model, loss function, and optimizer
model = LinearRegression(input_size, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [36]:
# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    for inputs in train_loader:
        # Forward pass
        inputs = inputs.float()
        inputs=inputs.to(device)
        outputs = model(inputs[:, :-1])  # Exclude the last column (PM2.5) from inputs
        
        # Compute loss
        targets = inputs[:, -1:]  # Last column contains the PM2.5 values
        targets = targets.to(device)
        loss = criterion(outputs, targets)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        # Print training loss for each epoch
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

Epoch 10/100, Loss: 106.46006774902344
Epoch 20/100, Loss: 118.3214340209961
Epoch 30/100, Loss: 107.87921905517578
Epoch 40/100, Loss: 118.25617218017578
Epoch 50/100, Loss: 109.83740997314453
Epoch 60/100, Loss: 101.0091323852539
Epoch 70/100, Loss: 99.94104766845703
Epoch 80/100, Loss: 109.53874206542969
Epoch 90/100, Loss: 105.4314193725586
Epoch 100/100, Loss: 114.27147674560547


In [37]:
torch.save(model, './model.pt')

In [38]:
# Evaluate the model on the test set
model.eval()
total_loss = 0
with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs.float()
        inputs=inputs.to(device)
        outputs = model(inputs[:, :-1])
        targets = inputs[:, -1:]
        targets = targets.to(device)
        loss = criterion(outputs, targets)
        total_loss += loss.item()

avg_loss = total_loss / len(test_loader)
print(f'Test Loss: {avg_loss}')

Test Loss: 110.86722073025173
