# Preprocess Climate Data

In [2]:
import pandas as pd

climate_df = pd.read_csv('data/climate_data.csv')
climate_df['date'] = pd.to_datetime(climate_df['date'])
climate_df.index = climate_df['date']

hypothetical_redundant_cols = ["sunrise", "sunset"]
climate_df = climate_df.drop(columns=hypothetical_redundant_cols)

In [3]:
from sklearn.preprocessing import MinMaxScaler

numeric_cols = climate_df.select_dtypes(include=['float64', 'int64']).columns.to_list()
numeric_cols.remove('lat')
numeric_cols.remove('lon')
non_numeric_cols = climate_df.select_dtypes(exclude=['float64', 'int64']).columns.to_list() + ['lat', 'lon']

climate_df[numeric_cols] = climate_df[numeric_cols].astype(float)
climate_df[non_numeric_cols] = climate_df[non_numeric_cols]
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(climate_df[numeric_cols]), columns=numeric_cols, index=climate_df.index)

climate_data = pd.concat([scaled_df, climate_df[non_numeric_cols]], axis=1)

## Time-Series Sequences

In [4]:
import numpy as np

def lagged_input_output(data, target_col, seq_length=28):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data.iloc[i:i + seq_length].values)
        y.append(data.iloc[i + seq_length][target_col])
    return np.array(X), np.array(y)

climate_data = climate_data.drop(columns=['date', 'lat', 'lon'])
X, y = lagged_input_output(climate_data, "temperature_2m_mean", 14)

# Model

## Load Data

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class ClimateDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
climate_dataset = ClimateDataset(X, y)
train_len = int(len(climate_dataset) * 0.8)
val_len = len(climate_dataset) - train_len
train_dataset, val_dataset = torch.utils.data.random_split(climate_dataset, [train_len, val_len])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

## LSTM

In [9]:
import torch.nn as nn
import torch.optim as optim
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out
input_size = X.shape[2]
hidden_size = 64
output_size = 1
model = LSTMModel(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.view(-1, 1))
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y.view(-1, 1))
            val_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

Epoch [1/1], Loss: 0.0045, Val Loss: 0.0106
tensor([[0.8095, 0.7941],
        [0.7591, 0.8204],
        [0.6670, 0.5371],
        ...,
        [0.5407, 0.4514],
        [0.7372, 0.7825],
        [0.6970, 0.6771]])
