In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Torch
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import random_split
from torch.utils.data import DataLoader, Dataset

In [2]:
# Run configuration: every value a reader may want to tune lives in this cell.
BATCH_SIZE = 8            # samples per mini-batch handed to the DataLoaders
EPOCHS = 100              # number of train + evaluate passes
TRAIN_TEST_SPLIT = 0.8    # fraction of the samples used for training
SEED = 42                 # fixed seed so a Restart & Run All gives the same run

# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the global NumPy and PyTorch generators up front; PyTorch's default
# generator drives weight initialisation and DataLoader shuffling.
np.random.seed(SEED)
torch.manual_seed(SEED)

## Data processing

In [3]:
# Load ./data/agg_data.csv into a DataFrame; the bare `df.columns` on the last
# line makes the notebook display the column names that were read.
df = pd.read_csv(filepath_or_buffer='./data/agg_data.csv')
df.columns

Index(['StnPres', 'SeaPres', 'StnPresMax', 'StnPresMaxTime', 'StnPresMin',
       'StnPresMinTime', 'Temperature', 'T Max', 'T Max Time', 'T Min',
       'T Min Time', 'Td dew point', 'RH', 'RHMin', 'RHMinTime', 'WS', 'WD',
       'WSGust', 'WDGust', 'WGustTime', 'Precp', 'PrecpHour', 'PrecpMax10',
       'PrecpMax10Time', 'PrecpMax60', 'PrecpMax60Time', 'SunShine',
       'SunshineRate', 'GloblRad', 'VisbMean', 'EvapA', 'UVI Max',
       'UVI Max Time', 'Cloud Amount', 'TxSoil0cm', 'TxSoil5cm', 'TxSoil10cm',
       'TxSoil20cm', 'TxSoil30cm', 'TxSoil50cm', 'TxSoil100cm'],
      dtype='object')

In [4]:
# Columns removed before modelling: all of the "...Time" fields of the raw frame.
DROP_COL = [
    "StnPresMaxTime",
    "StnPresMinTime",
    "T Max Time",
    "UVI Max Time",
    "T Min Time",
    "RHMinTime",
    "WGustTime",
    "PrecpMax10Time",
    "PrecpMax60Time",
]
df = df.drop(columns=DROP_COL)

In [5]:
# Some columns carry a placeholder string among their numeric readings.
# Map each such column to its placeholder, swap the placeholder for 0 and
# cast the column to float.
_PLACEHOLDER_BY_COLUMN = {
    'EvapA': '/',
    'Precp': 'T',
    'PrecpMax10': 'T',
    'PrecpMax60': 'T',
}
for _column, _placeholder in _PLACEHOLDER_BY_COLUMN.items():
    df[_column] = df[_column].replace(_placeholder, 0).astype(float)

In [6]:
# Display the first five rows of the cleaned frame as a quick sanity check.
df.head(n=5)

Unnamed: 0,StnPres,SeaPres,StnPresMax,StnPresMin,Temperature,T Max,T Min,Td dew point,RH,RHMin,...,EvapA,UVI Max,Cloud Amount,TxSoil0cm,TxSoil5cm,TxSoil10cm,TxSoil20cm,TxSoil30cm,TxSoil50cm,TxSoil100cm
0,1011.3,1013.8,1012.5,1010.5,27.4,32.2,24.7,25.3,89,62,...,5.3,9,3.0,27.9,27.9,27.7,27.7,28.5,27.8,27.7
1,1011.1,1013.6,1012.4,1010.0,27.7,31.5,24.7,25.5,88,63,...,4.2,9,0.8,28.3,28.3,27.7,27.7,28.6,27.9,27.8
2,1010.8,1013.3,1012.1,1009.6,27.6,31.2,25.0,25.5,89,67,...,4.9,9,2.6,28.3,28.3,27.9,27.9,28.7,28.0,27.8
3,1011.0,1013.5,1012.8,1009.5,28.1,32.1,25.1,24.9,83,62,...,-5.7,9,1.8,28.4,28.4,27.9,27.9,28.8,28.1,27.9
4,1012.6,1015.1,1014.6,1011.0,27.0,31.2,24.5,24.3,86,64,...,-5.3,10,5.8,28.0,28.0,27.8,27.9,28.8,28.1,28.0


In [7]:
# Name of the DataFrame column used as the prediction target.
TARGET: str = "Temperature"

# Make dataset

In [8]:
class WeatherDataset(Dataset):
    """Sliding-window dataset for forecasting one column of a DataFrame.

    Sample ``i`` pairs the ``window_size`` consecutive rows starting at row
    ``i`` (all columns, the target column included) with the value of the
    ``target`` column ``horizon`` rows after the last row of that window.

    With the default ``horizon=1`` the label is the row immediately following
    the window. Earlier revisions of this class took the label one row later
    and never used the row in between; pass ``horizon=2`` to reproduce that.

    Args:
        df: Source frame. Rows are assumed to be in time order (not checked).
            Every column must be convertible to float.
        target: Name of the column to predict.
        window_size: Number of consecutive rows in each input window.
        horizon: How many rows after the end of the window the label is taken
            from (must be >= 1).

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
        ValueError: If ``window_size`` or ``horizon`` is smaller than 1, or a
            column cannot be converted to float.

    Note:
        The frame is copied into a float32 array at construction time, so
        later changes to ``df`` are not reflected in the samples.
    """

    def __init__(self, df, target, window_size=30, horizon=1):
        if target not in df.columns:
            raise KeyError(f"target column {target!r} not found in DataFrame")
        if window_size < 1:
            raise ValueError("window_size must be >= 1")
        if horizon < 1:
            raise ValueError("horizon must be >= 1")

        self.df = df
        self.target = target
        self.window_size = window_size
        self.horizon = horizon
        self.features = df.columns

        # Convert once instead of slicing the DataFrame on every __getitem__;
        # this also surfaces non-numeric columns here rather than mid-training.
        self._values = df.to_numpy(dtype=np.float32)
        self._target_col = df.columns.get_loc(target)

    def __len__(self):
        """Number of (window, label) pairs; 0 when the frame is too short."""
        usable = len(self._values) - self.window_size - self.horizon + 1
        return max(0, usable)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for sample ``idx``.

        ``features`` is a float32 array of shape ``(window_size, n_columns)``
        and ``target`` a float32 scalar. Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError("WeatherDataset index out of range")

        window_end = idx + self.window_size  # exclusive end of the input window
        features = self._values[idx:window_end]
        target = self._values[window_end + self.horizon - 1, self._target_col]
        return features, target
    
# Build the windowed dataset and work out how many samples go to each split.
dataset = WeatherDataset(df, TARGET)
train_size = int(TRAIN_TEST_SPLIT * len(dataset))
test_size = len(dataset) - train_size

# Split by position rather than at random: neighbouring windows share almost
# all of their rows, so a random split would put near-duplicates of every test
# sample into the training set and make the test loss meaningless.
# Assumes the rows of `df` are in chronological order - TODO confirm.
train_dataset = torch.utils.data.Subset(dataset, range(train_size))
test_dataset = torch.utils.data.Subset(dataset, range(train_size, len(dataset)))

# Shuffle only the training batches; evaluation order does not affect the loss.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Model
class WeatherModel(nn.Module):
    """Single-layer LSTM whose last time step feeds a linear output layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a batch of sequences to a ``(batch, output_size)`` tensor."""
        sequence_out, _state = self.lstm(x)
        # Keep only the LSTM output at the final time step of each sequence.
        last_step = sequence_out[:, -1]
        return self.fc(last_step)
    
# Instantiate the network with one input feature per DataFrame column, a
# hidden size of 64 and a single output value, then move it to DEVICE.
model = WeatherModel(
    input_size=len(df.columns),
    hidden_size=64,
    output_size=1,
).to(DEVICE)

# Loss and Optimizer

# Mean-squared-error objective, optimised with Adam at a learning rate of 1e-3.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training

def train(model, dataloader, criterion, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Network to optimise; it is switched to training mode.
        dataloader: Iterable of ``(features, target)`` batches, where
            ``target`` has shape ``(batch,)``.
        criterion: Loss callable taking ``(output, target)``. It is assumed to
            return the mean over the batch (the default reduction of
            PyTorch losses such as ``nn.MSELoss``).
        optimizer: Optimiser bound to ``model``'s parameters.

    Returns:
        The loss averaged over all samples seen, so a shorter final batch is
        weighted by its actual size. ``float("nan")`` if the loader yields no
        samples.
    """
    model.train()

    # Send batches to wherever the model lives, so inputs and weights can
    # never end up on different devices.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum = 0.0
    sample_count = 0
    for features, target in dataloader:
        features = features.float().to(device)
        target = target.float().to(device)

        optimizer.zero_grad()
        output = model(features)
        # Model output is (batch, 1); give the target the same shape.
        loss = criterion(output, target.unsqueeze(1))
        loss.backward()
        optimizer.step()

        batch_size = target.size(0)
        loss_sum += loss.item() * batch_size
        sample_count += batch_size

    if sample_count == 0:
        return float("nan")
    return loss_sum / sample_count

def test(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(dataloader):
            features, target = data
            features, target = features.float().to(DEVICE), target.float().to(DEVICE)
            output = model(features)
            loss = criterion(output, target.unsqueeze(1))
            running_loss += loss.item()
    return running_loss / len(dataloader)

# Loss history: one entry is appended to each list per epoch.
train_losses, test_losses = [], []

for epoch in range(EPOCHS):
    # One optimisation pass over the training loader, then one evaluation
    # pass over the test loader.
    train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss = test(model, test_dataloader, criterion)

    train_losses += [train_loss]
    test_losses += [test_loss]
    print(f"Epoch: {epoch+1}/{EPOCHS}, Train Loss: {train_loss}, Test Loss: {test_loss}")