In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import random
import glob
import pickle
import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from models import CNN_LSTM, SepCNN_LSTM, ConvGRU_LSTM, RandomForestBaseline, LassoModel

In [3]:
seed = 42

# Set seed for NumPy
np.random.seed(seed)

# Set seed for Python's built-in random module
random.seed(seed)

# Set seed for PyTorch
torch.manual_seed(seed)

# Set seed for Torch's CUDA operations if GPU is used
# if torch.cuda.is_available():
#     torch.backends.cudnn.deterministic = True
#     torch.cuda.manual_seed(seed)

<torch._C.Generator at 0x2506c0ac450>

In [8]:
# # Load a sample of the data
sample_data = np.load('./data/PROCESSEDIII/2021_51_101.npy')  

# # Check the shape of the sample data
print("Shape of sample data:", sample_data.shape)

FileNotFoundError: [Errno 2] No such file or directory: './data/PROCESSEDIII/2021_51_101.npy'

In [9]:
# Define generator function
def generator(IDs, yields, batch_size, cutoff=None):
    def load_data(ID):
        try:
            data = np.load('./data/PROCESSED_III/' + ID + '.npy')
            return data
        except Exception as e:
            # print('Error loading data:', e)
            return None

    batches = 0

    while True:
        batch_features = np.zeros((batch_size, 38, 1, 128, 9)) if cutoff is None else np.zeros((batch_size, cutoff, 1, 128, 9))
        batch_yields = np.zeros(batch_size)

        if batches == len(IDs) // batch_size:
            batches = 0
            yield None, None

        for i in range(batch_size):
            index = random.choice(range(len(IDs)))
            ID = IDs[index]
            data = load_data(ID)

            if data is not None:
                if cutoff is not None:
                    if not np.isnan(data).any():
                        batch_features[i, :, :, :, :] = data[:cutoff, :, :, :]
                        batch_yields[i] = yields[ID]
                    else:
                        print('Data contains NaN values:', ID)
                else:
                    batch_features[i, :, :, :, :] = data
                    batch_yields[i] = yields[ID]

        batches += 1

        yield torch.tensor(batch_features, dtype=torch.float32, device='cuda'), torch.tensor(batch_yields, dtype=torch.float32, device='cuda')


In [12]:
# Datasets
yields = pickle.load(open('data/yields.p', 'rb'))

# Generators
training_generator = generator(list(yields['train'].keys()), yields['train'], 16)
validation_generator = generator(list(yields['validation'].keys()), yields['validation'], 16)


In [None]:
## for Random Forest baseline model
random_forest_model = RandomForestBaseline(n_estimators=100, max_depth=None, random_state=42)

# Fit the model to training data
X_train, y_train = next(training_generator)
random_forest_model.fit(X_train.cpu().reshape(X_train.shape[0], -1), y_train.cpu())

# Make predictions on test data
X_test, y_test = next(validation_generator)
predictions = random_forest_model.predict(X_test.cpu().reshape(X_test.shape[0], -1))  # Flattening input features

# Evaluate the model
mse = random_forest_model.evaluate(X_test.cpu().reshape(X_test.cpu().shape[0], -1), y_test.cpu())  # Flattening input features
print("Mean Squared Error:", mse)

Mean Squared Error: 1891.901342797789


In [None]:
## for Lasso baseline model
lasso_model = LassoModel(alpha=0.5, random_state=42)

# Fit the model to training data
X_train, y_train = next(training_generator)
lasso_model.fit(X_train.cpu().reshape(X_train.shape[0], -1), y_train.cpu())

# Make predictions on test data
X_test, y_test = next(validation_generator)
predictions = lasso_model.predict(X_test.cpu().reshape(X_test.shape[0], -1))  # Flattening input features

# Evaluate the model
mse = lasso_model.evaluate(X_test.cpu().reshape(X_test.cpu().shape[0], -1), y_test.cpu())  # Flattening input features
print("Mean Squared Error:", mse)

Mean Squared Error: 3469.941841439345


In [13]:
model_functions = {
    'CNN_LSTM': CNN_LSTM,
    # 'SepCNN_LSTM': SepCNN_LSTM,
    # 'ConvGRU_LSTM': ConvGRU_LSTM,
}


epochs = 10

for model_name, model_function in model_functions.items():
    model = model_function(dimensions=[38, 1, 128, 9])
    model.to('cuda')
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for batch_data, batch_labels in tqdm.tqdm(training_generator, desc=f"Epoch {epoch+1}/{epochs}"):
            if batch_data is None:
                break

            optimizer.zero_grad()
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        best_loss = float('inf')
        with torch.no_grad():
            for val_data, val_labels in validation_generator:
                if val_data is None:
                    break
                val_outputs = model(val_data)
                val_loss = criterion(val_outputs, val_labels.unsqueeze(1))
                val_losses.append(val_loss.item())
        current_loss = np.mean(val_losses)
        if current_loss < best_loss:
            print('save best model')
            torch.save(model, f'{model_name}_best.pt')
            best_loss = current_loss
            
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {np.mean(train_losses):.4f}, Val Loss: {current_loss:.4f}")

    # Save the model
    torch.save(model, f'{model_name}.pt')

Epoch 1/10: 409it [00:25, 16.01it/s]


save best model
Epoch 1/10, Train Loss: 4309.9201, Val Loss: 3655.0766


Epoch 2/10: 409it [00:25, 16.18it/s]


save best model
Epoch 2/10, Train Loss: 1380.0852, Val Loss: 651.1990


Epoch 3/10: 409it [00:25, 16.21it/s]


save best model
Epoch 3/10, Train Loss: 772.5979, Val Loss: 439.3886


Epoch 4/10: 61it [00:03, 15.91it/s]


KeyboardInterrupt: 