In [47]:
from torch.utils.data import Dataset
import torch
import pandas as pd
import json
import os
from pathlib import Path
import torch
import torch.nn as nn

class CropYieldDataset(Dataset):
    def __init__(self, data_lake_dir="../data/data_lake_organized", crop_name="corn", transform=None):
        self.samples = []
        self.transform = transform
        self.crop_name = crop_name.lower()
        
        fips_folders = [f for f in Path(data_lake_dir).iterdir() if f.is_dir()]
        
        for fips_folder in fips_folders:
            crop_json_path = fips_folder / f"{self.crop_name}.json"
            if not crop_json_path.exists():
                continue
            
            with open(crop_json_path, 'r') as f:
                yield_data = json.load(f)
            
            year_folders = [y for y in fips_folder.iterdir() if y.is_dir()]
            
            for year_folder in year_folders:
                year = year_folder.name
                weather_csv = year_folder / f"WeatherTimeSeries{year}.csv"
                
                if not weather_csv.exists():
                    continue
                if year not in yield_data:
                    continue
                
                df = pd.read_csv(weather_csv)
                
                # Only keep April–October
                df = df[(df['Month'] >= 4) & (df['Month'] <= 10)]

                # Drop non-weather columns
                df = df.drop(columns=['Year', 'Month', 'Day'], errors='ignore')

                # Make sure it's float tensor
                weather_tensor = torch.tensor(df.values, dtype=torch.float32)

                # Target
                yield_target = torch.tensor(yield_data[year]['yield'], dtype=torch.float32)

                self.samples.append((weather_tensor, yield_target))
    
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x, y = self.samples[idx]
        if self.transform:
            x = self.transform(x)
        return x, y

LSTM + TCN MODEL

In [48]:
class LSTMTCNRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, lstm_layers=1, tcn_channels=[64, 32]):
        super(LSTMTCNRegressor, self).__init__()
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=lstm_layers, batch_first=True)
        
        # TCN part: using 1D Convolutions
        self.tcn = nn.Sequential(
            nn.Conv1d(hidden_dim, tcn_channels[0], kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(tcn_channels[0], tcn_channels[1], kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)  # output shape: (batch, channels, 1)
        )
        
        self.fc = nn.Linear(tcn_channels[-1], 1)
    
    def forward(self, x):
        # x shape: (batch, time_steps, features)
        out, _ = self.lstm(x)  # (batch, time_steps, hidden_dim)
        out = out.permute(0, 2, 1)  # (batch, hidden_dim, time_steps) for Conv1D
        out = self.tcn(out)  # (batch, channels, 1)
        out = out.squeeze(-1)  # (batch, channels)
        out = self.fc(out)  # (batch, 1)
        return out.squeeze(-1)

Training Loop

In [49]:
from torch.utils.data import DataLoader
import torch.optim as optim
import tqdm

def train_model(model, dataloader, num_epochs=30, lr=1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    model.train()
    for epoch in tqdm.tqdm(range(num_epochs), desc="Training Progress"):
        epoch_loss = 0
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        """ tqdm.tqdm.write(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss/len(dataloader):.4f}")
        tqdm.tqdm.set_description(f"Training Progress (Epoch {epoch+1} Loss: {epoch_loss/len(dataloader):.4f})") """
    
    return model

In [50]:
dataset = CropYieldDataset(crop_name="corn")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: (nn.utils.rnn.pad_sequence([i[0] for i in x], batch_first=True), torch.stack([i[1] for i in x])))

# Example input_dim = number of weather features
input_dim = next(iter(dataloader))[0].shape[-1]

model = LSTMTCNRegressor(input_dim=input_dim)
trained_model = train_model(model, dataloader)

Training Progress: 100%|██████████| 30/30 [00:36<00:00,  1.22s/it]


Split the Dataset (Train/Validation)

In [None]:
from torch.utils.data import random_split

def get_dataloaders(dataset, train_ratio=0.8, batch_size=32):
    """
    Splits the dataset into training and validation sets.
    """
    total_len = len(dataset)
    train_len = int(total_len * train_ratio)
    val_len = total_len - train_len
    
    train_dataset, val_dataset = random_split(dataset, [train_len, val_len])

    def collate_fn(batch):
        xs = [b[0] for b in batch]
        ys = [b[1] for b in batch]
        xs = nn.utils.rnn.pad_sequence(xs, batch_first=True)                                                                
        ys = torch.stack(ys)
        return xs, ys

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, val_loader

Update training to track Validation Loss

In [52]:
def train_model(model, train_loader, val_loader, num_epochs=30, lr=1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    model.train()
    for epoch in tqdm.tqdm(range(num_epochs), desc="Training Progress"):
        epoch_loss = 0
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        # Validation loss
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                y_pred = model(x_val)
                val_loss += criterion(y_pred, y_val).item()
        
        """ print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_loss/len(train_loader):.4f} - Val Loss: {val_loss/len(val_loader):.4f}") """
    
    return model

Evaluate Final RMSE and MAE

In [53]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_model(model, val_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    y_true = []
    y_pred = []
    
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            outputs = model(x_val)
            y_true.extend(y_val.numpy())
            y_pred.extend(outputs.cpu().numpy())
    
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"Final Validation RMSE: {rmse:.4f}")
    print(f" Final Validation MAE: {mae:.4f}")

    return rmse, mae

In [54]:
dataset = CropYieldDataset(crop_name="corn")
train_loader, val_loader = get_dataloaders(dataset)

# Detect input_dim
input_dim = next(iter(train_loader))[0].shape[-1]

model = LSTMTCNRegressor(input_dim=input_dim)
trained_model = train_model(model, train_loader, val_loader, num_epochs=30)

# Evaluate
evaluate_model(trained_model, val_loader)

Training Progress: 100%|██████████| 30/30 [00:27<00:00,  1.11it/s]

Final Validation RMSE: 19.0617
 Final Validation MAE: 15.5421





(19.061686, 15.542116)

| Strategy | Why it would help |
|:--|:--|
| Slightly increase LSTM hidden size (e.g., 128) | Capture more complex patterns |
| Add dropout (e.g., 0.2) between LSTM layers | Reduce overfitting further |
| Use a learning rate scheduler (cosine annealing) | Achieve more stable final convergence |
| Train for more epochs with early stopping | Allow better convergence and avoid overfitting |
| Normalize yield targets across counties | Handle variance across different yield scales |

Key hyperparameters to tune

| Hyperparameter | Values to try |
|:--|:--|
| LSTM hidden_dim | [64, 128, 256] |
| LSTM num_layers | [1, 2] |
| TCN channels | [[64, 32], [128, 64], [128, 128]] |
| Dropout | [0.0, 0.1, 0.2, 0.3] |
| Learning Rate | [1e-3, 5e-4, 1e-4] |

In [57]:
import random
import numpy as np

def random_search_hyperparameters(
    dataset,
    num_trials=10,
    batch_size=32,
    num_epochs=30,
    device=None
):
    best_rmse = np.inf
    best_params = None
    best_model = None

    train_loader, val_loader = get_dataloaders(dataset, batch_size=batch_size)

    input_dim = next(iter(train_loader))[0].shape[-1]

    for trial in range(num_trials):
        # Randomly sample hyperparameters
        hidden_dim = random.choice([64, 128, 256])
        lstm_layers = random.choice([1, 2])
        tcn_channels = random.choice([[64, 32], [128, 64], [128, 128]])
        dropout_rate = random.choice([0.0, 0.1, 0.2, 0.3])
        lr = random.choice([1e-3, 5e-4, 1e-4])

        print(f"\nTrial {trial+1}/{num_trials}")
        print(f"LSTM hidden_dim: {hidden_dim}, LSTM layers: {lstm_layers}, TCN channels: {tcn_channels}, Dropout: {dropout_rate}, LR: {lr}")

        model = LSTMTCNRegressor(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            lstm_layers=lstm_layers,
            tcn_channels=tcn_channels
        )

        trained_model = train_model(model, train_loader, val_loader, num_epochs=num_epochs, lr=lr)
        
        # Evaluate
        rmse, mae = evaluate_model(trained_model, val_loader)

        # Save if best
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = {
                'hidden_dim': hidden_dim,
                'lstm_layers': lstm_layers,
                'tcn_channels': tcn_channels,
                'dropout': dropout_rate,
                'lr': lr
            }
            best_model = trained_model

    print("\n🏆 Best RMSE:", best_rmse)
    print("🏆 Best Hyperparameters:", best_params)

    return best_model, best_params

In [58]:
dataset = CropYieldDataset(crop_name="corn")
best_model, best_params = random_search_hyperparameters(dataset, num_trials=10, batch_size=32, num_epochs=30)


Trial 1/10
LSTM hidden_dim: 256, LSTM layers: 2, TCN channels: [128, 64], Dropout: 0.2, LR: 0.0005


Training Progress: 100%|██████████| 30/30 [08:47<00:00, 17.58s/it]


Final Validation RMSE: 21.1194
 Final Validation MAE: 17.4745

Trial 2/10
LSTM hidden_dim: 128, LSTM layers: 2, TCN channels: [64, 32], Dropout: 0.0, LR: 0.0005


Training Progress: 100%|██████████| 30/30 [02:30<00:00,  5.00s/it]


Final Validation RMSE: 21.1733
 Final Validation MAE: 17.0673

Trial 3/10
LSTM hidden_dim: 128, LSTM layers: 1, TCN channels: [128, 128], Dropout: 0.1, LR: 0.0001


Training Progress: 100%|██████████| 30/30 [02:16<00:00,  4.53s/it]


Final Validation RMSE: 21.1231
 Final Validation MAE: 17.0922

Trial 4/10
LSTM hidden_dim: 256, LSTM layers: 1, TCN channels: [128, 64], Dropout: 0.3, LR: 0.0005


Training Progress:   0%|          | 0/30 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
torch.save(best_model.state_dict(), "best_model.pt")