In [8]:
from torch.utils.data import Dataset
import torch
import pandas as pd
import json
import os
from pathlib import Path
import torch
import torch.nn as nn

class CropYieldDataset(Dataset):
    def __init__(self, data_lake_dir="../data/data_lake_organized", crop_name="corn", transform=None):
        self.samples = []
        self.transform = transform
        self.crop_name = crop_name.lower()
        
        fips_folders = [f for f in Path(data_lake_dir).iterdir() if f.is_dir()]
        
        for fips_folder in fips_folders:
            crop_json_path = fips_folder / f"{self.crop_name}.json"
            if not crop_json_path.exists():
                continue
            
            with open(crop_json_path, 'r') as f:
                yield_data = json.load(f)
            
            year_folders = [y for y in fips_folder.iterdir() if y.is_dir()]
            
            for year_folder in year_folders:
                year = year_folder.name
                weather_csv = year_folder / f"WeatherTimeSeries{year}.csv"
                
                if not weather_csv.exists():
                    continue
                if year not in yield_data:
                    continue
                
                df = pd.read_csv(weather_csv)
                
                # Only keep April–October
                df = df[(df['Month'] >= 4) & (df['Month'] <= 10)]

                # Drop non-weather columns
                df = df.drop(columns=['Year', 'Month', 'Day'], errors='ignore')

                # Make sure it's float tensor
                weather_tensor = torch.tensor(df.values, dtype=torch.float32)

                # Target
                yield_target = torch.tensor(yield_data[year]['yield'], dtype=torch.float32)

                self.samples.append((weather_tensor, yield_target))
    
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x, y = self.samples[idx]
        if self.transform:
            x = self.transform(x)
        return x, y

LSTM + TCN MODEL

In [9]:
class LSTMTCNRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, lstm_layers=1, tcn_channels=[64, 32]):
        super(LSTMTCNRegressor, self).__init__()
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=lstm_layers, batch_first=True)
        
        # TCN part: using 1D Convolutions
        self.tcn = nn.Sequential(
            nn.Conv1d(hidden_dim, tcn_channels[0], kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(tcn_channels[0], tcn_channels[1], kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)  # output shape: (batch, channels, 1)
        )
        
        self.fc = nn.Linear(tcn_channels[-1], 1)
    
    def forward(self, x):
        # x shape: (batch, time_steps, features)
        out, _ = self.lstm(x)  # (batch, time_steps, hidden_dim)
        out = out.permute(0, 2, 1)  # (batch, hidden_dim, time_steps) for Conv1D
        out = self.tcn(out)  # (batch, channels, 1)
        out = out.squeeze(-1)  # (batch, channels)
        out = self.fc(out)  # (batch, 1)
        return out.squeeze(-1)

Training Loop

In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim
import tqdm

def train_model(model, dataloader, num_epochs=30, lr=1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    model.train()
    for epoch in tqdm.tqdm(range(num_epochs), desc="Training Progress"):
        epoch_loss = 0
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        """ tqdm.tqdm.write(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss/len(dataloader):.4f}")
        tqdm.tqdm.set_description(f"Training Progress (Epoch {epoch+1} Loss: {epoch_loss/len(dataloader):.4f})") """
    
    return model

In [15]:
dataset = CropYieldDataset(crop_name="corn")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: (nn.utils.rnn.pad_sequence([i[0] for i in x], batch_first=True), torch.stack([i[1] for i in x])))

# Example input_dim = number of weather features
input_dim = next(iter(dataloader))[0].shape[-1]

model = LSTMTCNRegressor(input_dim=input_dim)
trained_model = train_model(model, dataloader)

Training Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1/30 - Loss: 37302.2196





AttributeError: 'str' object has no attribute 'desc'