In [1]:
import pandas as pd
import numpy as np
from tabpfn import TabPFNRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load the three pre-split CSV files (paths are relative to the working directory).
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'val.csv', 'test.csv')
)


In [2]:
# Imputation statistics are taken from the TRAIN split only, so the fill
# values carry no information from val/test.
mg_mode = train['MG'].mode()[0]            # most frequent MG value
lon_median = train['Longitude'].median()   # median longitude
mean_cols = ['Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil']
mean_values = train[mean_cols].mean()      # per-column means of the plant traits

# Single column -> fill-value lookup, applied to every split in place.
fill_values = {'MG': mg_mode, 'Longitude': lon_median}
fill_values.update((col, mean_values[col]) for col in mean_cols)

for df in (train, val, test):
    for col, fill in fill_values.items():
        df[col] = df[col].fillna(fill)

# 3. Feature lists and the aggregation helper are defined in the next cell

In [3]:
# Column groups read by aggregate_sequences(); they must exist at module level
# before that function is called.
temporal_feats = [
    'MaxTemp', 'MinTemp', 'AvgTemp',
    'AvgHumidity', 'Precipitation', 'Radiation',
]
static_feats = ['Latitude', 'Longitude', 'Row.Spacing']
plant_feats = ['Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil']
cluster_feats = [f'Cluster_{idx}' for idx in range(40)]

def aggregate_sequences(df, target='Yield', agg_target='mean', *,
                        group_col='TimeSeriesLabel',
                        temporal=None, static=None, plant=None, cluster=None):
    """Collapse every sequence in ``df`` into one row of summary features.

    Rows are grouped by ``group_col`` and reduced as follows, in this
    column order:

    * temporal features -> ``<feat>_mean`` and ``<feat>_std``
    * static features   -> first non-null value of the group (pandas ``first``)
    * ``MG``            -> most frequent value (smallest one on ties)
    * plant features    -> first non-null value of the group
    * cluster features  -> ``<feat>_mean`` and ``<feat>_std``
    * ``target``        -> group mean, or the value in the group's last row

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with one row per time step.
    target : str
        Name of the target column.
    agg_target : {'mean', 'final'}
        How the target is reduced per sequence.
    group_col : str, keyword-only
        Column identifying the sequence each row belongs to.
    temporal, static, plant, cluster : sequence of str or None, keyword-only
        Column groups to aggregate. ``None`` falls back to the module-level
        ``temporal_feats`` / ``static_feats`` / ``plant_feats`` /
        ``cluster_feats`` lists.

    Returns
    -------
    pandas.DataFrame
        One row per sequence, ordered by sorted group key, with a fresh
        integer index (the group key itself is not kept as a column).

    Raises
    ------
    ValueError
        If ``agg_target`` is neither ``'mean'`` nor ``'final'``.
    """
    # Validate first so a bad argument fails before any work is done.
    if agg_target not in ('mean', 'final'):
        raise ValueError("agg_target must be 'mean' or 'final'")

    temporal = temporal_feats if temporal is None else list(temporal)
    static = static_feats if static is None else list(static)
    plant = plant_feats if plant is None else list(plant)
    cluster = cluster_feats if cluster is None else list(cluster)

    def _first_mode(values):
        # Series.mode() ignores NaN, so a group with no observed value gives
        # an empty result; report NaN instead of raising IndexError.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else float('nan')

    def _last_row(values):
        # Positional last row (unlike the built-in 'last', a trailing NaN is kept).
        return values.iloc[-1]

    agg_dict = {}
    std_sources = []  # features that receive a '<feat>_std' column

    # 1. temporal: mean & std
    for feat in temporal:
        agg_dict[f'{feat}_mean'] = (feat, 'mean')
        agg_dict[f'{feat}_std'] = (feat, 'std')
        std_sources.append(feat)

    # 2. static geography: expected to be constant within a sequence
    for feat in static:
        agg_dict[feat] = (feat, 'first')

    # 3. plant features: MG by mode, the rest by first value
    agg_dict['MG'] = ('MG', _first_mode)
    for feat in plant:
        agg_dict[feat] = (feat, 'first')

    # 4. cluster indicators: share of time steps in each cluster + variability
    for feat in cluster:
        agg_dict[f'{feat}_mean'] = (feat, 'mean')
        agg_dict[f'{feat}_std'] = (feat, 'std')
        std_sources.append(feat)

    # 5. target
    agg_dict[target] = (target, 'mean' if agg_target == 'mean' else _last_row)

    groups = df.groupby(group_col)
    grouped = groups.agg(**agg_dict)

    if std_sources:
        # The sample std (ddof=1) of a single observation is NaN, which would
        # flow into scalers and models downstream. A lone observation has no
        # spread, so report 0.0 there. Groups with no observation at all keep
        # NaN so that missing data stays visible.
        std_cols = [f'{feat}_std' for feat in std_sources]
        single_obs = groups[std_sources].count().eq(1).to_numpy()
        grouped[std_cols] = grouped[std_cols].mask(single_obs, 0.0)

    return grouped.reset_index(drop=True)


In [4]:
# Reduce each split to one row per sequence, averaging the target.
train_agg, val_agg, test_agg = (
    aggregate_sequences(split_df, agg_target='mean')
    for split_df in (train, val, test)
)

In [5]:
# Separate the regression target ('Yield') from the predictors in every split.
X_train, y_train = train_agg.drop(columns='Yield'), train_agg['Yield']
X_val, y_val = val_agg.drop(columns='Yield'), val_agg['Yield']
X_test, y_test = test_agg.drop(columns='Yield'), test_agg['Yield']

In [6]:
# %% [MLP baseline: PyTorch, early stopping, proper preprocessing]
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import r2_score, mean_squared_error

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred`` as a Python float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# ---------- 1) Preprocess (fit on TRAIN only) ----------
# Numeric columns are standardised; every other column is integer-coded, with
# categories not seen during fit mapped to -1. Unselected columns are dropped.
num_sel = make_column_selector(dtype_include=np.number)
cat_sel = make_column_selector(dtype_exclude=np.number)

pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_sel),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_sel),
    ],
    remainder="drop",
).fit(X_train)

# float32 feature matrices for torch
Xtr, Xva, Xte = (
    pre.transform(features).astype(np.float32)
    for features in (X_train, X_val, X_test)
)

# float32 column vectors (n, 1) so targets line up with the model's output shape
ytr, yva, yte = (
    labels.to_numpy(dtype=np.float32).reshape(-1, 1)
    for labels in (y_train, y_val, y_test)
)

# ---------- 2) DataLoaders ----------
BATCH_SIZE = 1024

train_ds = TensorDataset(torch.from_numpy(Xtr), torch.from_numpy(ytr))
val_ds = TensorDataset(torch.from_numpy(Xva), torch.from_numpy(yva))

# Only the training batches are reshuffled; partial final batches are kept.
# NOTE(review): with drop_last=False, a training set whose size is
# 1 modulo BATCH_SIZE ends on a batch of one row, which BatchNorm1d rejects
# in training mode -- confirm the split size or guard against it.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

# The test split is small enough to be scored in one pass, so it stays as plain tensors.
test_tensorX = torch.from_numpy(Xte)
test_tensory = torch.from_numpy(yte)

# ---------- 3) Model ----------
class MLP(nn.Module):
    """Feed-forward regressor with a single scalar output.

    The network is a stack of ``Linear -> ReLU -> BatchNorm1d -> Dropout``
    stages followed by a final ``Linear(..., 1)`` head.

    Parameters
    ----------
    in_dim : int
        Number of input features.
    hidden_dims : sequence of int
        Width of each hidden stage. The default reproduces the original
        fixed 256-256-128 architecture.
    dropouts : sequence of float
        Dropout probability applied at the end of each hidden stage; must
        have the same length as ``hidden_dims``.

    Raises
    ------
    ValueError
        If ``hidden_dims`` and ``dropouts`` differ in length.
    """

    def __init__(self, in_dim, hidden_dims=(256, 256, 128), dropouts=(0.2, 0.2, 0.1)):
        super().__init__()
        if len(hidden_dims) != len(dropouts):
            raise ValueError("hidden_dims and dropouts must have the same length")

        # Layers are appended in the same order as the original hand-written
        # Sequential, so state_dict keys (net.0.weight, ...) are unchanged
        # for the default configuration.
        layers = []
        width = in_dim
        for out_width, p_drop in zip(hidden_dims, dropouts):
            layers.extend([
                nn.Linear(width, out_width), nn.ReLU(),
                nn.BatchNorm1d(out_width), nn.Dropout(p_drop),
            ])
            width = out_width
        layers.append(nn.Linear(width, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map a ``(batch, in_dim)`` tensor to ``(batch, 1)`` predictions."""
        return self.net(x)

# Seed torch's global RNG before any weights are drawn so that weight
# initialisation, dropout masks and the training DataLoader's shuffling
# repeat from run to run. (Some GPU kernels can still be nondeterministic.)
SEED = 42
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(in_dim=Xtr.shape[1]).to(device)

# ---------- 4) Train loop (early stopping on Val RMSE) ----------
# EPOCHS is the hard cap on training length; PATIENCE is how many consecutive
# epochs without a validation improvement are tolerated before stopping.
EPOCHS = 200
PATIENCE = 20
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
# The scheduler watches the value passed to scheduler.step() below (validation
# RMSE, lower is better) and scales the learning rate by 0.5 when it plateaus.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=5)
loss_fn = nn.MSELoss()

# Early-stopping bookkeeping: best score so far, a snapshot of the weights that
# produced it, and the number of epochs since the last improvement.
best_val = float("inf")
best_state = None
no_improve = 0

for epoch in range(1, EPOCHS + 1):
    # --- training pass: one optimisation step per mini-batch ---
    model.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        xb = xb.to(device); yb = yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        # Cap the global gradient norm at 2.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
        optimizer.step()
        # loss is a batch mean; weight it by batch size so the epoch figure
        # is a true per-sample mean even when the last batch is smaller.
        train_loss += loss.item() * xb.size(0)
    train_loss /= len(train_loader.dataset)

    # --- validation pass: no gradients, eval-mode BatchNorm/Dropout ---
    model.eval()
    with torch.no_grad():
        va_preds = []
        va_targets = []
        for xb, yb in val_loader:
            xb = xb.to(device)
            pred = model(xb).cpu().numpy().ravel()
            va_preds.append(pred)
            # Targets never leave the CPU; they are only needed for scoring.
            va_targets.append(yb.numpy().ravel())
        va_preds = np.concatenate(va_preds)
        va_targets = np.concatenate(va_targets)
        va_rmse = rmse(va_targets, va_preds)

    # Let the scheduler decide whether to lower the learning rate.
    scheduler.step(va_rmse)

    # Count an epoch as an improvement only if it beats the best score by
    # more than 1e-6.
    if va_rmse < best_val - 1e-6:
        best_val = va_rmse
        # Clone to CPU so the snapshot does not alias the live parameters.
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        no_improve = 0
    else:
        no_improve += 1

    # Log the first epoch and every tenth one.
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Train MSE: {train_loss:.5f} | Val RMSE: {va_rmse:.4f}")

    if no_improve >= PATIENCE:
        print(f"Early stopping at epoch {epoch}. Best Val RMSE: {best_val:.4f}")
        break

# Restore the weights from the best validation epoch (if any epoch improved).
if best_state is not None:
    model.load_state_dict({k: v.to(device) for k, v in best_state.items()})

# ---------- 5) Final eval on Val/Test ----------
def predict_torch(net, X):
    """Run ``net`` on ``X`` without gradients and return a flat NumPy array.

    Parameters
    ----------
    net : torch.nn.Module
        Model to evaluate. It is switched to eval mode and left in that mode.
    X : torch.Tensor
        Input batch; it is moved to the device that holds ``net``'s
        parameters before the forward pass.

    Returns
    -------
    numpy.ndarray
        The model output moved to the CPU and flattened to 1-D.
    """
    net.eval()
    # Use the device the weights actually live on rather than a notebook-level
    # ``device`` global, so the helper also works for a model that was moved
    # (or created) independently of that global. A module without parameters
    # runs wherever the input already is.
    first_param = next(net.parameters(), None)
    target_device = X.device if first_param is None else first_param.device
    with torch.no_grad():
        preds = net(X.to(target_device)).cpu().numpy().ravel()
    return preds

# Score the restored best model on the validation and test splits.
val_preds = predict_torch(model, torch.from_numpy(Xva))
test_preds = predict_torch(model, test_tensorX)

# Flatten the (n, 1) target columns once so they match the 1-D predictions.
yva_flat, yte_flat = yva.ravel(), yte.ravel()

print(f"MLP  Val R²  : {r2_score(yva_flat, val_preds):.4f}")
print(f"MLP  Val RMSE: {rmse(yva_flat, val_preds):.4f}")
print(f"MLP  Test R² : {r2_score(yte_flat, test_preds):.4f}")
print(f"MLP  Test RMSE:{rmse(yte_flat, test_preds):.4f}")

Epoch 001 | Train MSE: 2588.76693 | Val RMSE: 50.7239
Epoch 010 | Train MSE: 86.89405 | Val RMSE: 8.5867
Epoch 020 | Train MSE: 67.00433 | Val RMSE: 7.4442
Epoch 030 | Train MSE: 59.57152 | Val RMSE: 7.0438
Epoch 040 | Train MSE: 55.08892 | Val RMSE: 6.8310
Epoch 050 | Train MSE: 52.29147 | Val RMSE: 6.7542
Epoch 060 | Train MSE: 50.48360 | Val RMSE: 6.6373
Epoch 070 | Train MSE: 49.22202 | Val RMSE: 6.5701
Epoch 080 | Train MSE: 47.58975 | Val RMSE: 6.5068
Epoch 090 | Train MSE: 45.49518 | Val RMSE: 6.4167
Epoch 100 | Train MSE: 44.36301 | Val RMSE: 6.3782
Epoch 110 | Train MSE: 43.89861 | Val RMSE: 6.3323
Epoch 120 | Train MSE: 43.55989 | Val RMSE: 6.3414
Epoch 130 | Train MSE: 42.03292 | Val RMSE: 6.3131
Epoch 140 | Train MSE: 41.77255 | Val RMSE: 6.3085
Epoch 150 | Train MSE: 41.10797 | Val RMSE: 6.2908
Epoch 160 | Train MSE: 40.82531 | Val RMSE: 6.2904
Epoch 170 | Train MSE: 40.95342 | Val RMSE: 6.2910
Early stopping at epoch 172. Best Val RMSE: 6.2835
MLP  Val R²  : 0.8243
MLP  V