In [1]:
# %%
import pandas as pd
import numpy as np
from tabpfn import TabPFNRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.preprocessing import KBinsDiscretizer  # For quantization (binning)

# 1. Load datasets
# Read the three CSV splits from the working directory in a single pass.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'val.csv', 'test.csv')
)

In [2]:
# 2. Impute missing values
# Every fill statistic is computed on `train` only and then reused for `val`
# and `test`, so no information from the held-out splits enters the imputation.
mean_cols    = ['Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil']
mg_mode      = train['MG'].mode().iloc[0]      # most frequent MG value in train
lon_median   = train['Longitude'].median()
mean_values  = train[mean_cols].mean()

# One column -> fill-value map: MG gets the mode, Longitude the median,
# and each column in `mean_cols` its train mean.
fill_values = {'MG': mg_mode, 'Longitude': lon_median, **mean_values.to_dict()}

# Columns are assigned back one by one so the three frames are updated in place.
for df in (train, val, test):
    for col, fill in fill_values.items():
        df[col] = df[col].fillna(fill)

# 3. Define feature groups
# Columns summarised per sequence with mean and std (6 names -> 12 aggregates).
temporal_feats = [
    'MaxTemp', 'MinTemp', 'AvgTemp',
    'AvgHumidity', 'Precipitation', 'Radiation',
]
# Columns carried over once per sequence (6 names).
plant_feats = [
    'Lodging', 'PlantHeight', 'SeedSize',
    'MG', 'Protein', 'Oil',
]
# Location / layout columns (3 names).
geo_feats = ['Latitude', 'Longitude', 'Row.Spacing']
# Cluster_0 ... Cluster_39 (40 names).
cluster_feats = [f'Cluster_{idx}' for idx in range(40)]

# All static features in the order geo -> plant -> cluster: 3 + 6 + 40 = 49.
static_feats = [*geo_feats, *plant_feats, *cluster_feats]

# 4. Aggregate each sequence → one row (updated to include all static feats)
def aggregate_sequences(df, target='Yield', agg_target='mean',
                        temporal=None, static=None, group_key='TimeSeriesLabel'):
    """Collapse every sequence in ``df`` into a single feature row.

    Rows are grouped by ``group_key``; each group becomes one output row.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame holding ``group_key``, ``target`` and every
        temporal / static feature column.
    target : str, default 'Yield'
        Name of the target column.
    agg_target : {'mean', 'final'}, default 'mean'
        How the target is reduced per sequence: its mean, or the value in the
        last row of the group.
    temporal : sequence of str, optional
        Columns summarised as ``<name>_mean`` and ``<name>_std``.
        Defaults to the module-level ``temporal_feats``.
    static : sequence of str, optional
        Columns reduced with pandas' ``'first'`` aggregation (treated as
        constant within a sequence). Defaults to the module-level
        ``static_feats``.
    group_key : str, default 'TimeSeriesLabel'
        Column identifying the sequence each row belongs to.

    Returns
    -------
    pandas.DataFrame
        One row per sequence with a fresh 0..n-1 index (the group key is
        dropped). Column order: ``<t>_mean, <t>_std`` for each temporal
        feature, then the static features, then ``target``.

    Raises
    ------
    ValueError
        If ``agg_target`` is neither ``'mean'`` nor ``'final'``.

    Notes
    -----
    pandas' ``'std'`` aggregation uses ``ddof=1``, so a sequence consisting of
    a single row gets ``NaN`` in its ``*_std`` columns.
    """
    # Validate first so a bad option fails before any aggregation spec is built.
    if agg_target not in ('mean', 'final'):
        raise ValueError("agg_target must be 'mean' or 'final'")

    # Fall back to the notebook-level feature lists when none are passed in.
    temporal = temporal_feats if temporal is None else list(temporal)
    static = static_feats if static is None else list(static)

    agg_dict = {}
    # Temporal: mean & std, interleaved per feature
    for feat in temporal:
        agg_dict[feat + '_mean'] = (feat, 'mean')
        agg_dict[feat + '_std'] = (feat, 'std')
    # Static: one value per sequence
    for feat in static:
        agg_dict[feat] = (feat, 'first')
    # Target: mean of the sequence, or the value in its last row
    if agg_target == 'mean':
        agg_dict[target] = (target, 'mean')
    else:
        agg_dict[target] = (target, lambda x: x.iloc[-1])

    # Named aggregation: each dict key becomes an output column name.
    grouped = df.groupby(group_key).agg(**agg_dict)
    return grouped.reset_index(drop=True)

# Collapse each split to one row per sequence, reducing the target by its mean.
train_agg, val_agg, test_agg = (
    aggregate_sequences(split_frame, agg_target='mean')
    for split_frame in (train, val, test)
)

# Sanity check on the column count:
# 12 temporal aggregates + 49 static + 1 target = 62 columns expected.
print(
    f"Train agg shape: {train_agg.shape}",
    f"Val agg shape: {val_agg.shape}",
    f"Test agg shape: {test_agg.shape}",
    sep="\n",
)

# 5. Split features / target
# For every aggregated split: X_* is all columns except 'Yield', y_* is 'Yield'.
X_train, y_train = train_agg.drop(columns='Yield'), train_agg['Yield']
X_val,   y_val   = val_agg.drop(columns='Yield'),   val_agg['Yield']
X_test,  y_test  = test_agg.drop(columns='Yield'),  test_agg['Yield']

# Settings shared by the baseline and the quantized TabPFN runs.
SUBSAMPLE_SIZE = 10_000  # max number of aggregated training rows passed to each fit
SUBSAMPLE_SEED = 42      # same seed for both runs, so both subsamples pick the same rows


def subsample_rows(frame, n=SUBSAMPLE_SIZE, seed=SUBSAMPLE_SEED):
    """Return a reproducible random subset of ``frame`` with at most ``n`` rows.

    The requested size is capped at ``len(frame)``: without the cap,
    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    the frame holds (sampling is without replacement).
    """
    return frame.sample(n=min(n, len(frame)), random_state=seed)


def split_features_target(frame, target='Yield'):
    """Split ``frame`` into NumPy arrays ``(X, y)`` where ``y`` is ``target``."""
    return frame.drop(columns=target).to_numpy(), frame[target].to_numpy()


def quantize_columns(frame, fitted_discretizer, columns):
    """Return a copy of ``frame`` with ``columns`` replaced by their bin indices.

    ``fitted_discretizer`` must already be fitted on exactly ``columns``;
    the input ``frame`` itself is left unmodified.
    """
    binned = frame.copy()
    binned[columns] = fitted_discretizer.transform(frame[columns])
    return binned


def report_scores(model, tag, splits):
    """Print R² and RMSE of ``model`` for each ``(name, X, y)`` in ``splits``.

    ``X`` is a feature DataFrame and ``y`` the matching target Series; both
    are converted to NumPy before prediction. ``tag`` prefixes every line.
    """
    for name, X_frame, y_series in splits:
        y_np = y_series.to_numpy()
        preds = model.predict(X_frame.to_numpy())
        print(f"{tag} {name} R²   : {r2_score(y_np, preds):.4f}")
        print(f"{tag} {name} RMSE : {root_mean_squared_error(y_np, preds):.4f}")


# Optional: Evaluate baseline with added features (before quantization)
# Subsample the aggregated training rows and fit TabPFN
train_sub = subsample_rows(train_agg)
X_sub, y_sub = split_features_target(train_sub)

model_baseline = TabPFNRegressor()
model_baseline.fit(X_sub, y_sub)

report_scores(
    model_baseline,
    'Baseline (with added feats)',
    [('Val', X_val, y_val), ('Test', X_test, y_test)],
)

# 6. Apply quantization (binning) to continuous features
# Quantized: every temporal aggregate (means, then stds) plus the continuous
# static columns. Left as-is: 'MG' (treated as categorical) and the Cluster_*
# columns (assumed to be discrete already; confirm against the data).
features_to_quantize = (
    [f + '_mean' for f in temporal_feats]
    + [f + '_std' for f in temporal_feats]
    + ['Latitude', 'Longitude', 'Row.Spacing',
       'Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil']
)

# 5 bins per feature; more bins give finer granularity at the risk of overfitting.
# The 'quantile' strategy places bin edges at quantiles of the training data.
n_bins = 5
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile', subsample=None)

# Fit on train only, then apply the same bin edges to every split.
discretizer.fit(X_train[features_to_quantize])
X_train_quant = quantize_columns(X_train, discretizer, features_to_quantize)
X_val_quant   = quantize_columns(X_val,   discretizer, features_to_quantize)
X_test_quant  = quantize_columns(X_test,  discretizer, features_to_quantize)

# 7. Train and evaluate with quantization
# Re-attach the target before sampling so features and target stay row-aligned.
train_sub_quant = subsample_rows(pd.concat([X_train_quant, y_train], axis=1))
X_sub_quant, y_sub_quant = split_features_target(train_sub_quant)

model_quant = TabPFNRegressor()
model_quant.fit(X_sub_quant, y_sub_quant)

report_scores(
    model_quant,
    'Quantized',
    [('Val', X_val_quant, y_val), ('Test', X_test_quant, y_test)],
)

Train agg shape: (86101, 62)
Val agg shape: (10763, 62)
Test agg shape: (10763, 62)
Baseline (with added feats) Val R²   : 0.7759
Baseline (with added feats) Val RMSE : 7.0956
Baseline (with added feats) Test R²   : 0.7784
Baseline (with added feats) Test RMSE : 7.0075




Quantized Val R²   : 0.6739
Quantized Val RMSE : 8.5593
Quantized Test R²   : 0.6817
Quantized Test RMSE : 8.3988
