In [1]:
import pandas as pd
import numpy as np
from tabpfn import TabPFNRegressor
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import r2_score, root_mean_squared_error

# ————————————————————————————————
# 1. Load + impute + aggregate (reuse your code)
# ————————————————————————————————
# Read the three pre-split CSV partitions, in train / val / test order.
train, val, test = map(pd.read_csv, ('train.csv', 'val.csv', 'test.csv'))


In [2]:
# Imputation statistics are computed on TRAIN only and reused for every split.
mean_cols = ['Lodging','PlantHeight','SeedSize','Protein','Oil']
mean_vals = train[mean_cols].mean()        # per-column mean
mg_mode   = train['MG'].mode()[0]          # first mode of MG
lon_med   = train['Longitude'].median()

# Column -> fill value lookup: mode for MG, median for Longitude, mean otherwise.
fill_values = {'MG': mg_mode, 'Longitude': lon_med}
for c in mean_cols:
    fill_values[c] = mean_vals[c]

# Missing entries are replaced column by column, in place, in all three frames.
for df in (train, val, test):
    for col, fill in fill_values.items():
        df[col] = df[col].fillna(fill)

# Column groups read by the per-sequence aggregation step.
temporal_feats = [
    'MaxTemp', 'MinTemp', 'AvgTemp',
    'AvgHumidity', 'Precipitation', 'Radiation',
]
static_feats = ['Latitude', 'Longitude', 'Row.Spacing']
plant_feats = ['Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil']
# Cluster_0 ... Cluster_39
cluster_feats = [f'Cluster_{k}' for k in range(40)]

def aggregate_sequences(df, target='Yield', agg_target='mean'):
    """Collapse each 'TimeSeriesLabel' group of `df` into a single row.

    Output columns, in order:
      * `<f>_mean`, `<f>_std` for every feature in `temporal_feats`
      * first value in the group for every feature in `static_feats`
      * `MG`: first mode of the group
      * first value in the group for every feature in `plant_feats`
      * `<f>_mean`, `<f>_std` for every feature in `cluster_feats`
      * `target`: group mean when `agg_target == 'mean'`, otherwise the
        last row's value

    The feature lists are read from module scope. The group label is not
    kept: the index is reset with `drop=True`.
    """
    def mean_and_std(features):
        # Interleave <name>_mean / <name>_std so column order follows `features`.
        spec = {}
        for name in features:
            spec[f'{name}_mean'] = (name, 'mean')
            spec[f'{name}_std'] = (name, 'std')
        return spec

    def first_of(features):
        return {name: (name, 'first') for name in features}

    target_reducer = 'mean' if agg_target == 'mean' else (lambda s: s.iloc[-1])

    # Insertion order of this dict fixes the column order of the result.
    named_aggs = {
        **mean_and_std(temporal_feats),
        **first_of(static_feats),
        'MG': ('MG', lambda s: s.mode().iloc[0]),
        **first_of(plant_feats),
        **mean_and_std(cluster_feats),
        target: (target, target_reducer),
    }
    grouped = df.groupby('TimeSeriesLabel')
    return grouped.agg(**named_aggs).reset_index(drop=True)

# One row per TimeSeriesLabel for each split (default arguments: mean Yield per group).
train_agg, val_agg, test_agg = map(aggregate_sequences, (train, val, test))



In [3]:
# ————————————————————————————————
# 2. Decide uniform vs. quantile
# ————————————————————————————————
# Weather summaries treated as highly skewed → quantile bins
quantile_feats = [
    'Precipitation_mean',
    'Precipitation_std',
    'Radiation_mean',
    'Radiation_std',
]

# Every other numeric column → uniform grid with 2**b levels
uniform_feats = [
    'MaxTemp_mean', 'MaxTemp_std',
    'MinTemp_mean', 'MinTemp_std',
    'AvgTemp_mean', 'AvgTemp_std',
    'AvgHumidity_mean', 'AvgHumidity_std',
    'Latitude', 'Longitude', 'Row.Spacing',
    'MG', 'Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil',
]
# All cluster means first, then all cluster stds.
uniform_feats += [f'Cluster_{k}_mean' for k in range(40)]
uniform_feats += [f'Cluster_{k}_std' for k in range(40)]

# ————————————————————————————————
# 3. Fit transformers on TRAIN only
# ————————————————————————————————
# 3a) uniform
def fit_uniform(df, b):
    """Learn a uniform quantization grid for every column of `df`.

    The grid for a column spans its observed [min, max] with 2**b levels,
    i.e. a step of (max - min) / (2**b - 1).

    Returns a dict mapping column name -> (min, step). Columns whose min
    equals their max are left out, so they are later passed through untouched.
    """
    n_steps = 2**b - 1
    # Lazy (name, min, max) triples, evaluated one column at a time.
    bounds = ((name, df[name].min(), df[name].max()) for name in df.columns)
    return {
        name: (lo, (hi - lo) / n_steps)
        for name, lo, hi in bounds
        if hi != lo
    }

def apply_uniform(df, params):
    df_q = df.copy()
    for col,(xmin,delta) in params.items():
        df_q[col] = xmin + delta*np.round((df_q[col]-xmin)/delta)
    return df_q

# 3b) quantile: five ordinal bins per feature, edges learned from aggregated TRAIN rows
kb = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile').fit(
    train_agg[quantile_feats]
)

# 3c) uniform: per-feature grid origin and step, also learned from TRAIN
b = 8  # bits per feature → 2**8 = 256 levels; free to experiment
uni_params = fit_uniform(train_agg[uniform_feats], b)


In [4]:
# ————————————————————————————————
# 4. Apply to TRAIN / VAL / TEST
# ————————————————————————————————
def hybrid_transform(df):
    """Return a quantized copy of an aggregated frame.

    Columns in `uniform_feats` are snapped with apply_uniform using
    `uni_params`; columns in `quantile_feats` are replaced by the output of
    the fitted `kb` discretizer. All four names are read from module scope at
    call time. Other columns are copied unchanged.
    """
    quantized = df.copy()

    # Uniform grid first ...
    uniform_part = apply_uniform(quantized[uniform_feats], uni_params)
    quantized[uniform_feats] = uniform_part

    # ... then quantile bin indices.
    quantile_part = kb.transform(quantized[quantile_feats])
    quantized[quantile_feats] = quantile_part

    return quantized

# Quantize every split with the transformers fitted on TRAIN.
train_q = hybrid_transform(train_agg)
val_q   = hybrid_transform(val_agg)
test_q  = hybrid_transform(test_agg)

# ————————————————————————————————
# 5. Split, subsample & fit TabPFN
# ————————————————————————————————
# Features = every column except 'Yield'; target = 'Yield' as a NumPy array.
Xtr, ytr = train_q.drop(columns='Yield'), train_q['Yield'].to_numpy()
Xvl, yvl = val_q.drop(columns='Yield'),   val_q['Yield'].to_numpy()
Xte, yte = test_q.drop(columns='Yield'),  test_q['Yield'].to_numpy()

# Reproducible random subset of training rows (fixed seed 42).
# The request is capped at the number of available rows: sampling without
# replacement raises ValueError when asked for more rows than exist.
# NOTE(review): the 10,000 cap is presumably there to keep the TabPFN
# training set small — confirm against the TabPFN limits you target.
n_sub      = min(10_000, len(Xtr))
idx        = np.random.RandomState(42).choice(len(Xtr), size=n_sub, replace=False)
Xsub, ysub = Xtr.to_numpy()[idx], ytr[idx]

model = TabPFNRegressor()
model.fit(Xsub, ysub)

# ————————————————————————————————
# 6. Evaluate
# ————————————————————————————————
# Report R² and RMSE of the fitted model on the validation and test splits.
for name, Xset, yset in [('Val', Xvl, yvl), ('Test', Xte, yte)]:
    preds = model.predict(Xset.to_numpy())
    print(f"{name} R²:   {r2_score(yset, preds):.4f}")
    print(f"{name} RMSE: {root_mean_squared_error(yset, preds):.4f}")


Val R²:   0.7774
Val RMSE: 7.0718
Test R²:   0.7806
Test RMSE: 6.9739


In [5]:
# Re-fit the uniform grid with b = 64, i.e. 2**64 levels per feature
# (fit_uniform uses levels = 2**b) — not the 8 bits / 256 levels of the previous run.
# NOTE(review): a grid this fine is presumably below float64 resolution, which
# would make the uniform step effectively a no-op; if 64 *levels* were intended,
# that corresponds to b = 6 — confirm.
b = 64  # bits per feature for this run
uni_params = fit_uniform(train_agg[uniform_feats], b)

# ————————————————————————————————
# 4. Apply to TRAIN / VAL / TEST
# ————————————————————————————————
def hybrid_transform(df):
    """Return a quantized copy of an aggregated frame.

    Columns in `uniform_feats` are snapped with apply_uniform using
    `uni_params`; columns in `quantile_feats` are replaced by the output of
    the fitted `kb` discretizer. All four names are read from module scope at
    call time, so the current `uni_params` is always the one applied.

    NOTE(review): this re-declares a function of the same name defined in an
    earlier cell; because the globals are looked up at call time, the
    re-definition looks redundant.
    """
    quantized = df.copy()

    # Uniform grid first ...
    uniform_part = apply_uniform(quantized[uniform_feats], uni_params)
    quantized[uniform_feats] = uniform_part

    # ... then quantile bin indices.
    quantile_part = kb.transform(quantized[quantile_feats])
    quantized[quantile_feats] = quantile_part

    return quantized

# Quantize every split with the current uniform parameters and the fitted quantile binner.
train_q = hybrid_transform(train_agg)
val_q   = hybrid_transform(val_agg)
test_q  = hybrid_transform(test_agg)

# ————————————————————————————————
# 5. Split, subsample & fit TabPFN
# ————————————————————————————————
# Features = every column except 'Yield'; target = 'Yield' as a NumPy array.
Xtr, ytr = train_q.drop(columns='Yield'), train_q['Yield'].to_numpy()
Xvl, yvl = val_q.drop(columns='Yield'),   val_q['Yield'].to_numpy()
Xte, yte = test_q.drop(columns='Yield'),  test_q['Yield'].to_numpy()

# Reproducible random subset of training rows (fixed seed 42).
# The request is capped at the number of available rows: sampling without
# replacement raises ValueError when asked for more rows than exist.
n_sub      = min(10_000, len(Xtr))
idx        = np.random.RandomState(42).choice(len(Xtr), size=n_sub, replace=False)
Xsub, ysub = Xtr.to_numpy()[idx], ytr[idx]

model = TabPFNRegressor()
model.fit(Xsub, ysub)

# ————————————————————————————————
# 6. Evaluate
# ————————————————————————————————
# Report R² and RMSE of the fitted model on the validation and test splits.
# NOTE(review): when comparing settings of b, consider selecting on Val only
# so the Test numbers stay an untouched estimate.
for name, Xset, yset in [('Val', Xvl, yvl), ('Test', Xte, yte)]:
    preds = model.predict(Xset.to_numpy())
    print(f"{name} R²:   {r2_score(yset, preds):.4f}")
    print(f"{name} RMSE: {root_mean_squared_error(yset, preds):.4f}")



Val R²:   0.7762
Val RMSE: 7.0918
Test R²:   0.7787
Test RMSE: 7.0039
