In [1]:
import pandas as pd
# 1. Read the three dataset splits from CSV files in the working directory
train, val, test = (
    pd.read_csv(path) for path in ('train.csv', 'val.csv', 'test.csv')
)

In [2]:
# 2. Report the shape and the column names of every split
for name, df in zip(('train', 'val', 'test'), (train, val, test)):
    header = f"\n=== {name.upper()} ==="
    print(header)
    print(f"Shape: {df.shape}")
    print("Columns:", list(df.columns))


=== TRAIN ===
Shape: (18425614, 57)
Columns: ['MaxTemp', 'MinTemp', 'AvgTemp', 'AvgHumidity', 'Precipitation', 'Radiation', 'Lodging', 'PlantHeight', 'SeedSize', 'Yield', 'MG', 'Latitude', 'Longitude', 'Row.Spacing', 'TimeSeriesLabel', 'Protein', 'Oil', 'Cluster_0', 'Cluster_1', 'Cluster_2', 'Cluster_3', 'Cluster_4', 'Cluster_5', 'Cluster_6', 'Cluster_7', 'Cluster_8', 'Cluster_9', 'Cluster_10', 'Cluster_11', 'Cluster_12', 'Cluster_13', 'Cluster_14', 'Cluster_15', 'Cluster_16', 'Cluster_17', 'Cluster_18', 'Cluster_19', 'Cluster_20', 'Cluster_21', 'Cluster_22', 'Cluster_23', 'Cluster_24', 'Cluster_25', 'Cluster_26', 'Cluster_27', 'Cluster_28', 'Cluster_29', 'Cluster_30', 'Cluster_31', 'Cluster_32', 'Cluster_33', 'Cluster_34', 'Cluster_35', 'Cluster_36', 'Cluster_37', 'Cluster_38', 'Cluster_39']

=== VAL ===
Shape: (2303282, 57)
Columns: ['MaxTemp', 'MinTemp', 'AvgTemp', 'AvgHumidity', 'Precipitation', 'Radiation', 'Lodging', 'PlantHeight', 'SeedSize', 'Yield', 'MG', 'Latitude', 'Longitu

In [3]:
# 3. Count nulls per column in each split and print only the affected columns
for name, df in zip(('train', 'val', 'test'), (train, val, test)):
    missing = df.isnull().sum()
    print(f"\n{name.upper()} missing values per column:")
    if missing.any():
        print(missing[missing > 0])
    else:
        print("No missing values!")


TRAIN missing values per column:
Lodging           642
PlantHeight       428
SeedSize          214
MG             617604
Longitude        6634
Protein           214
Oil               214
dtype: int64

VAL missing values per column:
MG           79180
Longitude      856
Protein        214
Oil            214
dtype: int64

TEST missing values per column:
MG           78966
Longitude     1070
dtype: int64


# Training

In [4]:
# 1. Imputation statistics are derived from TRAIN only and reused for val/test
mean_cols    = ['Lodging','PlantHeight','SeedSize','Protein','Oil']
mean_values  = train[mean_cols].mean()         # column means for the plant traits
mg_mode      = train['MG'].mode()[0]           # most frequent MG value
lon_median   = train['Longitude'].median()     # median longitude

# 2. One fill value per column, applied identically to every split
fill_values = {'MG': mg_mode, 'Longitude': lon_median}
fill_values.update({c: mean_values[c] for c in mean_cols})
for df in (train, val, test):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# 3. Confirm that the imputed columns no longer contain nulls
for name, df in zip(('train', 'val', 'test'), (train, val, test)):
    rem = df[['MG', 'Longitude', *mean_cols]].isnull().sum()
    print(f"{name.upper()} remaining nulls:\n", rem)

# 4. Feature groups (the counts in the trailing comments add up to 55)
cluster_feats = [f'Cluster_{i}' for i in range(40)]                                       # 40
geo_feats     = ['Latitude','Longitude','Row.Spacing']                                    # 3
plant_feats   = ['Lodging','PlantHeight','SeedSize','MG','Protein','Oil']                  # 6
weather_feats = ['MaxTemp','MinTemp','AvgTemp','AvgHumidity','Precipitation','Radiation']  # 6

target_column   = 'Yield'
feature_columns = [*weather_feats, *plant_feats, *geo_feats, *cluster_feats]

print(f"\nUsing {len(feature_columns)} features: {feature_columns}")
print(f"Target = '{target_column}'")


TRAIN remaining nulls:
 MG             0
Longitude      0
Lodging        0
PlantHeight    0
SeedSize       0
Protein        0
Oil            0
dtype: int64
VAL remaining nulls:
 MG             0
Longitude      0
Lodging        0
PlantHeight    0
SeedSize       0
Protein        0
Oil            0
dtype: int64
TEST remaining nulls:
 MG             0
Longitude      0
Lodging        0
PlantHeight    0
SeedSize       0
Protein        0
Oil            0
dtype: int64

Using 55 features: ['MaxTemp', 'MinTemp', 'AvgTemp', 'AvgHumidity', 'Precipitation', 'Radiation', 'Lodging', 'PlantHeight', 'SeedSize', 'MG', 'Protein', 'Oil', 'Latitude', 'Longitude', 'Row.Spacing', 'Cluster_0', 'Cluster_1', 'Cluster_2', 'Cluster_3', 'Cluster_4', 'Cluster_5', 'Cluster_6', 'Cluster_7', 'Cluster_8', 'Cluster_9', 'Cluster_10', 'Cluster_11', 'Cluster_12', 'Cluster_13', 'Cluster_14', 'Cluster_15', 'Cluster_16', 'Cluster_17', 'Cluster_18', 'Cluster_19', 'Cluster_20', 'Cluster_21', 'Cluster_22', 'Cluster_23', 'Cluster

In [5]:
# Echo the assembled feature list as the cell output for a visual check
feature_columns

['MaxTemp',
 'MinTemp',
 'AvgTemp',
 'AvgHumidity',
 'Precipitation',
 'Radiation',
 'Lodging',
 'PlantHeight',
 'SeedSize',
 'MG',
 'Protein',
 'Oil',
 'Latitude',
 'Longitude',
 'Row.Spacing',
 'Cluster_0',
 'Cluster_1',
 'Cluster_2',
 'Cluster_3',
 'Cluster_4',
 'Cluster_5',
 'Cluster_6',
 'Cluster_7',
 'Cluster_8',
 'Cluster_9',
 'Cluster_10',
 'Cluster_11',
 'Cluster_12',
 'Cluster_13',
 'Cluster_14',
 'Cluster_15',
 'Cluster_16',
 'Cluster_17',
 'Cluster_18',
 'Cluster_19',
 'Cluster_20',
 'Cluster_21',
 'Cluster_22',
 'Cluster_23',
 'Cluster_24',
 'Cluster_25',
 'Cluster_26',
 'Cluster_27',
 'Cluster_28',
 'Cluster_29',
 'Cluster_30',
 'Cluster_31',
 'Cluster_32',
 'Cluster_33',
 'Cluster_34',
 'Cluster_35',
 'Cluster_36',
 'Cluster_37',
 'Cluster_38',
 'Cluster_39']

In [6]:
# Echo the name of the target column as the cell output
target_column

'Yield'

In [7]:
# Number of rows belonging to each TimeSeriesLabel
rows_per_sequence = train.groupby('TimeSeriesLabel').size()

n_sequences = train['TimeSeriesLabel'].nunique()
# Despite its name, this holds the array of DISTINCT sequence lengths, not an average
avg_length   = rows_per_sequence.unique()

print(n_sequences)
print(avg_length)


86101
[214]


In [8]:
# Peek at the first ten distinct sequence labels and count how many there are
sequence_labels = train['TimeSeriesLabel']
print(sequence_labels.unique()[:10])  # first 10 values
print(f"Number of unique TimeSeriesLabel values: {sequence_labels.nunique()}")

[ 1  2  3  4  6  7  8  9 10 11]
Number of unique TimeSeriesLabel values: 86101


In [9]:
import pandas as pd
import numpy as np
from tabpfn import TabPFNRegressor
from sklearn.metrics import r2_score, mean_squared_error

# 1. Re-read the raw CSV splits (this replaces the frames that were imputed earlier)
train, val, test = (
    pd.read_csv(path) for path in ('train.csv', 'val.csv', 'test.csv')
)


In [10]:
# 1. Imputation statistics are derived from TRAIN only and reused for val/test
mean_cols    = ['Lodging','PlantHeight','SeedSize','Protein','Oil']
mean_values  = train[mean_cols].mean()         # column means for the plant traits
mg_mode      = train['MG'].mode()[0]           # most frequent MG value
lon_median   = train['Longitude'].median()     # median longitude

# 2. One fill value per column, applied identically to every split
fill_values = {'MG': mg_mode, 'Longitude': lon_median}
fill_values.update({c: mean_values[c] for c in mean_cols})
for df in (train, val, test):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# 3. Columns used by the per-sequence aggregation
static_feats   = ['Latitude','Longitude','Row.Spacing']                                    # 3
temporal_feats = ['MaxTemp','MinTemp','AvgTemp','AvgHumidity','Precipitation','Radiation']  # 6


In [11]:

# 4. Aggregate each sequence → one row
def aggregate_sequences(df, target='Yield', agg_target='mean',
                        temporal_cols=None, static_cols=None,
                        group_col='TimeSeriesLabel'):
    """Collapse every sequence in ``df`` into a single summary row.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with one row per time step.
    target : str
        Name of the target column.
    agg_target : {'mean', 'final'}
        'mean' averages the target over the sequence; 'final' keeps the
        target value found in the last row of the sequence.
    temporal_cols : list of str, optional
        Columns summarised as ``<col>_mean`` and ``<col>_std``. When omitted,
        the notebook-level ``temporal_feats`` list is used.
    static_cols : list of str, optional
        Columns reduced with pandas' 'first' aggregation. When omitted, the
        notebook-level ``static_feats`` list is used.
    group_col : str
        Column whose values identify a sequence.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_col`` value. The group key itself is
        dropped and the result carries a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``agg_target`` is neither 'mean' nor 'final'.
    """
    # Fail fast, before any aggregation spec is assembled
    if agg_target not in ('mean', 'final'):
        raise ValueError("agg_target must be 'mean' or 'final'")

    # Fall back to the notebook globals only when the caller gave no columns
    if temporal_cols is None:
        temporal_cols = temporal_feats
    if static_cols is None:
        static_cols = static_feats

    # Named-aggregation spec: output column -> (input column, reducer)
    agg_dict = {}
    for feat in temporal_cols:
        agg_dict[feat + '_mean'] = (feat, 'mean')
        agg_dict[feat + '_std']  = (feat, 'std')
    for feat in static_cols:
        agg_dict[feat] = (feat, 'first')

    if agg_target == 'mean':
        agg_dict[target] = (target, 'mean')
    else:
        # positional last row of the group, whatever its value
        agg_dict[target] = (target, lambda x: x.iloc[-1])

    grouped = df.groupby(group_col).agg(**agg_dict)
    return grouped.reset_index(drop=True)

# Reduce every split to one row per sequence, averaging the target
train_agg, val_agg, test_agg = (
    aggregate_sequences(split, agg_target='mean') for split in (train, val, test)
)


In [12]:
# Display the per-sequence aggregated training frame
# NOTE(review): the rendered rows show Longitude values of both -74.248 and 96.660 —
# confirm the positive value is not a sign error in the raw data
train_agg

Unnamed: 0,MaxTemp_mean,MaxTemp_std,MinTemp_mean,MinTemp_std,AvgTemp_mean,AvgTemp_std,AvgHumidity_mean,AvgHumidity_std,Precipitation_mean,Precipitation_std,Radiation_mean,Radiation_std,Latitude,Longitude,Row.Spacing,Yield
0,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,70.1
1,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,54.2
2,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,63.2
3,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,59.0
4,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,66.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86096,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,85.5
86097,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,77.6
86098,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,90.2
86099,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,75.5


In [13]:
# Display the per-sequence aggregated test frame
test_agg

Unnamed: 0,MaxTemp_mean,MaxTemp_std,MinTemp_mean,MinTemp_std,AvgTemp_mean,AvgTemp_std,AvgHumidity_mean,AvgHumidity_std,Precipitation_mean,Precipitation_std,Radiation_mean,Radiation_std,Latitude,Longitude,Row.Spacing,Yield
0,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,71.5
1,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,53.3
2,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,50.8
3,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,52.9
4,21.591636,5.542713,13.296636,5.857184,17.480981,5.559346,78.880047,9.105114,3.880467,7.773874,16.948645,6.906221,40.2162,-74.248,8.0,61.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10758,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,70.2
10759,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,72.6
10760,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,98.4
10761,26.312664,7.165520,13.399860,6.585044,19.750374,6.735163,62.278364,12.020334,2.978738,6.245919,19.075140,11.052535,40.1200,96.660,30.0,74.8


In [14]:
# 5. Separate the target column from the predictors in each aggregated split
y_train, y_val, y_test = train_agg['Yield'], val_agg['Yield'], test_agg['Yield']
X_train = train_agg.drop(columns='Yield')
X_val   = val_agg.drop(columns='Yield')
X_test  = test_agg.drop(columns='Yield')


In [15]:
# Display the aggregated test targets
y_test

0        71.5
1        53.3
2        50.8
3        52.9
4        61.5
         ... 
10758    70.2
10759    72.6
10760    98.4
10761    74.8
10762    62.1
Name: Yield, Length: 10763, dtype: float64

In [16]:
import numpy as np
from tabpfn import TabPFNRegressor
from sklearn.metrics import r2_score,  root_mean_squared_error


# 6. Draw a fixed 10k-row subsample of the aggregated training data and fit TabPFN on it
train_sub = train_agg.sample(n=10_000, random_state=42)
y_sub     = train_sub['Yield'].to_numpy()
X_sub     = train_sub.drop(columns='Yield').to_numpy()

model = TabPFNRegressor()
model.fit(X_sub, y_sub)

# 7. Score the fitted model on the complete validation and test splits
eval_sets = (
    ('Val',  X_val.to_numpy(),  y_val.to_numpy()),
    ('Test', X_test.to_numpy(), y_test.to_numpy()),
)
for name, X_np, y_np in eval_sets:
    preds = model.predict(X_np)
    r2 = r2_score(y_np, preds)
    rmse = root_mean_squared_error(y_np, preds)
    print(f"{name} R²   : {r2:.4f}")
    print(f"{name} RMSE : {rmse:.4f}")


Val R²   : 0.6974
Val RMSE : 8.2461
Test R²   : 0.6942
Test RMSE : 8.2331


APPLYING QUANTIZATION

In [19]:
import numpy as np
import pandas as pd
from tabpfn import TabPFNRegressor
#from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import r2_score,  root_mean_squared_error

# --- 1. Your quantization helper, modified so we compute min/max on TRAIN only ---
def fit_quantizer(df: pd.DataFrame, b: int):
    """Learn a uniform ``2**b``-level grid for each non-constant column of ``df``.

    Returns a dict mapping column name to ``(xmin, delta)``, where ``xmin`` is
    the column minimum and ``delta`` is the spacing between adjacent levels.
    Columns whose minimum equals their maximum get no entry.
    """
    n_intervals = 2**b - 1
    grid = {}
    for name in df.columns:
        series = df[name]
        lo, hi = series.min(), series.max()
        # a constant column has zero range, so there is nothing to quantize
        if hi != lo:
            grid[name] = (lo, (hi - lo) / n_intervals)
    return grid

def apply_quantizer(df: pd.DataFrame, params: dict, b: int) -> pd.DataFrame:
    """Snap the columns listed in ``params`` onto their fitted ``2**b``-level grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to quantize; it is not modified.
    params : dict
        Mapping ``column -> (xmin, delta)`` as produced by ``fit_quantizer``.
    b : int
        Bit depth; the grid of each column has ``2**b`` levels.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every column named in ``params`` is replaced by
        ``xmin + delta * k`` with ``k`` the nearest level index, clipped to
        ``[0, 2**b - 1]``. Columns absent from ``params`` are left untouched.
    """
    # Highest valid level index. Kept as a float so very large bit depths
    # (e.g. b=256) do not overflow an integer dtype during clipping.
    try:
        top_index = float(2**b - 1)
    except OverflowError:
        top_index = float('inf')

    df_q = df.copy()
    for col, (xmin, delta) in params.items():
        level_idx = np.round((df_q[col] - xmin) / delta)
        # Values outside the fitted range would otherwise land on grid points
        # that do not exist in a b-bit code, so clamp them to the end levels.
        level_idx = level_idx.clip(0, top_index)
        df_q[col] = xmin + delta * level_idx
    return df_q

# --- 2. Predictors / target taken from the aggregated frames train_agg, val_agg, test_agg
y_train, y_val, y_test = train_agg['Yield'], val_agg['Yield'], test_agg['Yield']
X_train = train_agg.drop(columns='Yield')
X_val   = val_agg.drop(columns='Yield')
X_test  = test_agg.drop(columns='Yield')

# --- 3. Sweep over bit depths and report R²/RMSE for each ---
for b in [2, 4, 6, 8]:
    # 3a) the grid is learned from TRAIN only ...
    qt_params = fit_quantizer(X_train, b)
    # 3b) ... and then applied unchanged to all three splits
    Xtr_q, Xvl_q, Xte_q = (
        apply_quantizer(X, qt_params, b).to_numpy() for X in (X_train, X_val, X_test)
    )

    # 3c) same seeded 10k-row subsample for every bit depth, then fit
    idx = np.random.RandomState(42).choice(len(Xtr_q), size=10_000, replace=False)
    Xsub = Xtr_q[idx]
    ysub = y_train.to_numpy()[idx]

    model = TabPFNRegressor()
    model.fit(Xsub, ysub)

    # 3d) score on VAL and TEST
    for name, Xq, y in (('Val', Xvl_q, y_val.to_numpy()),
                        ('Test', Xte_q, y_test.to_numpy())):
        preds = model.predict(Xq)
        r2 = r2_score(y, preds)
        rmse = root_mean_squared_error(y, preds)
        print(f"b={b} → {name} R²: {r2:.4f}, RMSE: {rmse:.4f}")
    print("-" * 40)


b=2 → Val R²: 0.4876, RMSE: 10.7300
b=2 → Test R²: 0.4991, RMSE: 10.5362
----------------------------------------
b=4 → Val R²: 0.6942, RMSE: 8.2888
b=4 → Test R²: 0.6902, RMSE: 8.2867
----------------------------------------
b=6 → Val R²: 0.6942, RMSE: 8.2893
b=6 → Test R²: 0.6916, RMSE: 8.2674
----------------------------------------
b=8 → Val R²: 0.6955, RMSE: 8.2712
b=8 → Test R²: 0.6923, RMSE: 8.2587
----------------------------------------


In [20]:
# NOTE(review): b is passed to fit_quantizer as a bit depth (2**b levels). For b of
# roughly 53 and above the grid is finer than float64 resolution, so quantization is
# effectively a no-op — confirm whether level counts rather than bits were intended.
for b in [8, 16, 32, 64, 128, 256]:
    # 3a) the grid is learned from TRAIN only ...
    qt_params = fit_quantizer(X_train, b)
    # 3b) ... and then applied unchanged to all three splits
    Xtr_q, Xvl_q, Xte_q = (
        apply_quantizer(X, qt_params, b).to_numpy() for X in (X_train, X_val, X_test)
    )

    # 3c) same seeded 10k-row subsample for every bit depth, then fit
    idx = np.random.RandomState(42).choice(len(Xtr_q), size=10_000, replace=False)
    Xsub = Xtr_q[idx]
    ysub = y_train.to_numpy()[idx]

    model = TabPFNRegressor()
    model.fit(Xsub, ysub)

    # 3d) score on VAL and TEST
    for name, Xq, y in (('Val', Xvl_q, y_val.to_numpy()),
                        ('Test', Xte_q, y_test.to_numpy())):
        preds = model.predict(Xq)
        r2 = r2_score(y, preds)
        rmse = root_mean_squared_error(y, preds)
        print(f"b={b} → {name} R²: {r2:.4f}, RMSE: {rmse:.4f}")
    print("-" * 40)


b=8 → Val R²: 0.6955, RMSE: 8.2712
b=8 → Test R²: 0.6923, RMSE: 8.2587
----------------------------------------
b=16 → Val R²: 0.6970, RMSE: 8.2510
b=16 → Test R²: 0.6940, RMSE: 8.2351
----------------------------------------
b=32 → Val R²: 0.6971, RMSE: 8.2499
b=32 → Test R²: 0.6940, RMSE: 8.2349
----------------------------------------
b=64 → Val R²: 0.6971, RMSE: 8.2494
b=64 → Test R²: 0.6941, RMSE: 8.2338
----------------------------------------
b=128 → Val R²: 0.6971, RMSE: 8.2494
b=128 → Test R²: 0.6941, RMSE: 8.2338
----------------------------------------
b=256 → Val R²: 0.6971, RMSE: 8.2494
b=256 → Test R²: 0.6941, RMSE: 8.2338
----------------------------------------
