In [1]:
import pandas as pd
import numpy as np
from tabpfn import TabPFNRegressor
from sklearn.metrics import r2_score, mean_squared_error

# 1. Load the three pre-split datasets (paths are relative to the working directory)
train, val, test = map(pd.read_csv, ('train.csv', 'val.csv', 'test.csv'))

In [2]:
from sklearn.metrics import r2_score, root_mean_squared_error

In [3]:

# 1. Load raw CSVs (re-read here so this cell always starts from unmodified data)
train, val, test = (pd.read_csv(path) for path in ('train.csv', 'val.csv', 'test.csv'))

# 2. Impute missing values (TRAIN-only statistics)
#    Every fill value is computed from `train` alone and then reused for
#    val/test, so no statistic is derived from the evaluation splits.
mean_cols  = ['Lodging','PlantHeight','SeedSize','Protein','Oil']
mg_mode    = train['MG'].mode()[0]           # most frequent MG value in train
lon_med    = train['Longitude'].median()     # median Longitude in train
mean_vals  = train[mean_cols].mean()         # per-column train means

# One lookup table: column name -> value used to replace its NaNs.
fill_values = {'MG': mg_mode, 'Longitude': lon_med}
fill_values.update({c: mean_vals[c] for c in mean_cols})

for df in (train, val, test):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# 3. Feature definitions
#    Column groups consumed by aggregate_sequences(); each group is reduced
#    with a different statistic when a time series is collapsed to one row.

# Summarised per series with mean and std.
temporal_feats = [
    'MaxTemp',
    'MinTemp',
    'AvgTemp',
    'AvgHumidity',
    'Precipitation',
    'Radiation',
]

# Summarised per series by taking the first value.
static_feats = [
    'Latitude',
    'Longitude',
    'Row.Spacing',
]

# Summarised per series by taking the first value (MG is handled separately, by mode).
plant_feats = [
    'Lodging',
    'PlantHeight',
    'SeedSize',
    'Protein',
    'Oil',
]

# Cluster_0 ... Cluster_39, summarised per series with mean and std.
cluster_feats = [f'Cluster_{i}' for i in range(40)]

# 4. Aggregation function
def aggregate_sequences(df, target='Yield', agg_target='mean'):
    """Collapse every ``TimeSeriesLabel`` group of ``df`` into a single row.

    Output columns, in order:
      * ``<col>_mean`` / ``<col>_std`` pairs for each column in ``temporal_feats``
      * the first value of each column in ``static_feats``
      * ``MG``: the first entry of the group's ``mode()``
      * the first value of each column in ``plant_feats``
      * ``<col>_mean`` / ``<col>_std`` pairs for each column in ``cluster_feats``
      * ``target``: the group mean when ``agg_target == 'mean'``, otherwise the
        group's last value

    The group labels are discarded: the result carries a fresh 0..n-1 index.
    """
    def mean_std_specs(columns):
        # Interleave the two statistics so each column's mean sits next to its std.
        specs = {}
        for name in columns:
            specs[f'{name}_mean'] = (name, 'mean')
            specs[f'{name}_std'] = (name, 'std')
        return specs

    def first_specs(columns):
        return {name: (name, 'first') for name in columns}

    # Any agg_target other than 'mean' falls back to "last value in the group".
    target_reducer = 'mean' if agg_target == 'mean' else (lambda x: x.iloc[-1])

    # Dict insertion order determines the column order of the result.
    named_aggs = {
        **mean_std_specs(temporal_feats),
        **first_specs(static_feats),
        'MG': ('MG', lambda x: x.mode().iloc[0]),
        **first_specs(plant_feats),
        **mean_std_specs(cluster_feats),
        target: (target, target_reducer),
    }

    grouped = df.groupby('TimeSeriesLabel')
    return grouped.agg(**named_aggs).reset_index(drop=True)

# Collapse each split to one row per time series (default: mean Yield as target)
train_agg, val_agg, test_agg = (
    aggregate_sequences(split) for split in (train, val, test)
)

# 5. Split into X/y
y_train = train_agg['Yield']
X_train = train_agg.drop(columns='Yield')

y_val   = val_agg['Yield']
X_val   = val_agg.drop(columns='Yield')

y_test  = test_agg['Yield']
X_test  = test_agg.drop(columns='Yield')


In [4]:
# 6. Quantization helpers
def fit_quantizer(df, b):
    """Learn a per-column uniform quantization grid with ``2**b`` levels.

    For each column the grid spans the column's observed ``[min, max]`` and
    the spacing between adjacent levels is ``(max - min) / (2**b - 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column minima/maxima define the grids.
    b : int
        Bit depth. Must be at least 1 (one bit gives the two-level grid
        ``{min, max}``).

    Returns
    -------
    dict
        ``{column: (xmin, delta)}``. Columns for which no usable grid exists
        are omitted: constant columns, columns whose min/max are NaN, columns
        with an infinite range, and columns whose step underflows to zero.
        Omitted columns are left untouched by ``apply_quantizer``.

    Raises
    ------
    ValueError
        If ``b < 1``. With ``b == 0`` the step would be a division by zero
        and smaller values give a negative step; both would silently corrupt
        the quantized data instead of failing.
    """
    if b < 1:
        raise ValueError(f"bit depth b must be >= 1, got {b!r}")

    levels = 2**b
    params = {}
    for col in df.columns:
        xmin, xmax = df[col].min(), df[col].max()
        span = xmax - xmin
        # `not span > 0` is True for a zero span (constant column) and for a
        # NaN span (min/max undefined), since comparisons with NaN are False.
        if not span > 0:
            continue
        delta = span / (levels - 1)
        # Reject steps that are unusable as a divisor/multiplier later on:
        # 0 (underflow for very large b), inf (infinite range) or NaN.
        if not 0 < delta < float('inf'):
            continue
        params[col] = (xmin, delta)
    return params

def apply_quantizer(df, params):
    """Return a copy of ``df`` with columns snapped onto their learned grids.

    ``params`` maps a column name to ``(xmin, delta)``. Each value in such a
    column is replaced by ``xmin + delta * k``, where ``k`` is the rounded
    number of steps between the value and ``xmin``. Columns that do not
    appear in ``params`` are copied through unchanged, and ``df`` itself is
    not modified.
    """
    quantized = df.copy()
    for name in params:
        origin, step = params[name]
        # Index of the nearest grid level, counted in steps from the origin.
        grid_index = np.round((quantized[name] - origin) / step)
        quantized[name] = origin + step * grid_index
    return quantized

In [7]:
# 7. Loop over bit-depths and evaluate
#    For each bit depth b: learn the quantization grid on X_train, apply it to
#    all three splits, fit TabPFN on a fixed subsample of the quantized train
#    rows, and report R² / RMSE on val and test.

# The fitting subsample depends only on the row count and the seed, not on b,
# so draw it once. The size is capped at the number of available rows because
# choice(..., replace=False) raises ValueError when asked for more rows than
# exist.
n_fit = min(10_000, len(X_train))
idx   = np.random.RandomState(42).choice(len(X_train), size=n_fit, replace=False)
ysub  = y_train.to_numpy()[idx]

# Targets are not quantized, so convert them once as well.
y_val_arr  = y_val.to_numpy()
y_test_arr = y_test.to_numpy()

for b in [2,4,6,8,16,32,64,128,256]:
    # learn grid on X_train only, then apply the same grid to every split
    q_params = fit_quantizer(X_train, b)
    Xtr_q = apply_quantizer(X_train, q_params).to_numpy()
    Xvl_q = apply_quantizer(X_val,   q_params).to_numpy()
    Xte_q = apply_quantizer(X_test,  q_params).to_numpy()

    # same rows for every bit depth, so results differ only by quantization
    Xsub = Xtr_q[idx]

    model = TabPFNRegressor()
    model.fit(Xsub, ysub)

    # evaluate
    for name, Xq, y in [('Val', Xvl_q, y_val_arr),
                       ('Test', Xte_q, y_test_arr)]:
        preds = model.predict(Xq)
        print(f"b={b} → {name} R²: {r2_score(y,preds):.4f}, RMSE: {root_mean_squared_error(y,preds):.4f}")
    print('-'*40)

b=2 → Val R²: 0.5212, RMSE: 10.3723
b=2 → Test R²: 0.5267, RMSE: 10.2425
----------------------------------------
b=4 → Val R²: 0.7651, RMSE: 7.2655
b=4 → Test R²: 0.7680, RMSE: 7.1704
----------------------------------------
b=6 → Val R²: 0.7741, RMSE: 7.1243
b=6 → Test R²: 0.7779, RMSE: 7.0164
----------------------------------------
b=8 → Val R²: 0.7772, RMSE: 7.0752
b=8 → Test R²: 0.7790, RMSE: 6.9979
----------------------------------------
b=16 → Val R²: 0.7775, RMSE: 7.0705
b=16 → Test R²: 0.7797, RMSE: 6.9869
----------------------------------------
b=32 → Val R²: 0.7775, RMSE: 7.0698
b=32 → Test R²: 0.7798, RMSE: 6.9860
----------------------------------------
b=64 → Val R²: 0.7775, RMSE: 7.0698
b=64 → Test R²: 0.7798, RMSE: 6.9862
----------------------------------------
b=128 → Val R²: 0.7775, RMSE: 7.0698
b=128 → Test R²: 0.7798, RMSE: 6.9862
----------------------------------------
b=256 → Val R²: 0.7775, RMSE: 7.0698
b=256 → Test R²: 0.7798, RMSE: 6.9862
-----------------

# Applying AutoGluon

In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Prepare train/val/test dataframes by joining X and y
# (AutoGluon takes a single frame that contains the label column)
train_autogluon = pd.concat([X_train, y_train.rename("Yield")], axis=1)
val_autogluon   = pd.concat([X_val,   y_val.rename("Yield")], axis=1)
test_autogluon  = pd.concat([X_test,  y_test.rename("Yield")], axis=1)

# Combine train + val for stronger training.
# Both frames carry their own 0..n-1 index, so a plain concat would produce
# duplicate index labels; ignore_index=True gives the merged frame a fresh,
# unique index so label-based row selection stays unambiguous.
full_train = pd.concat([train_autogluon, val_autogluon], axis=0, ignore_index=True)

# Train AutoGluon Predictor (no presets / time limit given, so library defaults apply)
predictor = TabularPredictor(label='Yield', problem_type='regression').fit(train_data=full_train)

# Evaluate on the held-out test split, using the same metric functions as the
# TabPFN sweep so the two sets of numbers are directly comparable
preds = predictor.predict(test_autogluon.drop(columns=['Yield']))
r2 = r2_score(test_autogluon['Yield'], preds)
rmse = root_mean_squared_error(test_autogluon['Yield'], preds)

print(f"[AutoGluon] R²: {r2:.4f}, RMSE: {rmse:.4f}")


No path specified. Models will be saved in: "AutogluonModels\ag-20250726_065718"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.9.23
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          32
Memory Avail:       23.10 GB / 63.70 GB (36.3%)
Disk Space Avail:   651.85 GB / 1883.61 GB (34.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	pr

[1000]	valid_set's rmse: 6.88191
[2000]	valid_set's rmse: 6.49003
[3000]	valid_set's rmse: 6.32587
[4000]	valid_set's rmse: 6.22362
[5000]	valid_set's rmse: 6.16246
[6000]	valid_set's rmse: 6.11801
[7000]	valid_set's rmse: 6.08595
[8000]	valid_set's rmse: 6.06226
[9000]	valid_set's rmse: 6.04346
[10000]	valid_set's rmse: 6.02306


	-6.0227	 = Validation score   (-root_mean_squared_error)
	16.52s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 6.51626
[2000]	valid_set's rmse: 6.20191
[3000]	valid_set's rmse: 6.07512
[4000]	valid_set's rmse: 6.03353
[5000]	valid_set's rmse: 6.00412
[6000]	valid_set's rmse: 6.00156
[7000]	valid_set's rmse: 5.98794
[8000]	valid_set's rmse: 5.98226
[9000]	valid_set's rmse: 5.98325
[10000]	valid_set's rmse: 5.98357


	-5.9796	 = Validation score   (-root_mean_squared_error)
	15.38s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-6.2361	 = Validation score   (-root_mean_squared_error)
	11.41s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-6.0113	 = Validation score   (-root_mean_squared_error)
	70.07s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-6.0757	 = Validation score   (-root_mean_squared_error)
	5.74s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
		Import fastai failed. A quick tip is to install via `pip install autogluon.tabular[fastai]==1.3.1`. 
Fitting model: XGBoost ...
	-6.0171	 = Validation score   (-root_mean_squared_error)
	13.3s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTorch ...
		__init__() got an unexpected keyword argument 'force_int_remainder_cols'
Detailed Traceback:
Traceback (most recent call

[1000]	valid_set's rmse: 6.21494
[2000]	valid_set's rmse: 6.09353
[3000]	valid_set's rmse: 6.05084
[4000]	valid_set's rmse: 6.04852


	-6.0423	 = Validation score   (-root_mean_squared_error)
	14.85s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBM': 0.238, 'XGBoost': 0.238, 'LightGBMXT': 0.19, 'ExtraTreesMSE': 0.143, 'KNeighborsDist': 0.095, 'RandomForestMSE': 0.048, 'CatBoost': 0.048}
	-5.8738	 = Validation score   (-root_mean_squared_error)
	0.02s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 151.47s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 7562.6 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("c:\Users\mohd7\TabPFN-data\Weather\AutogluonModels\ag-20250726_065718")


[AutoGluon] R²: 0.8462, RMSE: 5.8391
