In [None]:
# %%
import pandas as pd
import numpy as np
from tabpfn import TabPFNRegressor
from sklearn.metrics import r2_score, root_mean_squared_error

# %%
# 1. Read the train / validation / test splits from CSV files (paths are relative)
train, val, test = map(pd.read_csv, ('train.csv', 'val.csv', 'test.csv'))

In [None]:
# 2. Report the shape and column names of every split
for name, df in zip(('train', 'val', 'test'), (train, val, test)):
    print(f"\n=== {name.upper()} ===")
    print(f"Shape: {df.shape}")
    print("Columns:", df.columns.tolist())

# %%
# 3. Report the per-column null counts of every split (only columns that have nulls)
for name, df in zip(('train', 'val', 'test'), (train, val, test)):
    null_counts = df.isnull().sum()
    print(f"\n{name.upper()} missing values per column:")
    if null_counts.any():
        print(null_counts[null_counts > 0])
    else:
        print("No missing values!")

# %% [markdown]
# # Data Preprocessing

# %%
# 1. Derive every fill value from TRAIN only, so val/test statistics are never used
mean_cols    = ['Lodging','PlantHeight','SeedSize','Protein','Oil']
mg_mode      = train['MG'].mode()[0]           # most frequent MG value
lon_median   = train['Longitude'].median()     # median longitude
mean_values  = train[mean_cols].mean()         # per-column means

# column -> fill value, listed in the order the columns are imputed
fill_values = {'MG': mg_mode, 'Longitude': lon_median}
fill_values.update({col: mean_values[col] for col in mean_cols})

# 2. Impute train/val/test with the train-derived values (columns are reassigned on each frame)
for df in (train, val, test):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# 3. Print the remaining null counts of the imputed columns for every split
checked_cols = ['MG', 'Longitude', *mean_cols]
for name, df in zip(('train', 'val', 'test'), (train, val, test)):
    rem = df[checked_cols].isnull().sum()
    print(f"{name.upper()} remaining nulls:\n", rem)

# 4. Feature lists consumed by the sequence aggregation below
temporal_feats = ['MaxTemp','MinTemp','AvgTemp','AvgHumidity','Precipitation','Radiation']  # 6
static_feats   = ['Latitude','Longitude','Row.Spacing']                                    # 3

# %% [markdown]
# # Sequence Aggregation

# %%
# 5. Aggregate each sequence → one row
def aggregate_sequences(df, target='Yield', agg_target='mean',
                        temporal_cols=None, static_cols=None,
                        group_col='TimeSeriesLabel'):
    """Collapse every sequence in ``df`` into a single feature row.

    Rows are grouped by ``group_col``. For each group:

    * every temporal column ``c`` yields ``c_mean`` and ``c_std``
      (pandas' ``std`` is the sample standard deviation, so a sequence
      consisting of a single row produces NaN for ``c_std``);
    * every static column is reduced with the ``first`` aggregate;
    * the target is reduced to its mean or to the value in the group's
      last row, depending on ``agg_target``.

    Args:
        df: Long-format frame with one row per time step.
        target: Name of the target column.
        agg_target: ``'mean'`` or ``'final'`` (last row of each group).
        temporal_cols: Columns summarised by mean/std. Defaults to the
            module-level ``temporal_feats`` list when omitted.
        static_cols: Columns summarised by ``first``. Defaults to the
            module-level ``static_feats`` list when omitted.
        group_col: Column identifying the sequence each row belongs to.

    Returns:
        DataFrame with one row per group (group order as produced by
        ``groupby``) and a fresh integer index; the group key itself is
        dropped.

    Raises:
        ValueError: If ``agg_target`` is neither ``'mean'`` nor ``'final'``.
    """
    # Validate the option first so a typo fails before any aggregation work.
    if agg_target == 'mean':
        target_agg = 'mean'
    elif agg_target == 'final':
        target_agg = lambda x: x.iloc[-1]
    else:
        raise ValueError("agg_target must be 'mean' or 'final'")

    # Fall back to the notebook-level feature lists for existing callers.
    if temporal_cols is None:
        temporal_cols = temporal_feats
    if static_cols is None:
        static_cols = static_feats

    # Named aggregation spec: output column -> (input column, reducer)
    agg_dict = {}
    for feat in temporal_cols:
        agg_dict[feat + '_mean'] = (feat, 'mean')
        agg_dict[feat + '_std']  = (feat, 'std')
    for feat in static_cols:
        agg_dict[feat] = (feat, 'first')
    agg_dict[target] = (target, target_agg)

    grouped = df.groupby(group_col).agg(**agg_dict)
    return grouped.reset_index(drop=True)

# Collapse each split to one row per sequence; the target becomes the per-sequence mean
train_agg, val_agg, test_agg = (
    aggregate_sequences(split, agg_target='mean')
    for split in (train, val, test)
)

print(f"Train aggregated shape: {train_agg.shape}")
print(f"Val aggregated shape: {val_agg.shape}")
print(f"Test aggregated shape: {test_agg.shape}")

# %%
# 6. Separate the feature columns from the 'Yield' target for each aggregated split
X_train, y_train = train_agg.drop(columns='Yield'), train_agg['Yield']
X_val,   y_val   = val_agg.drop(columns='Yield'),   val_agg['Yield']
X_test,  y_test  = test_agg.drop(columns='Yield'),  test_agg['Yield']

# %% [markdown]
# # Baseline Model (No Quantization)

# %%
# 7. Subsample (at most) 10k aggregated rows and fit TabPFN
np.random.seed(42)
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (replace=False), so cap the request at the size of the aggregated train set.
n_sub     = min(10_000, len(train_agg))
train_sub = train_agg.sample(n=n_sub, random_state=42)
X_sub     = train_sub.drop('Yield', axis=1).to_numpy()
y_sub     = train_sub['Yield'].to_numpy()

model = TabPFNRegressor()
model.fit(X_sub, y_sub)

# 8. Evaluate on the full val/test sets (no subsampling)
print("=== BASELINE RESULTS (No Quantization) ===")
for name, X_np, y_np in [
        ('Val',  X_val.to_numpy(),  y_val.to_numpy()),
        ('Test', X_test.to_numpy(), y_test.to_numpy())
    ]:
    preds = model.predict(X_np)
    print(f"{name} R²   : {r2_score(y_np, preds):.4f}")
    print(f"{name} RMSE : {root_mean_squared_error(y_np, preds):.4f}")

# %% [markdown]
# # Quantization Helper

# %%
def quantize_df(df: pd.DataFrame, b: int, ref: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """
    Quantize each numeric column of df to 2^b evenly spaced levels.

    The levels span [min, max] of the column. By default the range is taken
    from ``df`` itself; pass ``ref`` to take it from another frame instead
    (e.g. quantize a validation split on the training grid). Values that
    fall outside the reference range are clipped to the nearest end level.

    Columns are left unchanged when they are non-numeric or boolean, when
    the reference range is constant, or when the reference column has no
    non-null values. ``df`` is never modified.

    Args:
        df: DataFrame to quantize
        b: Number of bits (2^b levels); must be >= 1
        ref: Optional frame supplying the per-column min/max. It must
            contain every numeric column of ``df``.

    Returns:
        Quantized copy of df

    Raises:
        ValueError: If b < 1 (a single level would make the step size undefined).
        KeyError: If ``ref`` is given and lacks a numeric column of ``df``.
    """
    if b < 1:
        raise ValueError("b must be >= 1 (at least two quantization levels)")

    df_q = df.copy()
    levels = 2 ** b
    grid_source = df_q if ref is None else ref

    for col in df_q.columns:
        dtype = df_q[col].dtype
        # Arithmetic below is only defined for real numeric data.
        if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
            continue

        xmin, xmax = grid_source[col].min(), grid_source[col].max()
        # No usable range: all-null reference column or constant column.
        if pd.isna(xmin) or pd.isna(xmax) or xmax == xmin:
            continue

        delta = (xmax - xmin) / (levels - 1)
        # Index of the nearest level; clipping only has an effect when `ref`
        # is given and df holds values outside the reference range.
        steps = ((df_q[col] - xmin) / delta).round().clip(lower=0, upper=levels - 1)
        df_q[col] = xmin + delta * steps

    return df_q

# %% [markdown]
# # Apply Quantization with Different Bit Levels

# %%
# Store results for comparison.
# Predict once per split and reuse the predictions for both metrics.
y_val_np  = y_val.to_numpy()
y_test_np = y_test.to_numpy()
baseline_val_preds  = model.predict(X_val.to_numpy())
baseline_test_preds = model.predict(X_test.to_numpy())

results = {
    'baseline': {
        'val_r2': r2_score(y_val_np, baseline_val_preds),
        'val_rmse': root_mean_squared_error(y_val_np, baseline_val_preds),
        'test_r2': r2_score(y_test_np, baseline_test_preds),
        'test_rmse': root_mean_squared_error(y_test_np, baseline_test_preds)
    }
}

# DataFrame versions of the feature matrices for quantize_df.
# They do not depend on b, so build them once outside the loop.
X_sub_df = pd.DataFrame(X_sub, columns=X_train.columns)
X_val_df = pd.DataFrame(X_val.to_numpy(), columns=X_val.columns)
X_test_df = pd.DataFrame(X_test.to_numpy(), columns=X_test.columns)

# Try different quantization levels
for b in [7, 8]:  # 128 and 256 levels
    print(f"\n=== QUANTIZATION with b={b} ({2**b} levels) ===")

    # NOTE(review): each split is quantized on a grid spanning its own
    # min/max, so the train, val and test grids are not identical —
    # confirm this is the intended experimental setup.

    # Quantize training subset
    X_sub_q = quantize_df(X_sub_df, b).to_numpy()

    # Fit a fresh model on the quantized data
    model_q = TabPFNRegressor()
    model_q.fit(X_sub_q, y_sub)

    # Quantize validation/test features
    X_val_q = quantize_df(X_val_df, b).to_numpy()
    X_test_q = quantize_df(X_test_df, b).to_numpy()

    # Store results
    results[f'b={b}'] = {}

    for name, X_q, y in [('val', X_val_q, y_val_np),
                         ('test', X_test_q, y_test_np)]:
        preds = model_q.predict(X_q)
        r2 = r2_score(y, preds)
        rmse = root_mean_squared_error(y, preds)

        results[f'b={b}'][f'{name}_r2'] = r2
        results[f'b={b}'][f'{name}_rmse'] = rmse

        print(f"{name.upper()} R²: {r2:.4f}, RMSE: {rmse:.4f}")

# %% [markdown]
# # Results Summary

# %%
# Create summary DataFrame: one row per entry of `results`
summary_data = [
    {
        'Method': method,
        'Val R²': metrics['val_r2'],
        'Val RMSE': metrics['val_rmse'],
        'Test R²': metrics['test_r2'],
        'Test RMSE': metrics['test_rmse']
    }
    for method, metrics in results.items()
]

summary_df = pd.DataFrame(summary_data)
print("\n=== RESULTS SUMMARY ===")
print(summary_df.to_string(index=False))

# Find the method with the highest validation R² (the 'baseline' row competes too).
# idxmax returns an index *label*, so the row is looked up with .loc, not .iloc.
best_val_r2_idx = summary_df['Val R²'].idxmax()
print(f"\nBest validation R²: {summary_df.loc[best_val_r2_idx, 'Method']}")

# %% [markdown]
# # Visualize Quantization Effect

# %%
import matplotlib.pyplot as plt

# Two side-by-side panels: R² on the left, RMSE on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

methods = summary_df['Method'].values
val_r2 = summary_df['Val R²'].values
test_r2 = summary_df['Test R²'].values
val_rmse = summary_df['Val RMSE'].values
test_rmse = summary_df['Test RMSE'].values

# (axis, validation series, test series, y-label, title) for each panel
panels = [
    (ax1, val_r2,   test_r2,   'R² Score', 'R² Score vs Quantization Level'),
    (ax2, val_rmse, test_rmse, 'RMSE',     'RMSE vs Quantization Level'),
]

for ax, val_scores, test_scores, ylabel, title in panels:
    ax.plot(methods, val_scores, 'bo-', label='Validation')
    ax.plot(methods, test_scores, 'ro-', label='Test')
    ax.set_xlabel('Quantization Method')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Rotate the tick labels matplotlib already derived from `methods`.
    # set_xticklabels is only meant to be used after the tick positions have
    # been fixed; otherwise labels may end up in unexpected positions.
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# %%
# Optional: Examine quantization effect on a single feature.
# One scatter panel per bit-width: original value (x) against quantized value (y).
feature_to_examine = 'MaxTemp_mean'
if feature_to_examine in X_train.columns:
    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.flatten()

    original_values = X_train[feature_to_examine].values[:1000]  # First 1000 samples
    # End points of the identity line drawn in every panel
    value_range = [original_values.min(), original_values.max()]

    for i, b in enumerate([3, 4, 5, 6, 7, 8]):
        ax = axes[i]
        quantized_values = quantize_df(
            pd.DataFrame({feature_to_examine: original_values}), b
        )[feature_to_examine].values

        ax.scatter(original_values, quantized_values, alpha=0.5, s=1)
        ax.plot(value_range, value_range, 'r--', label='y=x')
        ax.set_xlabel('Original Values')
        ax.set_ylabel('Quantized Values')
        ax.set_title(f'b={b} ({2**b} levels)')
        ax.grid(True, alpha=0.3)
        # Without a legend the 'y=x' label on the reference line is never shown.
        ax.legend()

    plt.suptitle(f'Quantization Effect on {feature_to_examine}')
    plt.tight_layout()
    plt.show()