In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error



In [3]:

# 1. Load the pre-split train / validation / test CSV files, in that order
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'val.csv', 'test.csv')
)

In [4]:
# 1. Derive every imputation value from the TRAIN split only
mean_cols   = ['Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil']
mg_mode     = train['MG'].mode()[0]         # most frequent MG value
lon_median  = train['Longitude'].median()   # median longitude
mean_values = train[mean_cols].mean()       # per-column means of the plant traits

# 2. Apply the same train-derived values to train, val and test
fill_values = {'MG': mg_mode, 'Longitude': lon_median}
fill_values.update({col: mean_values[col] for col in mean_cols})

for df in (train, val, test):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# 3. Feature lists and the aggregation helper are defined in the next cell

In [5]:
# Column groups consumed by aggregate_sequences(); they must be defined at
# module level before the function is called.
temporal_feats = ['MaxTemp','MinTemp','AvgTemp','AvgHumidity','Precipitation','Radiation']
static_feats   = ['Latitude','Longitude','Row.Spacing']
plant_feats    = ['Lodging','PlantHeight','SeedSize','Protein','Oil']
cluster_feats  = [f'Cluster_{i}' for i in range(40)]


def _first_mode(values):
    """Return the first mode of `values`, or NaN when every entry is missing."""
    modes = values.mode()
    # Series.mode() drops NaN, so an all-missing group yields an empty result;
    # indexing it with .iloc[0] would raise IndexError.
    return modes.iloc[0] if len(modes) else np.nan


def aggregate_sequences(df, target='Yield', agg_target='mean'):
    """Collapse every time series in `df` into a single feature row.

    Rows are grouped by the 'TimeSeriesLabel' column and aggregated as follows:
      * temporal_feats -> '<feat>_mean' and '<feat>_std'
      * static_feats   -> first value of the group
      * 'MG'           -> first mode of the group (NaN if all values are missing)
      * plant_feats    -> first value of the group
      * cluster_feats  -> '<feat>_mean' and '<feat>_std'
      * target         -> group mean ('mean') or last row of the group ('final')

    Note: 'std' is pandas' sample standard deviation, so a sequence that has
    only one row gets NaN in its '*_std' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame containing 'TimeSeriesLabel', 'MG', `target` and all
        columns named in the module-level feature lists.
    target : str, default 'Yield'
        Name of the target column.
    agg_target : {'mean', 'final'}, default 'mean'
        How the target is reduced within each sequence.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'TimeSeriesLabel' (in sorted label order), with a
        fresh 0..n-1 index; the label itself is not kept as a column.

    Raises
    ------
    ValueError
        If `agg_target` is neither 'mean' nor 'final'.
    """
    # Validate up front so a typo fails before any aggregation work is set up.
    if agg_target not in ('mean', 'final'):
        raise ValueError("agg_target must be 'mean' or 'final'")

    agg_dict = {}

    # 1. temporal: mean & std
    for feat in temporal_feats:
        agg_dict[f'{feat}_mean'] = (feat, 'mean')
        agg_dict[f'{feat}_std']  = (feat, 'std')

    # 2. static geography: take first (constant per sequence)
    for feat in static_feats:
        agg_dict[feat] = (feat, 'first')

    # 3. plant features:
    #    - MG (categorical) -> mode, tolerant of all-missing groups
    agg_dict['MG'] = ('MG', _first_mode)
    #    - Lodging, PlantHeight, SeedSize, Protein, Oil -> first
    for feat in plant_feats:
        agg_dict[feat] = (feat, 'first')

    # 4. cluster indicators: proportion of time in each cluster + variability
    for feat in cluster_feats:
        agg_dict[f'{feat}_mean'] = (feat, 'mean')
        agg_dict[f'{feat}_std']  = (feat, 'std')

    # 5. target: mean or final
    if agg_target == 'mean':
        agg_dict[target] = (target, 'mean')
    else:
        # positional last row of the group (unlike 'last', this keeps a NaN)
        agg_dict[target] = (target, lambda x: x.iloc[-1])

    # apply the aggregation
    grouped = df.groupby('TimeSeriesLabel').agg(**agg_dict)
    return grouped.reset_index(drop=True)


In [6]:
# Collapse each split to one row per TimeSeriesLabel, averaging the target.
train_agg, val_agg, test_agg = (
    aggregate_sequences(split_df, agg_target='mean')
    for split_df in (train, val, test)
)

In [7]:
# 5. Separate the predictors from the 'Yield' target for every split
X_train, y_train = train_agg.drop(columns='Yield'), train_agg['Yield']
X_val,   y_val   = val_agg.drop(columns='Yield'),   val_agg['Yield']
X_test,  y_test  = test_agg.drop(columns='Yield'),  test_agg['Yield']

In [8]:
import numpy as np
from tabdpt import TabDPTRegressor
from sklearn.metrics import r2_score, mean_squared_error   # <-- use this call

# 1 . sample up to 10 k rows for quick training.
#     DataFrame.sample raises ValueError when n exceeds the number of rows,
#     so cap n at the size of the aggregated training set.
n_sub = min(10_000, len(train_agg))
train_sub = train_agg.sample(n=n_sub, random_state=42)
X_sub = train_sub.drop('Yield', axis=1).to_numpy()
y_sub = train_sub['Yield'].to_numpy()

# 2 . build the model
model = TabDPTRegressor(
    device="cpu"
)

# 3 . fit
model.fit(X_sub, y_sub)

# 4 . evaluate (keep context_size ≤ 8192 so SDPA never overflows)
for name, X_np, y_np in [
    ("Val",  X_val.to_numpy(),  y_val.to_numpy()),
    ("Test", X_test.to_numpy(), y_test.to_numpy())
]:
    preds = model.predict(X_np, context_size=4096, n_ensembles=1)
    # Root-MSE computed explicitly: the `squared=` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, while sqrt(MSE) works on every version.
    rmse  = float(np.sqrt(mean_squared_error(y_np, preds)))
    print(f"{name} R²   : {r2_score(y_np, preds):.4f}")
    print(f"{name} RMSE : {rmse:.4f}")


InductorError: RuntimeError: Compiler: cl is not found.

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


In [9]:
# %%
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
import torch
import gc

# %%
# 1. Load the pre-split train / validation / test CSV files, in that order
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'val.csv', 'test.csv')
)

# %%
# 1. Derive every imputation value from the TRAIN split only
mean_cols   = ['Lodging', 'PlantHeight', 'SeedSize', 'Protein', 'Oil']
mg_mode     = train['MG'].mode()[0]         # most frequent MG value
lon_median  = train['Longitude'].median()   # median longitude
mean_values = train[mean_cols].mean()       # per-column means of the plant traits

# 2. Apply the same train-derived values to train, val and test
fill_values = {'MG': mg_mode, 'Longitude': lon_median}
fill_values.update({col: mean_values[col] for col in mean_cols})

for df in (train, val, test):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# 3. Define your features
# %%
# Column groups consumed by aggregate_sequences(); they must be defined at
# module level before the function is called.
temporal_feats = ['MaxTemp','MinTemp','AvgTemp','AvgHumidity','Precipitation','Radiation']
static_feats   = ['Latitude','Longitude','Row.Spacing']
plant_feats    = ['Lodging','PlantHeight','SeedSize','Protein','Oil']
cluster_feats  = [f'Cluster_{i}' for i in range(40)]

def _first_mode(values):
    """Return the first mode of `values`, or NaN when every entry is missing."""
    modes = values.mode()
    # Series.mode() drops NaN, so an all-missing group yields an empty result;
    # indexing it with .iloc[0] would raise IndexError.
    return modes.iloc[0] if len(modes) else np.nan

def aggregate_sequences(df, target='Yield', agg_target='mean'):
    """Collapse every time series in `df` into a single feature row.

    Rows are grouped by the 'TimeSeriesLabel' column and aggregated as follows:
      * temporal_feats -> '<feat>_mean' and '<feat>_std'
      * static_feats   -> first value of the group
      * 'MG'           -> first mode of the group (NaN if all values are missing)
      * plant_feats    -> first value of the group
      * cluster_feats  -> '<feat>_mean' and '<feat>_std'
      * target         -> group mean ('mean') or last row of the group ('final')

    Note: 'std' is pandas' sample standard deviation, so a sequence that has
    only one row gets NaN in its '*_std' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame containing 'TimeSeriesLabel', 'MG', `target` and all
        columns named in the module-level feature lists.
    target : str, default 'Yield'
        Name of the target column.
    agg_target : {'mean', 'final'}, default 'mean'
        How the target is reduced within each sequence.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'TimeSeriesLabel' (in sorted label order), with a
        fresh 0..n-1 index; the label itself is not kept as a column.

    Raises
    ------
    ValueError
        If `agg_target` is neither 'mean' nor 'final'.
    """
    # Validate up front so a typo fails before any aggregation work is set up.
    if agg_target not in ('mean', 'final'):
        raise ValueError("agg_target must be 'mean' or 'final'")
    agg_dict = {}
    # 1. temporal: mean & std
    for feat in temporal_feats:
        agg_dict[f'{feat}_mean'] = (feat, 'mean')
        agg_dict[f'{feat}_std']  = (feat, 'std')
    # 2. static geography: take first (constant per sequence)
    for feat in static_feats:
        agg_dict[feat] = (feat, 'first')
    # 3. plant features:
    #    - MG (categorical) -> mode, tolerant of all-missing groups
    agg_dict['MG'] = ('MG', _first_mode)
    #    - Lodging, PlantHeight, SeedSize, Protein, Oil -> first
    for feat in plant_feats:
        agg_dict[feat] = (feat, 'first')
    # 4. cluster indicators: proportion of time in each cluster + variability
    for feat in cluster_feats:
        agg_dict[f'{feat}_mean'] = (feat, 'mean')
        agg_dict[f'{feat}_std']  = (feat, 'std')
    # 5. target: mean or final
    if agg_target == 'mean':
        agg_dict[target] = (target, 'mean')
    else:
        # positional last row of the group (unlike 'last', this keeps a NaN)
        agg_dict[target] = (target, lambda x: x.iloc[-1])
    # apply the aggregation
    grouped = df.groupby('TimeSeriesLabel').agg(**agg_dict)
    return grouped.reset_index(drop=True)

# %%
# Collapse each split to one row per TimeSeriesLabel, averaging the target.
train_agg, val_agg, test_agg = (
    aggregate_sequences(split_df, agg_target='mean')
    for split_df in (train, val, test)
)

# %%
# 5. Separate the predictors from the 'Yield' target for every split
X_train, y_train = train_agg.drop(columns='Yield'), train_agg['Yield']
X_val,   y_val   = val_agg.drop(columns='Yield'),   val_agg['Yield']
X_test,  y_test  = test_agg.drop(columns='Yield'),  test_agg['Yield']

In [13]:
# %%
# SOLUTION: switch off PyTorch compilation through environment variables
import os
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Flags meant to disable all compilation attempts.
# NOTE(review): confirm that the installed PyTorch actually reads each of
# these names; variables it does not know are silently ignored. They can only
# affect modules that read them after this point.
os.environ.update({
    'TORCH_COMPILE_DISABLE': '1',
    'TORCHINDUCTOR_DISABLE': '1',
    'TORCH_CUDNN_V8_API_DISABLED': '1',
    'PYTORCH_DISABLE_LIBRARY': '1',
})

# %%
import torch
import torch._dynamo
import torch._inductor

# Disable dynamo and inductor completely
torch._dynamo.config.disable = True
torch._dynamo.config.suppress_errors = True
torch._inductor.config.disable_progress = True
torch._inductor.config.triton.cudagraphs = False

# Force CPU backend
torch.set_default_tensor_type(torch.FloatTensor)

# %%
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
import gc

# Import TabDPT AFTER disabling compilation
from tabdpt import TabDPTRegressor

# %%
# Clear any GPU memory and force CPU
# Run a garbage-collection pass first so unreferenced objects are released.
gc.collect()
if torch.cuda.is_available():
    # Ask PyTorch to release its cached, currently unused CUDA memory.
    torch.cuda.empty_cache()
    # Explicitly disable CUDA
    # NOTE(review): this rebinds the module attribute `torch.cuda.is_available`
    # to a stub that always returns False, for the rest of the process. It only
    # changes what callers of that function see; re-running this cell then
    # skips the whole branch. Confirm that the libraries used afterwards
    # consult this function and cope with the patched value.
    torch.cuda.is_available = lambda: False

# %%
print("Loading data and preparing for CPU-only execution...")

# Expected to exist already (built by the earlier cells):
# train_agg, val_agg, test_agg, X_train, y_train, X_val, y_val, X_test, y_test

# %%
# Draw a small, reproducible training subset (at most 2000 aggregated rows)
n_samples = min(len(train_agg), 2000)
print(f"Using {n_samples} samples for training")

train_sub = train_agg.sample(n=n_samples, random_state=42)
# Features and target as float32 NumPy arrays
X_sub = train_sub.drop(columns='Yield').to_numpy().astype(np.float32)
y_sub = train_sub['Yield'].to_numpy().astype(np.float32)

# %%
# Build the TabDPT regressor, requesting the CPU device
print("Initializing TabDPT model on CPU...")
model = TabDPTRegressor(device="cpu")

# If the wrapper exposes an inner `model`, move any CUDA parameter to the CPU
if hasattr(model, 'model'):
    for param in model.model.parameters():
        if not param.is_cuda:
            continue
        param.data = param.data.cpu()

# %%
# Fit on the subset; on any failure retry once with the first 500 rows
print("Training model...")
try:
    model.fit(X_sub, y_sub)
except Exception as e:
    print(f"Training error: {e}")
    print("\nTrying with even smaller dataset...")
    # Second attempt: first 500 rows of the sampled subset
    X_mini = X_sub[:500]
    y_mini = y_sub[:500]
    model.fit(X_mini, y_mini)
    print("Model trained on reduced dataset")
else:
    print("Model training completed successfully!")

# %%
# Define safe prediction function
def safe_predict(model, X_data, batch_size=50,
                 context_sizes=(64, 32, 16, 8, 4, 1),
                 fill_value=0, max_failed_batches=None):
    """Predict in small batches, retrying each batch with smaller context sizes.

    Every batch is passed to ``model.predict(batch, context_size=..., n_ensembles=1)``.
    When a call raises, the next (smaller) entry of `context_sizes` is tried.
    When every entry fails, the batch is filled with `fill_value` so the
    output stays aligned with `X_data`.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch, context_size=..., n_ensembles=...)``.
    X_data : numpy.ndarray or other sliceable container
        Rows to predict on. NumPy batches are cast to float32 first.
    batch_size : int, default 50
        Number of rows per predict call; must be positive.
    context_sizes : sequence of int, default (64, 32, 16, 8, 4, 1)
        Context sizes to try, in order. Each one is capped at the batch length.
        NOTE(review): whether capping the context at the query batch length
        suits TabDPT should be confirmed against its docs; kept as before.
    fill_value : scalar, default 0
        Placeholder written for rows of batches that could not be predicted.
    max_failed_batches : int or None, default None
        If given, raise RuntimeError as soon as more than this many batches
        have failed completely, instead of continuing to emit placeholders.
        None keeps the original "never give up" behaviour.

    Returns
    -------
    numpy.ndarray
        1-D float32 array with one entry per row of `X_data`.

    Raises
    ------
    ValueError
        If `batch_size` is not positive or `context_sizes` is empty.
    RuntimeError
        If `max_failed_batches` is set and exceeded.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    context_sizes = tuple(context_sizes)
    if not context_sizes:
        raise ValueError("context_sizes must contain at least one value")

    n_samples = len(X_data)
    predictions = []
    failed_batches = 0

    for start in range(0, n_samples, batch_size):
        batch = X_data[start:start + batch_size]
        # Cast once per batch instead of on every retry.
        if isinstance(batch, np.ndarray):
            batch = batch.astype(np.float32)

        last_error = None
        for context_size in context_sizes:
            try:
                preds = model.predict(
                    batch,
                    context_size=min(context_size, len(batch)),
                    n_ensembles=1
                )
                # Flatten so (n, 1)-shaped outputs stay aligned with placeholders.
                predictions.extend(np.ravel(preds))
                break  # Success, move to next batch
            except Exception as e:
                last_error = e
        else:
            # No context size worked for this batch.
            failed_batches += 1
            print(f"Batch {start//batch_size} failed even with context_size={context_sizes[-1]}: {str(last_error)[:100]}")
            if max_failed_batches is not None and failed_batches > max_failed_batches:
                raise RuntimeError(
                    f"{failed_batches} batches failed; aborting prediction"
                ) from last_error
            predictions.extend([fill_value] * len(batch))  # Placeholder predictions

    return np.array(predictions, dtype=np.float32)

# %%
# Evaluate on validation and test sets
print("\nEvaluating model...")

for name, X_eval, y_eval in [("Validation", X_val, y_val), ("Test", X_test, y_test)]:
    print(f"\n{name} Set Evaluation:")

    # Convert to numpy and ensure float32
    X_np = X_eval.to_numpy().astype(np.float32)
    y_np = y_eval.to_numpy().astype(np.float32)

    # Get predictions
    print(f"Predicting on {len(X_np)} samples...")
    predictions = safe_predict(model, X_np, batch_size=25)

    # Calculate metrics only for successful predictions.
    # safe_predict writes 0 for rows it could not predict, so zeros are
    # treated as failures here. Caveat: a genuine prediction of exactly 0 is
    # dropped as well, and the metrics describe only the remaining subset.
    valid_mask = predictions != 0
    if valid_mask.any():
        valid_preds = predictions[valid_mask]
        valid_y = y_np[valid_mask]

        # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = float(np.sqrt(mean_squared_error(valid_y, valid_preds)))
        r2 = r2_score(valid_y, valid_preds)

        print(f"Successfully predicted {valid_mask.sum()}/{len(y_np)} samples")
        print(f"{name} R²   : {r2:.4f}")
        print(f"{name} RMSE : {rmse:.4f}")
    else:
        print(f"Failed to get valid predictions for {name} set")

# %%
# Alternative: if TabDPT still fails, a Random Forest gives a PyTorch-free baseline
print("\n" + "="*60)
print("If TabDPT continues to fail, here's a simple baseline:")
print("="*60)

from sklearn.ensemble import RandomForestRegressor

# Train a simple Random Forest on the same sampled subset (NumPy arrays)
rf_model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42, n_jobs=1)
rf_model.fit(X_sub, y_sub)

# Evaluate RF baseline
for name, X_eval, y_eval in [("Validation", X_val, y_val), ("Test", X_test, y_test)]:
    # The forest was fitted on a plain array, so predict on a plain array too;
    # passing the DataFrame would trigger scikit-learn's feature-name warning.
    preds = rf_model.predict(X_eval.to_numpy())
    # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y_eval, preds)))
    r2 = r2_score(y_eval, preds)
    print(f"\nRandom Forest {name} - R²: {r2:.4f}, RMSE: {rmse:.4f}")

# %%
# Human-readable recap of the compiler error and the workarounds tried above.
# Item 1 previously blamed "custom CUDA kernels", but the model is created
# with device="cpu" and the failure is an Inductor compiler error, so the
# text now names the compilation step that actually needs cl.exe.
summary_lines = [
    "\n" + "="*60,
    "TROUBLESHOOTING SUMMARY",
    "="*60,
    "The 'cl is not found' error occurs because:",
    "1. PyTorch (torch.compile / Inductor) is trying to compile generated C++ kernels",
    "2. Microsoft Visual C++ compiler (cl.exe) is not installed or not on PATH",
    "3. This is a Windows-specific issue",
    "\nThis script forces CPU-only execution and disables compilation.",
    "\nIf you still get errors:",
    "1. Restart your Python kernel/interpreter",
    "2. Make sure you run the environment setup BEFORE importing torch",
    "3. Consider using WSL2 or Linux where these issues don't occur",
    "4. Or use the Random Forest baseline, which does not depend on PyTorch",
    "="*60,
]
for summary_line in summary_lines:
    print(summary_line)

Loading data and preparing for CPU-only execution...
Using 2000 samples for training
Initializing TabDPT model on CPU...
Training model...
Model training completed successfully!

Evaluating model...

Validation Set Evaluation:
Predicting on 10763 samples...
Batch 0 failed even with context_size=1: FlashAttention requires CUDA support
Batch 1 failed even with context_size=1: FlashAttention requires CUDA support
Batch 2 failed even with context_size=1: FlashAttention requires CUDA support
Batch 3 failed even with context_size=1: FlashAttention requires CUDA support
Batch 4 failed even with context_size=1: FlashAttention requires CUDA support
Batch 5 failed even with context_size=1: FlashAttention requires CUDA support
Batch 6 failed even with context_size=1: FlashAttention requires CUDA support
Batch 7 failed even with context_size=1: FlashAttention requires CUDA support
Batch 8 failed even with context_size=1: FlashAttention requires CUDA support
Batch 9 failed even with context_size=1:

KeyboardInterrupt: 