# Data Loading

In [None]:
!pip install tab_transformer_pytorch

Collecting tab_transformer_pytorch
  Downloading tab_transformer_pytorch-0.4.2-py3-none-any.whl.metadata (914 bytes)
Collecting hyper-connections>=0.1.15 (from tab_transformer_pytorch)
  Downloading hyper_connections-0.1.15-py3-none-any.whl.metadata (5.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.3->tab_transformer_pytorch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.3->tab_transformer_pytorch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.3->tab_transformer_pytorch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.3->tab_transformer_pytorch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collect

In [None]:
!pip install hyper-connections



In [None]:

import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler





In [None]:
# 1. Data loading
# Submission template. low_memory=False is passed so column types are
# inferred from the whole file (guards against mixed-type columns).
template = pd.read_csv(
    '1_Submission_Template_2024 (2).csv',
    low_memory=False,
)

In [None]:
# Competition test set, read with the same mixed-type-safe setting.
comp_testing = pd.read_csv(
    'testing_data (1).csv',
    low_memory=False,
)

In [None]:
# Full training table (features plus the Yield_Mg_ha target used below).
training_all = pd.read_csv(
    'training_data (1).csv',
    low_memory=False,
)

In [None]:
# Keep only the first 2000 rows of training_all as a small working subset
# (positional slice; no shuffling or sampling is applied).
small_training = training_all.iloc[:2000]

In [None]:
# 2. Data Preprocessing
# Builds the model inputs from `training_all`:
#   categorical_data - int64 label codes, one column per object-typed feature
#   numerical_data   - float32 standardised numeric features
#   label_encoders / scaler - the fitted transformers, kept for later reuse
# NOTE(review): the imputation means, the label encoders and the scaler are
# fitted on every labelled row, i.e. before the train/test split that happens
# later in the notebook, so test-row statistics influence the training inputs.
target_col = "Yield_Mg_ha"

# .copy() makes `df` an independent frame, so the in-place imputation below
# cannot trigger SettingWithCopyWarning or write through to `training_all`.
df = training_all.dropna(subset=[target_col]).copy()

# Separate features by type. The target is dropped first so it can never end
# up among the features, whatever dtype it was parsed as (the previous
# list.remove() raised ValueError when the target was not numeric).
feature_frame = df.drop(columns=[target_col])
categorical_cols = feature_frame.select_dtypes(include=['object']).columns.tolist()
numerical_cols = feature_frame.select_dtypes(include=['number']).columns.tolist()

# Print column info for debugging
print("Number of categorical columns:", len(categorical_cols))
print("Number of numerical columns:", len(numerical_cols))

# Remove any columns with all NaN values (their mean is NaN, so they could
# not be imputed below)
numerical_cols = [col for col in numerical_cols if not df[col].isna().all()]
print("Number of numerical columns after removing all-NaN columns:", len(numerical_cols))

# Handle remaining NaN values in numerical columns with the column mean
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Initialize encoders
label_encoders = {}
scaler = StandardScaler()

# Process categorical columns with LabelEncoder. Values are cast to str first,
# so missing entries become the ordinary category 'nan'.
categorical_data = np.zeros((len(df), len(categorical_cols)), dtype=np.int64)
for i, col in enumerate(categorical_cols):
    label_encoders[col] = LabelEncoder()
    categorical_data[:, i] = label_encoders[col].fit_transform(df[col].astype(str).values)

# Process numerical columns with StandardScaler
numerical_data = scaler.fit_transform(df[numerical_cols].values).astype(np.float32)

# Define num_continuous before using it
num_continuous = len(numerical_cols)
print("Number of continuous features:", num_continuous)


Number of categorical columns: 3
Number of numerical columns: 658
Number of numerical columns after removing all-NaN columns: 658
Number of continuous features: 658


In [None]:
# 2. Data Preprocessing
# NOTE(review): this cell repeats the preprocessing cell directly above it.
# It rebuilds everything from `training_all`, so running it again is harmless,
# but one of the two copies can be deleted.
#
# Outputs:
#   categorical_data - int64 label codes, one column per object-typed feature
#   numerical_data   - float32 standardised numeric features
#   label_encoders / scaler - the fitted transformers, kept for later reuse
target_col = "Yield_Mg_ha"

# .copy() makes `df` an independent frame, so the in-place imputation below
# cannot trigger SettingWithCopyWarning or write through to `training_all`.
df = training_all.dropna(subset=[target_col]).copy()

# Separate features by type. The target is dropped first so it can never end
# up among the features, whatever dtype it was parsed as (the previous
# list.remove() raised ValueError when the target was not numeric).
feature_frame = df.drop(columns=[target_col])
categorical_cols = feature_frame.select_dtypes(include=['object']).columns.tolist()
numerical_cols = feature_frame.select_dtypes(include=['number']).columns.tolist()

# Print column info for debugging
print("Number of categorical columns:", len(categorical_cols))
print("Number of numerical columns:", len(numerical_cols))

# Remove any columns with all NaN values (their mean is NaN, so they could
# not be imputed below)
numerical_cols = [col for col in numerical_cols if not df[col].isna().all()]
print("Number of numerical columns after removing all-NaN columns:", len(numerical_cols))

# Handle remaining NaN values in numerical columns with the column mean
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Initialize encoders
label_encoders = {}
scaler = StandardScaler()

# Process categorical columns with LabelEncoder. Values are cast to str first,
# so missing entries become the ordinary category 'nan'.
categorical_data = np.zeros((len(df), len(categorical_cols)), dtype=np.int64)
for i, col in enumerate(categorical_cols):
    label_encoders[col] = LabelEncoder()
    categorical_data[:, i] = label_encoders[col].fit_transform(df[col].astype(str).values)

# Process numerical columns with StandardScaler
numerical_data = scaler.fit_transform(df[numerical_cols].values).astype(np.float32)

# Define num_continuous before using it
num_continuous = len(numerical_cols)
print("Number of continuous features:", num_continuous)


Number of categorical columns: 3
Number of numerical columns: 658
Number of numerical columns after removing all-NaN columns: 658
Number of continuous features: 658


In [None]:
# Standardise the target; the fitted scaler is kept so predictions can be
# mapped back to the original units later.
target_scaler = StandardScaler()
y = target_scaler.fit_transform(
    df[target_col].values.reshape(-1, 1)
).astype(np.float32)

# Debug summary: one stanza per array, each listing two reductions followed
# by a NaN check and the shape.
_summaries = (
    ("\nNumerical data stats:", numerical_data, (("Mean:", np.mean), ("Std:", np.std))),
    ("\nCategorical data stats:", categorical_data, (("Min:", np.min), ("Max:", np.max))),
    ("\nTarget stats:", y, (("Mean:", np.mean), ("Std:", np.std))),
)
for _title, _array, _reducers in _summaries:
    print(_title)
    for _label, _reduce in _reducers:
        print(_label, _reduce(_array))
    print("Any NaN:", np.isnan(_array).any())
    print("Shape:", _array.shape)



Numerical data stats:
Mean: 1.0168962e-09
Std: 1.0000013
Any NaN: False
Shape: (164921, 658)

Categorical data stats:
Min: 0
Max: 5026
Any NaN: False
Shape: (164921, 3)

Target stats:
Mean: 5.1812212e-09
Std: 1.0
Any NaN: False
Shape: (164921, 1)


# Transformer

In [None]:
# 3. Train-Test Split
# The three arrays are split together so their rows stay aligned.
cat_train, cat_test, num_train, num_test, y_train, y_test = train_test_split(
    categorical_data, numerical_data, y,
    test_size=0.2, random_state=42
)

# 4. Continuous mean and std, shape (num_continuous, 2): column 0 = mean,
# column 1 = std. The numeric features were already standardised by
# StandardScaler, so 0 / 1 is supplied for every feature.
cont_mean_std = torch.stack(
    [torch.zeros(num_continuous), torch.ones(num_continuous)],
    dim=1,
)

# 5. Model Initialization with smaller architecture
# One entry per categorical column: the number of distinct label codes.
categories = tuple(len(label_encoders[col].classes_) for col in categorical_cols)

# Seed torch so the initial weights are reproducible, matching the fixed
# random_state used for the split above.
torch.manual_seed(42)

model = TabTransformer(
    categories=categories,
    num_continuous=num_continuous,
    dim=16,
    dim_out=1,
    depth=3,
    heads=4,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(2, 2),
    mlp_act=nn.ReLU(),
    continuous_mean_std=cont_mean_std
)



In [None]:
# 6. Training Loop with gradient clipping
from sklearn.metrics import r2_score

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)  # reduced learning rate
criterion = nn.MSELoss()

# Convert data to tensors
cat_train_tensor = torch.tensor(cat_train, dtype=torch.long)
num_train_tensor = torch.tensor(num_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)

# Training configuration
epochs = 10
batch_size = 64  # increased batch size
n_samples = len(cat_train)
max_grad_norm = 1.0  # for gradient clipping
log_every = 500  # print a batch loss every `log_every` batches to keep output short
shuffle_rng = torch.Generator().manual_seed(42)  # reproducible per-epoch shuffling

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    valid_batches = 0

    # Re-shuffle every epoch so mini-batches are not composed identically
    # (and visited in the same order) on each pass over the data.
    order = torch.randperm(n_samples, generator=shuffle_rng)

    # Mini-batch training; `i` is the batch's start offset in the shuffled order
    for batch_idx, i in enumerate(range(0, n_samples, batch_size)):
        rows = order[i:i + batch_size]
        batch_cat = cat_train_tensor[rows]
        batch_num = num_train_tensor[rows]
        batch_y = y_train_tensor[rows]

        optimizer.zero_grad()
        outputs = model(batch_cat, batch_num)

        # Skip (but report) batches whose forward pass produced NaN
        if torch.isnan(outputs).any():
            print(f"NaN detected in outputs at batch starting with index {i}")
            continue

        loss = criterion(outputs, batch_y)

        # Skip (but report) batches whose loss is NaN
        if torch.isnan(loss):
            print(f"NaN detected in loss at batch starting with index {i}")
            continue

        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

        total_loss += loss.item()
        valid_batches += 1

        # Print batch loss for monitoring
        if batch_idx % log_every == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Batch [{batch_idx}], Loss: {loss.item():.4f}')

    if valid_batches > 0:
        avg_loss = total_loss / valid_batches
        print(f'Epoch [{epoch+1}/{epochs}], Average Loss: {avg_loss:.4f}')
    else:
        print(f'Epoch [{epoch+1}/{epochs}]: No valid batches!')

# 7. Evaluation
model.eval()
eval_batch_size = 4096  # bounds peak memory during test-set inference
with torch.no_grad():
    cat_test_tensor = torch.tensor(cat_test, dtype=torch.long)
    num_test_tensor = torch.tensor(num_test, dtype=torch.float32)
    y_pred = torch.cat([
        model(
            cat_test_tensor[start:start + eval_batch_size],
            num_test_tensor[start:start + eval_batch_size],
        )
        for start in range(0, len(cat_test_tensor), eval_batch_size)
    ]).numpy()

# Inverse transform predictions and actual values back to the target's units
y_pred_original = target_scaler.inverse_transform(y_pred)
y_test_original = target_scaler.inverse_transform(y_test.reshape(-1, 1))

rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
print(f"Test RMSE: {rmse:.4f}")

# Calculate R-squared
r2 = r2_score(y_test_original, y_pred_original)
print(f"Test R-squared: {r2:.4f}")

# Calculate Correlation
correlation = np.corrcoef(y_test_original.ravel(), y_pred_original.ravel())[0, 1]
print(f"Test Correlation: {correlation:.4f}")

Epoch [1/10], Batch [0], Loss: 0.4559
Epoch [1/10], Batch [5], Loss: 0.4824
Epoch [1/10], Batch [10], Loss: 0.4185
Epoch [1/10], Batch [15], Loss: 0.4884
Epoch [1/10], Batch [20], Loss: 0.4272
Epoch [1/10], Batch [25], Loss: 0.5522
Epoch [1/10], Batch [30], Loss: 0.4616
Epoch [1/10], Batch [35], Loss: 0.6066
Epoch [1/10], Batch [40], Loss: 0.2928
Epoch [1/10], Batch [45], Loss: 0.4129
Epoch [1/10], Batch [50], Loss: 0.5504
Epoch [1/10], Batch [55], Loss: 0.5235
Epoch [1/10], Batch [60], Loss: 0.4905
Epoch [1/10], Batch [65], Loss: 0.4739
Epoch [1/10], Batch [70], Loss: 0.5149
Epoch [1/10], Batch [75], Loss: 0.5726
Epoch [1/10], Batch [80], Loss: 0.4694
Epoch [1/10], Batch [85], Loss: 0.5072
Epoch [1/10], Batch [90], Loss: 0.5936
Epoch [1/10], Batch [95], Loss: 0.6558
Epoch [1/10], Batch [100], Loss: 0.2611
Epoch [1/10], Batch [105], Loss: 0.3685
Epoch [1/10], Batch [110], Loss: 0.4372
Epoch [1/10], Batch [115], Loss: 0.6182
Epoch [1/10], Batch [120], Loss: 0.4433
Epoch [1/10], Batch [1

# TabNet

In [None]:
pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

# TabNet regression on the small_training subset.
# TabNet consumes purely numeric, NaN-free arrays, and the categorical column
# indices/cardinalities belong to the TabNetRegressor constructor (fit() does
# not accept cat_idxs / cat_dims).
# NOTE(review): X, y, X_train, y_train, ... are re-bound here and shadow any
# same-named variables created by earlier cells.
target_column = 'Yield_Mg_ha'

# 1. Separate features (X) and target (y); rows without a target cannot be used
labelled_rows = small_training.dropna(subset=[target_column])
X = labelled_rows.drop(columns=[target_column]).copy()
y = labelled_rows[target_column]

# 2. Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['number']).columns.tolist()

# 3. Integer-encode categorical columns before splitting, so every category
#    seen in train/valid/test has a code below the declared cardinality.
#    Missing values become the ordinary category 'nan' via astype(str).
tabnet_encoders = {}
for col in categorical_features:
    tabnet_encoders[col] = LabelEncoder()
    X[col] = tabnet_encoders[col].fit_transform(X[col].astype(str))

# 4. Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Further split train set into train and validation (80% train, 20% validation)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 6. Impute numeric NaNs with training-set means (0.0 for columns that are
#    entirely missing in the training rows)
numeric_fill = X_train[numerical_features].mean().fillna(0.0)
X_train = X_train.fillna(numeric_fill)
X_valid = X_valid.fillna(numeric_fill)
X_test = X_test.fillna(numeric_fill)

# 7. TabNet model initialization and training
cat_idxs = [X.columns.get_loc(col) for col in categorical_features]
cat_dims = [len(tabnet_encoders[col].classes_) for col in categorical_features]
clf = TabNetRegressor(cat_idxs=cat_idxs, cat_dims=cat_dims)

clf.fit(
    X_train=X_train.to_numpy(dtype=np.float32),
    y_train=y_train.to_numpy(dtype=np.float32).reshape(-1, 1),  # Reshape to (n_samples, 1)
    eval_set=[(
        X_valid.to_numpy(dtype=np.float32),
        y_valid.to_numpy(dtype=np.float32).reshape(-1, 1),
    )],
    eval_metric=['rmse'],  # Use RMSE for evaluation
)

# 8. Prediction
preds = clf.predict(X_test.to_numpy(dtype=np.float32))

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor

# Default-hyperparameter TabNet baseline on the split produced by the cell above.
# Yield is a continuous target, so a regressor is used (not TabNetClassifier),
# and the target variable is `y_train` (`Y_train` is not defined in this notebook).
# TabNet needs numeric, NaN-free numpy arrays, so only the columns that are
# numeric at this point are used and gaps are filled with training-set means.
baseline_cols = X_train.select_dtypes(include=['number']).columns
baseline_fill = X_train[baseline_cols].mean().fillna(0.0)


def _tabnet_inputs(features, target):
    """Return (float32 feature matrix, float32 (n, 1) target) without NaN-target rows."""
    target_2d = np.asarray(target, dtype=np.float32).reshape(-1, 1)
    keep = np.isfinite(target_2d).ravel()
    matrix = features[baseline_cols].fillna(baseline_fill).to_numpy(dtype=np.float32)
    return matrix[keep], target_2d[keep]


train_features, train_target = _tabnet_inputs(X_train, y_train)
valid_features, valid_target = _tabnet_inputs(X_valid, y_valid)

clf = TabNetRegressor()
clf.fit(
  train_features, train_target,
  eval_set=[(valid_features, valid_target)]
)
preds = clf.predict(
    X_test[baseline_cols].fillna(baseline_fill).to_numpy(dtype=np.float32)
)



NameError: name 'X_train' is not defined