## I have a tabular dataset, I found XGBOOST gets very high R2 score, Im training neural network to behave like it so that we get high R2

In [1]:
# !pip install skorch

Collecting skorch
  Downloading skorch-1.1.0-py3-none-any.whl.metadata (11 kB)
Downloading skorch-1.1.0-py3-none-any.whl (228 kB)
Installing collected packages: skorch
Successfully installed skorch-1.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [18]:
# !pip install skorch
import torch
from torch import nn
from torch.optim import Adam
from skorch import NeuralNetRegressor
from skorch.callbacks import EarlyStopping
from skorch.dataset import ValidSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score

class ParallelActivationLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        
        # Linear transformations for each activation function
        self.linear_tanh = nn.Linear(in_features, out_features)
        self.linear_leaky_relu = nn.Linear(in_features, out_features)
        self.linear_linear = nn.Linear(in_features, out_features)
        
        # Weights for combining activations (learnable)
        self.activation_weights = nn.Parameter(torch.ones(out_features, 3) / 3)
        
        # Activation functions
        self.tanh = nn.Tanh()
        self.leaky_relu = nn.LeakyReLU(0.01)
        
    def forward(self, x):
        # Apply each linear transformation and activation
        out_tanh = self.tanh(self.linear_tanh(x))
        out_leaky_relu = self.leaky_relu(self.linear_leaky_relu(x))
        out_linear = self.linear_linear(x)
        
        # Stack the outputs
        activations = torch.stack([out_tanh, out_leaky_relu, out_linear], dim=-1)
        
        # Apply softmax to weights to ensure they sum to 1
        weights = torch.softmax(self.activation_weights, dim=-1)
        
        # Weighted combination
        output = torch.sum(activations * weights.unsqueeze(0), dim=-1)
        
        return output

class MyModule(nn.Module):
    def __init__(self, num_units=128):
        super().__init__()
        
        # Use parallel activation layers
        self.layer1 = ParallelActivationLayer(8, num_units)
        self.bn1 = nn.BatchNorm1d(num_units)
        self.layer2 = ParallelActivationLayer(num_units, num_units//2)
        self.bn2 = nn.BatchNorm1d(num_units//2)
        self.layer3 = ParallelActivationLayer(num_units//2, num_units//4)
        self.bn3 = nn.BatchNorm1d(num_units//4)
        self.output = nn.Linear(num_units//4, 1)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, X, **kwargs):
        X = self.dropout(self.bn1(self.layer1(X)))
        X = self.dropout(self.bn2(self.layer2(X)))
        X = self.dropout(self.bn3(self.layer3(X)))
        X = self.output(X)
        return X

X, y = fetch_california_housing(return_X_y=True)
X = X.astype('float32')
y = y.astype('float32').reshape(-1, 1)

net = NeuralNetRegressor(
    MyModule,
    max_epochs=20,
    lr=0.01,
    optimizer=Adam,
    optimizer__weight_decay=1e-4,
    batch_size=64,
    iterator_train__shuffle=True,
    train_split=ValidSplit(cv=0.2),
    callbacks=[EarlyStopping(patience=20)],
    device='mps',
    verbose=1,
)

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('net', net),
])

pipe.fit(X, y)
y_pred = pipe.predict(X)
r2 = r2_score(y, y_pred)
print(f"RÂ² Score: {r2}")

# Analyze activation function usage
def analyze_activation_weights(model):
    print("\nActivation Function Analysis:")
    print("-" * 40)
    
    total_tanh = 0
    total_leaky_relu = 0
    total_linear = 0
    total_neurons = 0
    
    for name, module in model.named_modules():
        if isinstance(module, ParallelActivationLayer):
            weights = torch.softmax(module.activation_weights, dim=-1)
            layer_tanh = weights[:, 0].mean().item()
            layer_leaky_relu = weights[:, 1].mean().item()
            layer_linear = weights[:, 2].mean().item()
            
            print(f"{name}:")
            print(f"  Tanh: {layer_tanh:.2%}")
            print(f"  LeakyReLU: {layer_leaky_relu:.2%}")
            print(f"  Linear: {layer_linear:.2%}")
            
            num_neurons = weights.shape[0]
            total_tanh += layer_tanh * num_neurons
            total_leaky_relu += layer_leaky_relu * num_neurons
            total_linear += layer_linear * num_neurons
            total_neurons += num_neurons
    
    if total_neurons > 0:
        print(f"\nOverall Average:")
        print(f"  Tanh: {total_tanh/total_neurons:.2%}")
        print(f"  LeakyReLU: {total_leaky_relu/total_neurons:.2%}")
        print(f"  Linear: {total_linear/total_neurons:.2%}")

# Analyze the trained model
analyze_activation_weights(net.module_)

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m0.8339[0m        [32m0.3810[0m  4.0089
      2        [36m0.4608[0m        [32m0.3693[0m  3.3376
      3        [36m0.4332[0m        0.6266  3.2480
      4        [36m0.4124[0m        0.3706  3.1778
      5        [36m0.4079[0m        0.3810  3.1768
      6        [36m0.3936[0m        0.3823  3.1335
      7        [36m0.3777[0m        0.3888  3.1820
      8        0.3797        [32m0.3044[0m  3.1744
      9        0.3780        0.3234  3.2151
     10        [36m0.3770[0m        0.3322  3.1595
     11        [36m0.3716[0m        0.3169  3.4444
     12        [36m0.3694[0m        0.3164  3.4831
     13        [36m0.3679[0m        [32m0.2850[0m  3.4621
     14        0.3856        0.2975  3.1952
     15        [36m0.3663[0m        0.3942  3.2034
     16        [36m0.3633[0m        0.3589  3.1812
     17        [36m0.3600[0m        0.3355  3.2748
  

In [21]:
import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd

# Load data
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Basic XGBoost
print("Training Basic XGBoost...")
xgb_basic = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_basic.fit(X_train, y_train)
y_pred_basic = xgb_basic.predict(X_test)
r2_basic = r2_score(y_test, y_pred_basic)
print(f"Basic XGBoost RÂ² Score: {r2_basic:.6f}")

# Hyperparameter tuning with RandomizedSearchCV
print("\nHyperparameter tuning...")
param_dist = {
    'n_estimators': [500, 1000, 1500, 2000],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 0.5, 1.0, 2.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.5]
}

xgb_random = xgb.XGBRegressor(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(
    xgb_random, 
    param_dist, 
    n_iter=50, 
    scoring='r2', 
    cv=5, 
    verbose=1, 
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)
best_xgb = random_search.best_estimator_
y_pred_tuned = best_xgb.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)
print(f"Tuned XGBoost RÂ² Score: {r2_tuned:.6f}")
print(f"Best parameters: {random_search.best_params_}")

# Advanced XGBoost with feature engineering
print("\nAdvanced XGBoost with feature engineering...")

# Create additional features
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])

# Add polynomial features
df['feature_0_sq'] = df['feature_0'] ** 2
df['feature_1_sq'] = df['feature_1'] ** 2
df['feature_0_1'] = df['feature_0'] * df['feature_1']
df['feature_2_3'] = df['feature_2'] * df['feature_3']
df['feature_4_5'] = df['feature_4'] * df['feature_5']

# Add ratios
df['ratio_0_1'] = df['feature_0'] / (df['feature_1'] + 1e-8)
df['ratio_2_3'] = df['feature_2'] / (df['feature_3'] + 1e-8)

X_enhanced = df.values
X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_enhanced, y, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_enh_scaled = scaler.fit_transform(X_train_enh)
X_test_enh_scaled = scaler.transform(X_test_enh)

xgb_advanced = xgb.XGBRegressor(
    n_estimators=2000,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    n_jobs=-1
)

xgb_advanced.fit(X_train_enh_scaled, y_train_enh)
y_pred_advanced = xgb_advanced.predict(X_test_enh_scaled)
r2_advanced = r2_score(y_test_enh, y_pred_advanced)
print(f"Advanced XGBoost RÂ² Score: {r2_advanced:.6f}")

# Ensemble of XGBoost models
print("\nEnsemble XGBoost...")

class XGBoostEnsemble:
    def __init__(self, n_models=5):
        self.models = []
        self.n_models = n_models
        
    def fit(self, X, y):
        # Different configurations for ensemble diversity
        configs = [
            {'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8},
            {'max_depth': 7, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.7},
            {'max_depth': 5, 'learning_rate': 0.15, 'subsample': 0.7, 'colsample_bytree': 0.9},
            {'max_depth': 8, 'learning_rate': 0.03, 'subsample': 0.85, 'colsample_bytree': 0.85},
            {'max_depth': 4, 'learning_rate': 0.2, 'subsample': 0.75, 'colsample_bytree': 0.75}
        ]
        
        for i in range(self.n_models):
            print(f"Training ensemble model {i+1}/{self.n_models}")
            config = configs[i]
            
            model = xgb.XGBRegressor(
                n_estimators=1500,
                reg_alpha=0.1,
                reg_lambda=1.0,
                min_child_weight=3,
                random_state=42 + i,
                n_jobs=-1,
                **config
            )
            
            model.fit(X, y)
            self.models.append(model)
    
    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        return np.mean(predictions, axis=0)

ensemble = XGBoostEnsemble(n_models=5)
ensemble.fit(X_train_enh_scaled, y_train_enh)
y_pred_ensemble = ensemble.predict(X_test_enh_scaled)
r2_ensemble = r2_score(y_test_enh, y_pred_ensemble)
print(f"Ensemble XGBoost RÂ² Score: {r2_ensemble:.6f}")

# Cross-validation for more robust evaluation
print("\nCross-validation scores...")
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(best_xgb, X_train, y_train, cv=10, scoring='r2')
print(f"CV RÂ² Score: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.6f})")

# Feature importance
print("\nTop 10 feature importances:")
feature_names = [f'feature_{i}' for i in range(8)] + ['feature_0_sq', 'feature_1_sq', 'feature_0_1', 'feature_2_3', 'feature_4_5', 'ratio_0_1', 'ratio_2_3']
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_advanced.feature_importances_
}).sort_values('importance', ascending=False)

print(importance_df.head(10))

# Summary
print("\n" + "="*50)
print("RESULTS SUMMARY")
print("="*50)
print(f"Basic XGBoost RÂ²:     {r2_basic:.6f}")
print(f"Tuned XGBoost RÂ²:     {r2_tuned:.6f}")
print(f"Advanced XGBoost RÂ²:  {r2_advanced:.6f}")
print(f"Ensemble XGBoost RÂ²:  {r2_ensemble:.6f}")
print("="*50)

Training Basic XGBoost...
Basic XGBoost RÂ² Score: 0.849273

Hyperparameter tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Tuned XGBoost RÂ² Score: 0.852921
Best parameters: {'subsample': 0.7, 'reg_lambda': 2.0, 'reg_alpha': 0.1, 'n_estimators': 2000, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}

Advanced XGBoost with feature engineering...
Advanced XGBoost RÂ² Score: 0.848310

Ensemble XGBoost...
Training ensemble model 1/5
Training ensemble model 2/5
Training ensemble model 3/5
Training ensemble model 4/5
Training ensemble model 5/5
Ensemble XGBoost RÂ² Score: 0.854406

Cross-validation scores...
CV RÂ² Score: 0.851676 (+/- 0.018981)

Top 10 feature importances:
         feature  importance
0      feature_0    0.336188
8   feature_0_sq    0.285399
5      feature_5    0.071422
7      feature_7    0.053860
10   feature_0_1    0.052052
6      feature_6    0.051723
14     ratio_2_3    0.024971
2      feature

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np

# 1. Fixed TabNet-inspired architecture
class FeatureTransformer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)
        self.bn = nn.BatchNorm1d(output_dim)
        
    def forward(self, x):
        return self.bn(self.fc(x))

class AttentiveTransformer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)
        self.bn = nn.BatchNorm1d(output_dim)
        
    def forward(self, x):
        return self.bn(self.fc(x))

class TabNetLike(nn.Module):
    def __init__(self, input_dim, n_d=64, n_a=64, n_steps=3):
        super().__init__()
        self.input_dim = input_dim
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        
        # Feature transformer
        self.initial_bn = nn.BatchNorm1d(input_dim)
        
        # Attention transformers for each step - FIXED DIMENSIONS
        self.att_transformers = nn.ModuleList([
            AttentiveTransformer(input_dim, input_dim) for _ in range(n_steps)
        ])
        
        # Feature transformers for each step
        self.feat_transformers = nn.ModuleList([
            FeatureTransformer(input_dim, n_d + n_a) for _ in range(n_steps)
        ])
        
        # Final output layer
        self.final_mapping = nn.Linear(n_d, 1)
        
    def forward(self, x):
        x = self.initial_bn(x)
        bs = x.shape[0]
        
        # Initialize prior
        prior = torch.ones(bs, self.input_dim).to(x.device)
        
        # Aggregate decision output
        output_agg = torch.zeros(bs, self.n_d).to(x.device)
        
        for step in range(self.n_steps):
            # Attentive transformer
            mask_values = self.att_transformers[step](prior)
            mask_values = torch.softmax(mask_values, dim=-1)
            
            # Feature selection
            masked_x = mask_values * x
            
            # Feature transformer
            out = self.feat_transformers[step](masked_x)
            d = torch.relu(out[:, :self.n_d])
            
            # Aggregate
            output_agg += d
            
            # Update prior
            prior = mask_values
            
        return self.final_mapping(output_agg)

# 2. Simplified Neural Decision Tree
class NeuralDecisionTree(nn.Module):
    def __init__(self, input_dim, n_trees=10, tree_depth=4):
        super().__init__()
        self.n_trees = n_trees
        self.tree_depth = tree_depth
        
        # Multiple trees
        self.trees = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, 64),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Linear(32, 1)
            ) for _ in range(n_trees)
        ])
        
        # Tree weights
        self.tree_weights = nn.Parameter(torch.ones(n_trees) / n_trees)
        
    def forward(self, x):
        tree_outputs = []
        for tree in self.trees:
            tree_outputs.append(tree(x))
        
        # Weighted combination
        tree_outputs = torch.stack(tree_outputs, dim=-1)
        weights = torch.softmax(self.tree_weights, dim=0)
        output = torch.sum(tree_outputs * weights, dim=-1)
        
        return output

# 3. Simplified Neural Gradient Boosting
class WeakNeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim=32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x):
        return self.net(x)

class NeuralGradientBoosting:
    def __init__(self, input_dim, n_estimators=50, learning_rate=0.1):
        self.input_dim = input_dim
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.models = []
        self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
        
    def fit(self, X, y):
        X_tensor = torch.FloatTensor(X).to(self.device)
        y_tensor = torch.FloatTensor(y).reshape(-1, 1).to(self.device)
        
        # Initialize prediction with mean
        pred = torch.full_like(y_tensor, y_tensor.mean())
        
        for i in range(self.n_estimators):
            # Calculate residuals
            residuals = y_tensor - pred
            
            # Train weak learner on residuals
            model = WeakNeuralNetwork(self.input_dim).to(self.device)
            optimizer = optim.Adam(model.parameters(), lr=0.01)
            criterion = nn.MSELoss()
            
            # Quick training
            dataset = TensorDataset(X_tensor, residuals)
            loader = DataLoader(dataset, batch_size=256, shuffle=True)
            
            model.train()
            for epoch in range(20):
                for batch_x, batch_y in loader:
                    optimizer.zero_grad()
                    output = model(batch_x)
                    loss = criterion(output, batch_y)
                    loss.backward()
                    optimizer.step()
            
            # Add to ensemble
            model.eval()
            with torch.no_grad():
                pred += self.learning_rate * model(X_tensor)
            
            self.models.append(model)
            
            if i % 10 == 0:
                print(f"Estimator {i}, Loss: {loss.item():.6f}")
    
    def predict(self, X):
        X_tensor = torch.FloatTensor(X).to(self.device)
        
        with torch.no_grad():
            # Start with mean from training
            pred = torch.zeros(X.shape[0], 1).to(self.device)
            for model in self.models:
                model.eval()
                pred += self.learning_rate * model(X_tensor)
        
        return pred.cpu().numpy().flatten()

# 4. Feature Engineering Network
class FeatureEngineeringNet(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()
        self.input_dim = input_dim
        
        # Feature interaction layers
        self.pairwise_net = nn.Sequential(
            nn.Linear(input_dim * input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim//2)
        )
        
        # Original features processor
        self.original_net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Main network
        self.main_net = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.BatchNorm1d(hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim//2, hidden_dim//4),
            nn.BatchNorm1d(hidden_dim//4),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim//4, 1)
        )
        
    def forward(self, x):
        # Create pairwise interactions
        x_expanded = x.unsqueeze(2)  # (batch, features, 1)
        x_T = x.unsqueeze(1)        # (batch, 1, features)
        interactions = (x_expanded * x_T).view(x.shape[0], -1)  # (batch, features^2)
        
        # Process interactions
        interaction_features = self.pairwise_net(interactions)
        
        # Process original features
        original_features = self.original_net(x)
        
        # Combine and process
        combined = torch.cat([interaction_features, original_features], dim=1)
        return self.main_net(combined)

# 5. Tree-inspired Network with explicit splits
class TreeInspiredNet(nn.Module):
    def __init__(self, input_dim, n_leaves=64):
        super().__init__()
        self.input_dim = input_dim
        self.n_leaves = n_leaves
        
        # Feature selection for splits
        self.feature_selectors = nn.ModuleList([
            nn.Linear(input_dim, 1) for _ in range(n_leaves)
        ])
        
        # Threshold learners
        self.thresholds = nn.Parameter(torch.randn(n_leaves))
        
        # Leaf predictors
        self.leaf_predictors = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, 32),
                nn.ReLU(),
                nn.Linear(32, 1)
            ) for _ in range(n_leaves)
        ])
        
        # Gating network
        self.gate = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, n_leaves),
            nn.Softmax(dim=1)
        )
        
    def forward(self, x):
        # Get gate weights
        gate_weights = self.gate(x)
        
        # Get predictions from each leaf
        leaf_outputs = []
        for i, predictor in enumerate(self.leaf_predictors):
            leaf_outputs.append(predictor(x))
        
        # Stack and weight
        leaf_outputs = torch.stack(leaf_outputs, dim=-1).squeeze(1)
        output = torch.sum(gate_weights * leaf_outputs, dim=1, keepdim=True)
        
        return output

# Test all approaches
def test_approaches():
    # Load data
    X, y = fetch_california_housing(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    results = {}
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
    
    # Common training setup
    criterion = nn.MSELoss()
    train_dataset = TensorDataset(torch.FloatTensor(X_train_scaled), torch.FloatTensor(y_train))
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    
    # 1. TabNet-like
    print("Training TabNet-like model...")
    model1 = TabNetLike(X.shape[1]).to(device)
    optimizer1 = optim.Adam(model1.parameters(), lr=0.001)
    
    model1.train()
    for epoch in range(100):
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer1.zero_grad()
            pred = model1(batch_x).squeeze()
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer1.step()
    
    model1.eval()
    with torch.no_grad():
        test_tensor = torch.FloatTensor(X_test_scaled).to(device)
        pred1 = model1(test_tensor).squeeze().cpu().numpy()
    results['TabNet-like'] = r2_score(y_test, pred1)
    
    # 2. Neural Decision Tree
    print("Training Neural Decision Tree...")
    model2 = NeuralDecisionTree(X.shape[1]).to(device)
    optimizer2 = optim.Adam(model2.parameters(), lr=0.001)
    
    model2.train()
    for epoch in range(100):
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer2.zero_grad()
            pred = model2(batch_x).squeeze()
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer2.step()
    
    model2.eval()
    with torch.no_grad():
        pred2 = model2(test_tensor).squeeze().cpu().numpy()
    results['Neural Decision Tree'] = r2_score(y_test, pred2)
    
    # 3. Tree-inspired Network
    print("Training Tree-inspired Network...")
    model3 = TreeInspiredNet(X.shape[1]).to(device)
    optimizer3 = optim.Adam(model3.parameters(), lr=0.001)
    
    model3.train()
    for epoch in range(100):
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer3.zero_grad()
            pred = model3(batch_x).squeeze()
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer3.step()
    
    model3.eval()
    with torch.no_grad():
        pred3 = model3(test_tensor).squeeze().cpu().numpy()
    results['Tree-inspired Net'] = r2_score(y_test, pred3)
    
    # 4. Feature Engineering Net
    print("Training Feature Engineering Net...")
    model4 = FeatureEngineeringNet(X.shape[1]).to(device)
    optimizer4 = optim.Adam(model4.parameters(), lr=0.0005)
    
    model4.train()
    for epoch in range(150):
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer4.zero_grad()
            pred = model4(batch_x).squeeze()
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer4.step()
    
    model4.eval()
    with torch.no_grad():
        pred4 = model4(test_tensor).squeeze().cpu().numpy()
    results['Feature Engineering Net'] = r2_score(y_test, pred4)
    
    # 5. Neural Gradient Boosting
    print("Training Neural Gradient Boosting...")
    ngb = NeuralGradientBoosting(X.shape[1], n_estimators=30)
    ngb.fit(X_train_scaled, y_train)
    pred5 = ngb.predict(X_test_scaled)
    results['Neural Gradient Boosting'] = r2_score(y_test, pred5)
    
    # Print results
    print("\n" + "="*50)
    print("RESULTS COMPARISON")
    print("="*50)
    print(f"XGBoost baseline:           0.849273")
    for name, score in results.items():
        print(f"{name:<25}: {score:.6f}")
    print("="*50)

test_approaches()

Training TabNet-like model...
Training Neural Decision Tree...


Exception ignored in: <function ResourceTracker.__del__ at 0x104b6ee80>
Traceback (most recent call last):
  File "/Users/calld-admin/.pyenv/versions/3.12.10/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/Users/calld-admin/.pyenv/versions/3.12.10/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/Users/calld-admin/.pyenv/versions/3.12.10/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x104b56e80>
Traceback (most recent call last):
  File "/Users/calld-admin/.pyenv/versions/3.12.10/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/Users/calld-admin/.pyenv/versions/3.12.10/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/Users/calld-admin/.pyenv/versions/3.12.10/lib/python3.12/multiprocessing/resource_tracker.py", line 111, i

Training Tree-inspired Network...
Training Feature Engineering Net...
Training Neural Gradient Boosting...
Estimator 0, Loss: 0.324452
Estimator 10, Loss: 0.320138
Estimator 20, Loss: 0.249264

RESULTS COMPARISON
XGBoost baseline:           0.849273
TabNet-like              : -25.061192
Neural Decision Tree     : 0.790182
Tree-inspired Net        : 0.788188
Feature Engineering Net  : 0.792173
Neural Gradient Boosting : -2.542184


In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np

# 1. Simple but Effective Deep Network
class SimpleDeepNet(nn.Module):
    def __init__(self, input_dim, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            # Input layer
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            
            # Hidden layers
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout/2),
            
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            
            # Output
            nn.Linear(32, 1)
        )
        
        # Initialize weights properly
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.BatchNorm1d):
            nn.init.constant_(module.weight, 1)
            nn.init.constant_(module.bias, 0)
    
    def forward(self, x):
        return self.net(x)

# 2. Wide & Deep Network (Google's approach)
class WideAndDeep(nn.Module):
    def __init__(self, input_dim, wide_dim=None, deep_hidden=[512, 256, 128]):
        super().__init__()
        
        if wide_dim is None:
            wide_dim = input_dim
        
        # Wide part (linear model)
        self.wide = nn.Linear(wide_dim, 1)
        
        # Deep part
        deep_layers = []
        prev_dim = input_dim
        
        for hidden_dim in deep_hidden:
            deep_layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.2)
            ])
            prev_dim = hidden_dim
        
        deep_layers.append(nn.Linear(prev_dim, 1))
        self.deep = nn.Sequential(*deep_layers)
        
        # Combination
        self.output_bias = nn.Parameter(torch.zeros(1))
        
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    
    def forward(self, x):
        wide_out = self.wide(x)
        deep_out = self.deep(x)
        return wide_out + deep_out + self.output_bias

# 3. Residual Network for Tabular Data
class TabularResNet(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, n_blocks=4):
        super().__init__()
        
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU()
        )
        
        # Residual blocks
        self.blocks = nn.ModuleList([
            self._make_block(hidden_dim) for _ in range(n_blocks)
        ])
        
        self.output_layer = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim // 2, 1)
        )
        
        self.apply(self._init_weights)
    
    def _make_block(self, dim):
        return nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim)
        )
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    
    def forward(self, x):
        x = self.input_layer(x)
        
        for block in self.blocks:
            residual = x
            x = block(x)
            x = torch.relu(x + residual)  # Residual connection
        
        return self.output_layer(x)

# 4. Ensemble of Different Architectures
class HeterogeneousEnsemble:
    def __init__(self, input_dim):
        self.models = []
        self.input_dim = input_dim
        self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
        
    def fit(self, X, y):
        # Convert to tensors
        X_tensor = torch.FloatTensor(X).to(self.device)
        y_tensor = torch.FloatTensor(y).to(self.device)
        
        # Create dataset
        dataset = TensorDataset(X_tensor, y_tensor)
        train_loader = DataLoader(dataset, batch_size=512, shuffle=True)
        
        # Model configurations
        model_configs = [
            ('SimpleDeep', SimpleDeepNet(self.input_dim, dropout=0.2)),
            ('WideDeep', WideAndDeep(self.input_dim)),
            ('ResNet', TabularResNet(self.input_dim, hidden_dim=512, n_blocks=3)),
            ('SimpleDeep2', SimpleDeepNet(self.input_dim, dropout=0.4)),
            ('ResNet2', TabularResNet(self.input_dim, hidden_dim=256, n_blocks=6))
        ]
        
        for name, model in model_configs:
            print(f"Training {name}...")
            model = model.to(self.device)
            
            # Different optimizers and learning rates
            if 'Wide' in name:
                optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
            else:
                optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
            
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, patience=10, factor=0.5, verbose=False
            )
            
            criterion = nn.MSELoss()
            best_loss = float('inf')
            patience = 0
            
            model.train()
            for epoch in range(200):
                epoch_loss = 0
                for batch_x, batch_y in train_loader:
                    optimizer.zero_grad()
                    pred = model(batch_x).squeeze()
                    loss = criterion(pred, batch_y)
                    loss.backward()
                    
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    
                    optimizer.step()
                    epoch_loss += loss.item()
                
                avg_loss = epoch_loss / len(train_loader)
                scheduler.step(avg_loss)
                
                # Early stopping
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    patience = 0
                else:
                    patience += 1
                    if patience >= 20:
                        break
            
            model.eval()
            self.models.append((name, model))
    
    def predict(self, X):
        X_tensor = torch.FloatTensor(X).to(self.device)
        
        predictions = []
        with torch.no_grad():
            for name, model in self.models:
                model.eval()
                pred = model(X_tensor).squeeze().cpu().numpy()
                predictions.append(pred)
        
        # Simple average ensemble
        return np.mean(predictions, axis=0)

# Enhanced preprocessing
def advanced_preprocessing(X_train, X_test):
    """Apply advanced preprocessing that neural networks benefit from"""
    
    # 1. Quantile transformation (makes features more Gaussian)
    qt = QuantileTransformer(output_distribution='normal', random_state=42)
    X_train_qt = qt.fit_transform(X_train)
    X_test_qt = qt.transform(X_test)
    
    # 2. Standard scaling after quantile transform
    scaler = StandardScaler()
    X_train_final = scaler.fit_transform(X_train_qt)
    X_test_final = scaler.transform(X_test_qt)
    
    return X_train_final, X_test_final

# Main training function
def train_neural_networks():
    # Load data
    X, y = fetch_california_housing(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Enhanced preprocessing
    X_train_processed, X_test_processed = advanced_preprocessing(X_train, X_test)
    
    print("Original data shape:", X.shape)
    print("Processed data shape:", X_train_processed.shape)
    
    results = {}
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
    
    # Common training setup
    criterion = nn.MSELoss()
    
    # 1. Simple Deep Network
    print("\n1. Training Simple Deep Network...")
    model1 = SimpleDeepNet(X.shape[1]).to(device)
    optimizer1 = optim.AdamW(model1.parameters(), lr=0.001, weight_decay=0.01)
    scheduler1 = optim.lr_scheduler.OneCycleLR(
        optimizer1, max_lr=0.01, epochs=300, 
        steps_per_epoch=len(X_train_processed)//512 + 1
    )
    
    train_dataset1 = TensorDataset(
        torch.FloatTensor(X_train_processed), 
        torch.FloatTensor(y_train)
    )
    train_loader1 = DataLoader(train_dataset1, batch_size=512, shuffle=True)
    
    model1.train()
    for epoch in range(300):
        for batch_x, batch_y in train_loader1:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer1.zero_grad()
            pred = model1(batch_x).squeeze()
            loss = criterion(pred, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model1.parameters(), max_norm=1.0)
            optimizer1.step()
            scheduler1.step()
    
    model1.eval()
    with torch.no_grad():
        test_tensor = torch.FloatTensor(X_test_processed).to(device)
        pred1 = model1(test_tensor).squeeze().cpu().numpy()
    results['Simple Deep Net'] = r2_score(y_test, pred1)
    
    # 2. Wide & Deep
    print("\n2. Training Wide & Deep Network...")
    model2 = WideAndDeep(X.shape[1]).to(device)
    optimizer2 = optim.Adam(model2.parameters(), lr=0.001, weight_decay=0.001)
    
    train_dataset2 = TensorDataset(
        torch.FloatTensor(X_train_processed), 
        torch.FloatTensor(y_train)
    )
    train_loader2 = DataLoader(train_dataset2, batch_size=512, shuffle=True)
    
    model2.train()
    for epoch in range(200):
        for batch_x, batch_y in train_loader2:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer2.zero_grad()
            pred = model2(batch_x).squeeze()
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer2.step()
    
    model2.eval()
    with torch.no_grad():
        pred2 = model2(test_tensor).squeeze().cpu().numpy()
    results['Wide & Deep'] = r2_score(y_test, pred2)
    
    # 3. Tabular ResNet
    print("\n3. Training Tabular ResNet...")
    model3 = TabularResNet(X.shape[1]).to(device)
    optimizer3 = optim.Adam(model3.parameters(), lr=0.001, weight_decay=0.001)
    
    model3.train()
    for epoch in range(200):
        for batch_x, batch_y in train_loader2:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer3.zero_grad()
            pred = model3(batch_x).squeeze()
            loss = criterion(pred, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model3.parameters(), max_norm=1.0)
            optimizer3.step()
    
    model3.eval()
    with torch.no_grad():
        pred3 = model3(test_tensor).squeeze().cpu().numpy()
    results['Tabular ResNet'] = r2_score(y_test, pred3)
    
    # 4. Ensemble
    print("\n4. Training Heterogeneous Ensemble...")
    ensemble = HeterogeneousEnsemble(X.shape[1])
    ensemble.fit(X_train_processed, y_train)
    pred_ensemble = ensemble.predict(X_test_processed)
    results['Ensemble'] = r2_score(y_test, pred_ensemble)
    
    # Print results
    print("\n" + "="*60)
    print("NEURAL NETWORK RESULTS (with proper training)")
    print("="*60)
    print(f"XGBoost baseline:                    0.849273")
    print("-"*60)
    for name, score in results.items():
        gap = abs(0.849273 - score)
        print(f"{name:<30}: {score:.6f} (gap: {gap:.6f})")
    print("="*60)
    
    return results

# Run the comparison
results = train_neural_networks()

Original data shape: (20640, 8)
Processed data shape: (16512, 8)

1. Training Simple Deep Network...

2. Training Wide & Deep Network...

3. Training Tabular ResNet...

4. Training Heterogeneous Ensemble...
Training SimpleDeep...




Training WideDeep...
Training ResNet...
Training SimpleDeep2...
Training ResNet2...

NEURAL NETWORK RESULTS (with proper training)
XGBoost baseline:                    0.849273
------------------------------------------------------------
Simple Deep Net               : 0.815544 (gap: 0.033729)
Wide & Deep                   : 0.818977 (gap: 0.030296)
Tabular ResNet                : 0.800976 (gap: 0.048297)
Ensemble                      : 0.827754 (gap: 0.021519)


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd

# Fixed Advanced Feature Engineering
def create_advanced_features(X_train, X_test):
    """Create XGBoost-like feature interactions consistently across train/test"""
    
    # Combine for consistent feature engineering
    X_combined = np.vstack([X_train, X_test])
    n_train = len(X_train)
    
    features_list = []
    
    # Original features
    features_list.append(X_combined)
    
    # Polynomial features (degree 2, interactions only)
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly_features = poly.fit_transform(X_combined)
    # Only new features (skip original ones)
    features_list.append(poly_features[:, X_combined.shape[1]:])
    
    # Log transformations (for positive features)
    for i in range(X_combined.shape[1]):
        if np.all(X_combined[:, i] > 0):
            log_feat = np.log1p(X_combined[:, i:i+1])
            features_list.append(log_feat)
    
    # Ratios between features
    for i in range(X_combined.shape[1]):
        for j in range(i+1, X_combined.shape[1]):
            if np.all(X_combined[:, j] != 0):
                ratio = X_combined[:, i:i+1] / (X_combined[:, j:j+1] + 1e-8)
                features_list.append(ratio)
    
    # Binning features (using quantiles from combined data)
    for i in range(X_combined.shape[1]):
        # Use quantiles for consistent binning
        quantiles = np.percentile(X_combined[:, i], [0, 20, 40, 60, 80, 100])
        quantiles = np.unique(quantiles)  # Remove duplicates
        
        if len(quantiles) > 1:
            binned = np.digitize(X_combined[:, i], quantiles[1:-1])
            # One-hot encode
            n_bins = len(quantiles) - 1
            binned_onehot = np.eye(n_bins)[binned]
            features_list.append(binned_onehot)
    
    # Combine all features
    X_enhanced = np.hstack(features_list).astype(np.float32)
    
    # Split back
    X_train_enhanced = X_enhanced[:n_train]
    X_test_enhanced = X_enhanced[n_train:]
    
    return X_train_enhanced, X_test_enhanced

# Self-Attention Network for Tabular Data
class SelfAttentionTabular(nn.Module):
    def __init__(self, input_dim, embed_dim=128, num_heads=8):
        super().__init__()
        
        # Feature embedding
        self.feature_embed = nn.Linear(input_dim, embed_dim)
        
        # Multi-head self-attention
        self.self_attn = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=0.1, batch_first=True
        )
        
        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(embed_dim * 2, embed_dim)
        )
        
        # Layer normalization
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        
        # Output head
        self.output = nn.Sequential(
            nn.Linear(embed_dim, embed_dim // 2),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(embed_dim // 2, 1)
        )
        
    def forward(self, x):
        # Embed features
        x = self.feature_embed(x)  # (batch, embed_dim)
        
        # Add position-like encoding (feature importance)
        x = x.unsqueeze(1)  # (batch, 1, embed_dim)
        
        # Self-attention
        attn_out, _ = self.self_attn(x, x, x)
        x = self.norm1(x + attn_out)
        
        # Feed-forward
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)
        
        # Output
        x = x.squeeze(1)  # (batch, embed_dim)
        return self.output(x)

# Swish activation
class Swish(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)

# Simplified but Effective Ensemble
class AdvancedEnsemble:
    def __init__(self, input_dim):
        self.input_dim = input_dim
        self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
        self.models = []
        
    def create_models(self):
        """Create diverse models"""
        
        # Model 1: Deep Network with GELU
        model1 = nn.Sequential(
            nn.Linear(self.input_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Dropout(0.3),
            
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.3),
            
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(0.2),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout(0.1),
            
            nn.Linear(128, 1)
        )
        
        # Model 2: Wide & Deep
        class WideDeep(nn.Module):
            def __init__(self, input_dim):
                super().__init__()
                self.wide = nn.Linear(input_dim, 1)
                self.deep = nn.Sequential(
                    nn.Linear(input_dim, 512),
                    nn.BatchNorm1d(512),
                    nn.ReLU(),
                    nn.Dropout(0.2),
                    nn.Linear(512, 256),
                    nn.BatchNorm1d(256),
                    nn.ReLU(),
                    nn.Dropout(0.2),
                    nn.Linear(256, 128),
                    nn.ReLU(),
                    nn.Linear(128, 1)
                )
                
            def forward(self, x):
                return self.wide(x) + self.deep(x)
        
        model2 = WideDeep(self.input_dim)
        
        # Model 3: ResNet-style (FIXED GELU)
        class ResBlock(nn.Module):
            def __init__(self, dim):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(dim, dim),
                    nn.BatchNorm1d(dim),
                    nn.GELU(),
                    nn.Dropout(0.1),
                    nn.Linear(dim, dim),
                    nn.BatchNorm1d(dim)
                )
                
            def forward(self, x):
                return F.gelu(x + self.net(x))  # Fixed: use F.gelu
        
        class ResNet(nn.Module):
            def __init__(self, input_dim):
                super().__init__()
                self.input_proj = nn.Sequential(
                    nn.Linear(input_dim, 512),
                    nn.BatchNorm1d(512),
                    nn.GELU()
                )
                self.blocks = nn.Sequential(*[ResBlock(512) for _ in range(6)])
                self.output = nn.Sequential(
                    nn.Dropout(0.3),
                    nn.Linear(512, 256),
                    nn.GELU(),
                    nn.Linear(256, 1)
                )
                
            def forward(self, x):
                x = self.input_proj(x)
                x = self.blocks(x)
                return self.output(x)
        
        model3 = ResNet(self.input_dim)
        
        # Model 4: Attention-based
        model4 = SelfAttentionTabular(self.input_dim, embed_dim=256, num_heads=8)
        
        # Model 5: Another deep net with Swish activation
        model5 = nn.Sequential(
            nn.Linear(self.input_dim, 768),
            nn.BatchNorm1d(768),
            Swish(),
            nn.Dropout(0.25),
            
            nn.Linear(768, 384),
            nn.BatchNorm1d(384),
            Swish(),
            nn.Dropout(0.25),
            
            nn.Linear(384, 192),
            nn.BatchNorm1d(192),
            Swish(),
            nn.Dropout(0.15),
            
            nn.Linear(192, 1)
        )
        
        return [
            ('DeepNet1', model1),
            ('WideDeep', model2),
            ('ResNet', model3),
            ('Attention', model4),
            ('DeepNet2', model5)
        ]
    
    def fit(self, X, y):
        models = self.create_models()
        
        for name, model in models:
            print(f"Training {name}...")
            model = model.to(self.device)
            
            # Different training configurations
            if 'Attention' in name:
                optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.01)
                epochs = 150
            elif 'ResNet' in name:
                optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.005)
                epochs = 200
            else:
                optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
                epochs = 200
            
            # Learning rate scheduler
            scheduler = optim.lr_scheduler.OneCycleLR(
                optimizer, max_lr=optimizer.param_groups[0]['lr'] * 3,
                epochs=epochs, steps_per_epoch=len(X)//256 + 1
            )
            
            criterion = nn.MSELoss()
            
            # Create data loader
            dataset = TensorDataset(torch.FloatTensor(X), torch.FloatTensor(y))
            loader = DataLoader(dataset, batch_size=256, shuffle=True)
            
            # Training loop
            model.train()
            for epoch in range(epochs):
                epoch_loss = 0
                for batch_x, batch_y in loader:
                    batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
                    
                    optimizer.zero_grad()
                    pred = model(batch_x).squeeze()
                    loss = criterion(pred, batch_y)
                    loss.backward()
                    
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    
                    optimizer.step()
                    scheduler.step()
                    epoch_loss += loss.item()
                
                if epoch % 50 == 0:
                    avg_loss = epoch_loss / len(loader)
                    print(f"  Epoch {epoch}, Loss: {avg_loss:.6f}")
            
            model.eval()
            self.models.append((name, model))
    
    def predict(self, X):
        predictions = []
        
        with torch.no_grad():
            X_tensor = torch.FloatTensor(X).to(self.device)
            
            for name, model in self.models:
                model.eval()
                pred = model(X_tensor).squeeze().cpu().numpy()
                predictions.append(pred)
        
        # Weighted average (you could learn these weights too)
        weights = np.array([0.25, 0.20, 0.20, 0.15, 0.20])  # Slightly prefer deep nets
        
        predictions = np.array(predictions)
        final_pred = np.average(predictions, axis=0, weights=weights)
        
        return final_pred

# Ultimate approach with all fixes
def ultimate_neural_network_approach():
    # Load data
    X, y = fetch_california_housing(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print("Creating advanced features...")
    # Create enhanced features (fixed version)
    X_train_enhanced, X_test_enhanced = create_advanced_features(X_train, X_test)
    
    print(f"Original features: {X.shape[1]}")
    print(f"Enhanced features: {X_train_enhanced.shape[1]}")
    
    # Advanced preprocessing
    print("Preprocessing...")
    qt = QuantileTransformer(output_distribution='normal', random_state=42)
    X_train_qt = qt.fit_transform(X_train_enhanced)
    X_test_qt = qt.transform(X_test_enhanced)
    
    scaler = StandardScaler()
    X_train_final = scaler.fit_transform(X_train_qt)
    X_test_final = scaler.transform(X_test_qt)
    
    # Train ensemble
    print("Training advanced ensemble...")
    ensemble = AdvancedEnsemble(X_train_final.shape[1])
    ensemble.fit(X_train_final, y_train)
    
    # Predict
    print("Making predictions...")
    y_pred = ensemble.predict(X_test_final)
    final_r2 = r2_score(y_test, y_pred)
    
    print("\n" + "="*60)
    print("ULTIMATE NEURAL NETWORK RESULTS")
    print("="*60)
    print(f"XGBoost baseline:                    0.849273")
    print(f"Ultimate Neural Network:             {final_r2:.6f}")
    print(f"Gap closed:                          {abs(0.849273 - final_r2):.6f}")
    
    if final_r2 > 0.84:
        print("ðŸŽ‰ EXCELLENT! Very close to XGBoost performance!")
    elif final_r2 > 0.82:
        print("âœ… GOOD! Getting close to XGBoost performance!")
    else:
        print("ðŸ“ˆ Room for improvement, but still decent!")
    
    print("="*60)
    
    return final_r2

# Run the ultimate approach
ultimate_neural_network_approach()

Creating advanced features...
Original features: 8
Enhanced features: 111
Preprocessing...
Training advanced ensemble...
Training DeepNet1...
  Epoch 0, Loss: 3.406161
  Epoch 50, Loss: 0.322034
  Epoch 100, Loss: 0.300962
  Epoch 150, Loss: 0.276026
Training WideDeep...
  Epoch 0, Loss: 2.396959
  Epoch 50, Loss: 0.279442
  Epoch 100, Loss: 0.257549
  Epoch 150, Loss: 0.207181
Training ResNet...
  Epoch 0, Loss: 0.864335
  Epoch 50, Loss: 0.305222
  Epoch 100, Loss: 0.280137
  Epoch 150, Loss: 0.258561
Training Attention...
  Epoch 0, Loss: 2.297671
  Epoch 50, Loss: 0.191397
  Epoch 100, Loss: 0.059614
Training DeepNet2...
  Epoch 0, Loss: 4.631473
  Epoch 50, Loss: 0.330715
  Epoch 100, Loss: 0.314646
  Epoch 150, Loss: 0.291762
Making predictions...

ULTIMATE NEURAL NETWORK RESULTS
XGBoost baseline:                    0.849273
Ultimate Neural Network:             0.817338
Gap closed:                          0.031935
ðŸ“ˆ Room for improvement, but still decent!


0.8173384968370929