In [1]:
import math
import random

import matplotlib as mpl
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from scipy import stats
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from xgboost import XGBRegressor

In [None]:
# Feature matrices are read from CSV files in the working directory.
file_path_1 = 'Encoded_train_data.csv'
X_train_encoded = pd.read_csv(file_path_1)
file_path_2 = 'Encoded_test_data.csv'
X_test_encoded = pd.read_csv(file_path_2)

# read_csv always returns a 2-D DataFrame, even for a single column.
# squeeze("columns") turns a one-column frame into a 1-D Series (frames with
# several columns are returned unchanged), so that arithmetic against 1-D
# prediction arrays is element-wise and scikit-learn does not warn about a
# column-vector target.
file_path_3 = 'y_train.csv'
y_train = pd.read_csv(file_path_3).squeeze("columns")
file_path_4 = 'y_test.csv'
y_test = pd.read_csv(file_path_4).squeeze("columns")

In [None]:
# (name, estimator) pairs; every estimator is seeded with random_state=42.
base_learners = [
    (
        'rf',
        RandomForestRegressor(
            n_estimators=300,
            max_depth=13,
            min_samples_leaf=1,
            min_samples_split=2,
            random_state=42,
        ),
    ),
    (
        'xgb',
        XGBRegressor(
            n_estimators=257,
            learning_rate=0.125,
            max_depth=3,
            colsample_bytree=1,
            subsample=0.5,
            random_state=42,
        ),
    ),
    (
        'catboost',
        CatBoostRegressor(
            iterations=300,
            learning_rate=0.191,
            depth=3,
            verbose=0,  # silence CatBoost's per-iteration log
            random_state=42,
        ),
    ),
    (
        'gbr',
        GradientBoostingRegressor(
            n_estimators=277,
            learning_rate=0.069,
            max_depth=4,
            subsample=0.5,
            random_state=42,
        ),
    ),
    (
        'lgbm',
        LGBMRegressor(
            n_estimators=300,
            learning_rate=0.165,
            max_depth=3,
            num_leaves=20,
            random_state=42,
        ),
    ),
]

In [None]:
class PyTorchRegressor(RegressorMixin, BaseEstimator):
    """Scikit-learn compatible feed-forward regressor implemented with PyTorch.

    The network is ``Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout ->
    Linear`` with layer widths ``input_dim -> hidden_dim -> hidden_dim // 2 -> 1``.
    It is trained with the Adam optimiser on a mean-squared-error loss.

    The mixin is listed before ``BaseEstimator`` so that the regressor-specific
    behaviour of ``RegressorMixin`` takes precedence in the method resolution
    order, as scikit-learn's estimator guidelines require.

    Parameters
    ----------
    input_dim : int
        Number of input features expected by the first linear layer.
    hidden_dim : int, default=64
        Width of the first hidden layer; the second has ``hidden_dim // 2`` units.
    dropout : float, default=0.2
        Dropout probability applied after each hidden activation.
    lr : float, default=0.001
        Learning rate passed to Adam.
    epochs : int, default=100
        Number of full passes over the training data.
    batch_size : int, default=32
        Mini-batch size used by the training ``DataLoader``.
    random_state : int or None, default=None
        When not ``None``, ``fit`` calls ``torch.manual_seed(random_state)``
        before building the network. Note that this seeds PyTorch's *global*
        random number generator. ``None`` leaves the generator untouched.

    Attributes
    ----------
    device : torch.device
        ``cuda`` when available, otherwise ``cpu``.
    model : torch.nn.Sequential
        The network. Created at construction and rebuilt at the start of
        every ``fit`` call.
    history : dict
        ``history['loss']`` holds the sample-weighted mean training loss of
        each epoch of the most recent ``fit`` call.
    """

    def __init__(self, input_dim, hidden_dim=64, dropout=0.2, lr=0.001, epochs=100, batch_size=32,
                 random_state=None):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.build_model().to(self.device)
        self.history = {'loss': []}

    def build_model(self):
        """Return a freshly initialised network (not yet moved to ``self.device``)."""
        model = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dim, self.hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dim//2, 1)
        )
        return model

    def _to_tensor(self, data):
        """Convert array-like ``data`` to a float32 tensor on ``self.device``.

        Going through ``np.asarray`` lets pandas Series/DataFrames, lists and
        ndarrays all be accepted; ``torch.tensor`` alone cannot ingest a
        DataFrame.
        """
        if torch.is_tensor(data):
            return data.detach().to(device=self.device, dtype=torch.float32)
        array = np.asarray(data, dtype=np.float32)
        return torch.tensor(array, dtype=torch.float32).to(self.device)

    def fit(self, X, y):
        """Train the network from scratch on ``(X, y)``.

        Parameters
        ----------
        X : array-like of shape (n_samples, input_dim)
            Training features.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training targets.

        Returns
        -------
        self : PyTorchRegressor
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the number of samples.
        """
        if self.random_state is not None:
            torch.manual_seed(self.random_state)

        X_tensor = self._to_tensor(X)
        # reshape(-1, 1) gives an (n, 1) column for both (n,) and (n, 1)
        # targets. unsqueeze(1) on an (n, 1) input would produce (n, 1, 1),
        # which MSELoss silently broadcasts against the (batch, 1) outputs.
        y_tensor = self._to_tensor(y).reshape(-1, 1)

        n_samples = X_tensor.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit PyTorchRegressor on an empty dataset.")
        if y_tensor.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent numbers of samples: "
                f"{n_samples} != {y_tensor.shape[0]}"
            )

        # Start from fresh weights and an empty history so that calling fit
        # again does not continue training the previous model.
        self.model = self.build_model().to(self.device)
        self.history = {'loss': []}
        self.model.train()

        dataset = TensorDataset(X_tensor, y_tensor)
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        for epoch in range(self.epochs):
            epoch_loss = 0.0
            for batch_X, batch_y in loader:
                optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                # Weight each batch's mean loss by its size so that a smaller
                # final batch does not skew the epoch average.
                epoch_loss += loss.item() * batch_X.size(0)
            epoch_loss /= n_samples
            self.history['loss'].append(epoch_loss)
            # Log the first epoch and every tenth epoch thereafter.
            if (epoch+1) % 10 == 0 or epoch == 0:
                print(f'Epoch [{epoch+1}/{self.epochs}], Loss: {epoch_loss:.4f}')
        return self

    def predict(self, X):
        """Return predictions for ``X`` as a 1-D NumPy array.

        The network is switched to evaluation mode (dropout disabled) and run
        without gradient tracking.
        """
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(self._to_tensor(X))
        return outputs.cpu().numpy().flatten()

In [None]:
# Final (level-1) estimator: one network input per base learner.
meta_learner = PyTorchRegressor(
    input_dim=len(base_learners),
    # architecture
    hidden_dim=256,
    dropout=0.05,
    # optimisation
    lr=0.00023,
    batch_size=16,
    epochs=200,
)

# Stack the base learners under the meta-learner, using 20-fold
# cross-validation and all available cores (n_jobs=-1).
stacking_regressor = StackingRegressor(
    final_estimator=meta_learner,
    estimators=base_learners,
    passthrough=False,
    cv=20,
    n_jobs=-1,
)

In [None]:
# Seed PyTorch's global generator right before fitting so that the
# meta-learner's weight initialisation and batch shuffling are repeatable each
# time this cell runs (GPU kernels may still add small nondeterminism).
torch.manual_seed(42)

# np.ravel flattens the target to 1-D; a single-column DataFrame would
# otherwise be treated as a column vector by scikit-learn.
stacking_regressor.fit(X_train_encoded, np.ravel(y_train))

In [None]:
# Score the stacked model on the test split.
y_pred_stack = stacking_regressor.predict(X_test_encoded)

mse_stack = mean_squared_error(y_test, y_pred_stack)
r2_stack = r2_score(y_test, y_pred_stack)
mae_stack = mean_absolute_error(y_test, y_pred_stack)
# RMSE is the square root of the MSE computed above; no need to recompute it.
rmse_stack = np.sqrt(mse_stack)

print(f"MSE:{mse_stack:.4f}")
print(f"MAE:{mae_stack:.4f}")
print(f"RMSE:{rmse_stack:.4f}")
print(f"R²:{r2_stack:.4f}")

In [None]:
# Predictions of the fitted stack on the training split, then the test split.
y_pred_train, y_pred_test = (
    stacking_regressor.predict(features)
    for features in (X_train_encoded, X_test_encoded)
)

In [None]:
# R² and RMSE on the training split ...
r2_train, rmse_train = (
    r2_score(y_train, y_pred_train),
    np.sqrt(mean_squared_error(y_train, y_pred_train)),
)

# ... and on the test split.
r2_test, rmse_test = (
    r2_score(y_test, y_pred_test),
    np.sqrt(mean_squared_error(y_test, y_pred_test)),
)

In [None]:
# Flatten both operands to 1-D float arrays before subtracting. y_test may be
# a single-column DataFrame, and pandas does not subtract a length-n 1-D array
# from an (n, 1) frame row by row.
residuals = np.asarray(y_test, dtype=float).ravel() - np.asarray(y_pred_stack, dtype=float).ravel()

# Density-normalised histogram; bars are drawn manually so each can be
# coloured by its own height.
counts, bins = np.histogram(residuals, bins=30, density=True)
bin_centers = 0.5 * (bins[1:] + bins[:-1])

norm = mpl.colors.Normalize(vmin=counts.min(), vmax=counts.max())
cmap = plt.get_cmap('Purples')
colors = cmap(norm(counts))

fig, ax = plt.subplots(figsize=(10, 6))

bars = ax.bar(bin_centers, counts, width=(bins[1]-bins[0]), color=colors, edgecolor='black', alpha=0.7, label='Residuals')

sns.kdeplot(residuals, color='darkblue', linewidth=2, ax=ax, label='KDE')

# Colour bar that maps the bar shading back to density values.
sm = mpl.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Density', fontsize=12)

ax.set_title('Residuals Distribution', fontsize=18, fontweight='bold')
ax.set_xlabel('Residuals', fontsize=14)
ax.set_ylabel('Density', fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=12)

# Label every bar with its density value.
for bar in bars:
    height = bar.get_height()
    ax.annotate(f'{height:.2f}',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords='offset points',
                ha='center', va='bottom', fontsize=10)

# Mark the mean residual; with 1-D residuals these are plain scalars, so the
# ':.4f' format below is valid.
mean_residual = np.mean(residuals)
std_residual = np.std(residuals)
ax.axvline(mean_residual, color='red', linestyle='--', linewidth=1)
ax.text(mean_residual, counts.max()*0.9, f'Mean: {mean_residual:.4f}', color='red', fontsize=12)

plt.tight_layout()
# To export the figure, call fig.savefig('residuals_distribution.png', dpi=300)
# at this point, before plt.show(); saving after show() can write an empty canvas.
plt.show()