In [1]:
!pip uninstall -q -y scikit-learn
!pip install -q scikit-learn==1.4.0

In [2]:
#========================= XGBOOST CLASSIFIER =========================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, classification_report

import xgboost as xgb

# Loading the breast cancer dataset from sklearn
data = load_breast_cancer()
X = data.data        # Features
y = data.target      # Target labels

# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Initializing the XGBoost classifier
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic', # Binary classification task
    n_estimators=10,             # Number of boosting rounds (trees)
    seed=123                     # Seed for reproducibility
)

# Training the classifier on the training data
xg_cl.fit(X_train, y_train)

# Predicting the class labels for the test set
preds = xg_cl.predict(X_test)

# Calculating and displaying accuracy
accuracy = accuracy_score(y_test, preds)

# Displaying results in a clean and formatted manner
print("="*30 + " RESULTS " + "="*30)
print(f"Dataset: Breast Cancer (from sklearn)")
print(f"Accuracy: {accuracy:.3%}")
print("\nDetailed Classification Report:")
print(classification_report(y_test, preds, target_names=data.target_names))
print("="*70)

Dataset: Breast Cancer (from sklearn)
Accuracy: 98.246%

Detailed Classification Report:
              precision    recall  f1-score   support

   malignant       0.98      0.98      0.98        41
      benign       0.99      0.99      0.99        73

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [3]:
#========================= XGBOOST CLASSIFIER CROSS-VALIDATION =========================
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Loading the breast cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)  # Features as DataFrame
y = pd.Series(data.target, name="target")                # Target as Series

# Splitting into training and testing sets (though only training is needed for CV)
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=123)

# Converting the training data into an XGBoost DMatrix
churn_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# Setting parameters for the XGBoost model
params = {
    "objective": "binary:logistic",  # Binary classification
    "max_depth": 4                  # Maximum depth of the trees
}

# Performing cross-validation with 4 folds
cv_results = xgb.cv(
    dtrain=churn_dmatrix,           # Data for training
    params=params,                  # Model parameters
    nfold=4,                        # Number of cross-validation folds
    num_boost_round=10,             # Number of boosting rounds
    metrics="error",                # Evaluation metric
    as_pandas=True,                 # Return results as a DataFrame
    seed=123                        # Seed for reproducibility
)

# Calculating accuracy from cross-validation results
final_accuracy = (1 - cv_results["test-error-mean"]).iloc[-1]

# Displaying the results
print("="*30 + " RESULTS " + "="*30)
print(f"Dataset: Breast Cancer (from sklearn)")
print(f"Final Cross-Validated Accuracy: {final_accuracy:.2%}")
print("="*70)

Dataset: Breast Cancer (from sklearn)
Final Cross-Validated Accuracy: 95.82%


In [4]:
#========================= XGBOOST REGRESSION =========================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error

import xgboost as xgb

# Loading the California housing dataset
data = fetch_california_housing(as_frame=True)
X = data.data   # Features
y = data.target # Target (housing prices)

# Splitting into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

#--------------------------- METHOD 1: XGBRegressor with Decision Tree Base Learners ---------------------------
# Initializing the XGBoost regressor
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror', # Regression task
    n_estimators=10,             # Number of boosting rounds
    seed=123                     # Seed for reproducibility
)

# Fitting the model
xg_reg.fit(X_train, y_train)

# Making predictions
preds = xg_reg.predict(X_test)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("="*15 + " XGBRegressor with Decision Tree Base Learners Results " + "="*15)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print("="*85)

#--------------------------- METHOD 2: XGBRegressor with Linear Base Learners ---------------------------
# Converting data to DMatrix format for xgb.train
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

# Defining parameters for training
params = {
    "booster": "gblinear",          # Linear model as booster
    "objective": "reg:squarederror" # Regression task
}

# Training the model using xgb.train
xg_reg_dmatrix = xgb.train(
    params=params, dtrain=DM_train, num_boost_round=10
)

# Making predictions
preds_dmatrix = xg_reg_dmatrix.predict(DM_test)

# Calculating RMSE
rmse_dmatrix = np.sqrt(mean_squared_error(y_test, preds_dmatrix))
print("\n" + "="*15 + " XGBRegressor with Linear Base Learners Results " + "="*22)
print(f"Root Mean Squared Error (RMSE): {rmse_dmatrix:.4f}")
print("="*85)

Root Mean Squared Error (RMSE): 0.5378

Root Mean Squared Error (RMSE): 0.8951


In [5]:
#========================= XGBOOST REGRESSION L1 REGULARIZATION =========================
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Loading the California housing dataset
data = fetch_california_housing(as_frame=True)
X = data.data   # Features
y = data.target # Target (housing prices)

# Converting data to an XGBoost DMatrix
boston_dmatrix = xgb.DMatrix(data=X, label=y)

# Setting base parameters for the model
params = {
    "objective": "reg:squarederror", # Regression task
    "max_depth": 4                   # Maximum depth of the trees
}

# Defining different L1 regularization values (alpha)
l1_params = [1, 10, 100]

# List to store RMSE results for each alpha value
rmses_l1 = []

# Performing cross-validation for each L1 regularization value
for reg in l1_params:
    params["alpha"] = reg  # Setting L1 regularization parameter (alpha)
    cv_results = xgb.cv(
        dtrain=boston_dmatrix,      # Data for training
        params=params,              # Model parameters
        nfold=4,                    # Number of cross-validation folds
        num_boost_round=10,         # Number of boosting rounds
        metrics="rmse",             # Evaluation metric
        as_pandas=True,             # Return results as a DataFrame
        seed=123                    # Seed for reproducibility
    )
    # Extracting the final RMSE and appending to the results list
    rmses_l1.append(cv_results["test-rmse-mean"].iloc[-1])

# Creating a DataFrame to display L1 values and corresponding RMSE
results_df = pd.DataFrame(list(zip(l1_params, rmses_l1)), columns=["L1 (alpha)", "RMSE"])

# Displaying the results in a clean format
print("="*30 + " RESULTS " + "="*30)
print("Best RMSE as a function of L1 (alpha):")
print(results_df)
print("="*70)

Best RMSE as a function of L1 (alpha):
   L1 (alpha)      RMSE
0           1  0.588026
1          10  0.590160
2         100  0.626528


In [6]:
#==================== XGBOOST REGRESSION GRID AND RANDOMIZED SEARCH ====================
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import fetch_california_housing

import xgboost as xgb

# Loading the California housing dataset
data = fetch_california_housing(as_frame=True)
X = data.data   # Features
y = data.target # Target (housing prices)

#--------------------------- GRID SEARCH CV ---------------------------
print("="*30 + " GRID SEARCH CV " + "="*30)

# Defining parameter grid for GridSearchCV
gbm_param_grid = {
    "learning_rate": [0.01, 0.1, 0.5, 0.9], # Different learning rates
    "n_estimators": [200],                 # Number of trees
    "subsample": [0.3, 0.5, 0.9]           # Subsample ratios
}

# Initializing the XGBoost regressor
gbm = xgb.XGBRegressor(objective="reg:squarederror", seed=123)

# Performing GridSearchCV
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring="neg_mean_squared_error",
    cv=4,
    verbose=1
)

# Fitting the model to the data
grid_mse.fit(X, y)

# Extracting the best parameters and lowest RMSE
print("Best Parameters Found:", grid_mse.best_params_)
print("Lowest RMSE Found:", np.sqrt(np.abs(grid_mse.best_score_)))

#--------------------------- RANDOMIZED SEARCH CV ---------------------------
print("\n" + "="*30 + " RANDOMIZED SEARCH CV " + "="*24)

# Defining parameter grid for RandomizedSearchCV
gbm_param_dist = {
    "learning_rate": np.arange(0.05, 1.05, 0.05), # Learning rates
    "n_estimators": [200],                        # Number of trees
    "subsample": np.arange(0.05, 1.05, 0.05)      # Subsample ratios
}

# Performing RandomizedSearchCV
randomized_mse = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=gbm_param_dist,
    n_iter=25,  # Number of random combinations to test
    scoring="neg_mean_squared_error",
    cv=4,
    verbose=1,
    random_state=123
)

# Fitting the model to the data
randomized_mse.fit(X, y)

# Extracting the best parameters and lowest RMSE
print("Best Parameters Found:", randomized_mse.best_params_)
print("Lowest RMSE Found:", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Parameters Found: {'learning_rate': 0.1, 'n_estimators': 200, 'subsample': 0.3}
Lowest RMSE Found: 0.6587735420293971

Fitting 4 folds for each of 25 candidates, totalling 100 fits
Best Parameters Found: {'subsample': 0.5, 'n_estimators': 200, 'learning_rate': 0.05}
Lowest RMSE Found: 0.6533834295714968


In [7]:
#==================== XGBOOST REGRESSION WITH PIPELINES ====================
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Loading the California housing dataset
data = fetch_california_housing(as_frame=True)
X = data.data   # Features
y = data.target # Target (housing prices)

# Creating a pipeline with StandardScaler and XGBoost regressor
xgb_pipeline = Pipeline([
    ("st_scaler", StandardScaler()),      # Standardize the features
    ("xgb_model", xgb.XGBRegressor(seed=123)) # XGBoost regressor
])

# Performing cross-validation with RMSE as the evaluation metric
scores = cross_val_score(
    xgb_pipeline, X, y,
    scoring="neg_mean_squared_error",
    cv=10
)

# Calculating the final average RMSE
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))

# Displaying the results
print("="*30 + " CROSS-VALIDATION RESULTS " + "="*30)
print(f"Final Average RMSE (10-fold CV): {final_avg_rmse:.4f}")
print("="*86)

#--------------------------- RANDOMIZED SEARCH CV FOR TUNING ---------------------------
# Defining the parameter grid for hyperparameter tuning
gbm_param_grid = {
    'xgb_model__subsample': np.arange(0.05, 1, 0.05),
    'xgb_model__max_depth': np.arange(3, 20, 1),
    'xgb_model__colsample_bytree': np.arange(0.1, 1.05, 0.05)
}

# Performing RandomizedSearchCV to find the best parameters
randomized_neg_mse = RandomizedSearchCV(
    estimator=xgb_pipeline,          # Pipeline as the estimator
    param_distributions=gbm_param_grid,
    n_iter=10,                       # Number of random combinations to test
    scoring="neg_mean_squared_error",
    cv=4,                            # 4-fold cross-validation
    verbose=0,                       # Verbosity for output
    random_state=123                 # Reproducibility
)

# Fitting the model
randomized_neg_mse.fit(X, y)

# Calculating the best RMSE and displaying the best model
best_rmse = np.sqrt(np.abs(randomized_neg_mse.best_score_))
print("\n" + "="*30 + " RANDOMIZED SEARCH RESULTS " + "="*30)
print(f"Best RMSE: {best_rmse:.4f}")
print("Best Model Configuration:")
print(randomized_neg_mse.best_estimator_)
print("="*86)

Final Average RMSE (10-fold CV): 0.6172

Best RMSE: 0.6842
Best Model Configuration:
Pipeline(steps=[('st_scaler', StandardScaler()),
                ('xgb_model',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=0.6500000000000001, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=4, max_leaves=None,
                              min_child_weight=None, mi