.Installing XGBoost

In [1]:
pip install xgboost



.Basic XGBoost Model

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Baseline: an XGBRegressor with library-default hyperparameters.
# NOTE(review): X_train, y_train, X_test and y_test are not created in this
# cell; they must already exist in the kernel - confirm where they come from.
# fit() returns the estimator itself, so construction and training are chained.
xgb_model = xgb.XGBRegressor().fit(X_train, y_train)

# Score the held-out test data.
y_pred = xgb_model.predict(X_test)

# Report the mean squared error between the true targets and the predictions.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

.Tuning Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 values for each of 5 parameters = 243 candidates.
# With cv=5 below, the search performs 243 * 5 fits plus one final refit.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.8, 1]
}

# Base estimator that the grid search clones for every candidate.
xgb_model = xgb.XGBRegressor()

# 5-fold cross-validated exhaustive search. scikit-learn maximises scores,
# which is why the MSE is used in its negated form.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# NOTE(review): X_train / y_train / X_test / y_test must already exist in the
# kernel; they are not created in this cell.
grid_search.fit(X_train, y_train)

# Best combination of hyperparameters found by the search.
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# GridSearchCV (refit=True by default) has already retrained a model with the
# best parameters on the whole training set, so reuse it instead of fitting
# an identical model a second time.
best_xgb_model = grid_search.best_estimator_

# Make predictions on the test data
y_pred = best_xgb_model.predict(X_test)

# Evaluate the tuned model's performance
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

.Advanced Techniques

Early Stopping


In [None]:
# Imported here because no earlier cell brings train_test_split into scope.
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a validation set. The result is bound
# to new names (X_fit / y_fit) so X_train / y_train are not overwritten and
# re-running this cell does not keep shrinking the training set.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Stop adding trees once the validation metric has not improved for 10 rounds.
# early_stopping_rounds is an estimator parameter (XGBoost >= 1.6); passing it
# to fit() was deprecated and then removed in XGBoost 2.0.
xgb_model = xgb.XGBRegressor(early_stopping_rounds=10)
xgb_model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])

Custom Loss Function

In [None]:
import numpy as np


# Define custom objective function (example: Pseudo-Huber loss)
def huber_approx_obj(y_true, y_pred):
    """Pseudo-Huber objective for the XGBoost scikit-learn API.

    A custom objective must return the first and second derivative of the
    loss with respect to the prediction, not the loss value itself.

    Parameters
    ----------
    y_true : array-like
        Target values.
    y_pred : array-like
        Current model predictions.

    Returns
    -------
    grad : ndarray
        Gradient of the loss for every sample.
    hess : ndarray
        Second derivative of the loss for every sample.
    """
    d = y_pred - y_true
    delta = 1.0  # residual size at which the loss turns from quadratic to linear
    scale = 1 + (d / delta) ** 2
    scale_sqrt = np.sqrt(scale)
    # loss = delta^2 * (sqrt(1 + (d/delta)^2) - 1)
    grad = d / scale_sqrt
    hess = 1 / (scale * scale_sqrt)
    return grad, hess


# Fit the model with the custom objective. The scikit-learn wrapper takes the
# callable through the `objective` parameter.
xgb_model = xgb.XGBRegressor(objective=huber_approx_obj)
xgb_model.fit(X_train, y_train)

Feature Importance

In [None]:
# numpy is needed for argsort below; import it here because no earlier cell
# brings `np` into scope on a fresh kernel.
import numpy as np

# Fit the model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

# One importance score per input feature, in column order.
feature_importances = xgb_model.feature_importances_

# Display feature importances
for i, importance in enumerate(feature_importances):
    print(f"Feature {i}: {importance}")

# Select the most important features (example: top 5).
# argsort is ascending, so the last `top_n` indices are the highest scores.
top_n = 5
most_important_indices = np.argsort(feature_importances)[-top_n:]
# NOTE(review): positional column indexing assumes X_train / X_test are
# 2-D numpy arrays - confirm against the cell that creates them.
X_train_selected = X_train[:, most_important_indices]
X_test_selected = X_test[:, most_important_indices]

Regularization

In [None]:
# Fit the model with L1 and L2 regularization.
# `lambda` is a reserved word in Python and cannot be used as a keyword
# argument, so the scikit-learn wrapper exposes the two penalties as
# reg_alpha (L1) and reg_lambda (L2).
xgb_model = xgb.XGBRegressor(reg_alpha=1, reg_lambda=1)
xgb_model.fit(X_train, y_train)

Stacking

In [None]:
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge

# Base learners whose predictions are fed to the final estimator.
estimators = [('xgb', xgb.XGBRegressor()), ('rf', RandomForestRegressor())]

# Stack the base learners under a Ridge regressor and train the whole stack;
# fit() returns the estimator, so it is chained onto the constructor.
stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(),
).fit(X_train, y_train)

# Predict on the held-out test data.
y_pred = stacking_regressor.predict(X_test)

# Report the mean squared error of the stacked model.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

.What kind of problems can XGBoost solve?

Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Load the dataset.
# load_boston was removed in scikit-learn 1.2; the California housing data is
# used as the regression dataset instead (downloaded and cached on first use).
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of XGBRegressor with a squared-error objective
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=3, learning_rate=0.1)

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Binary classification

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb

# Breast-cancer dataset: feature matrix and class labels.
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the classifier and train it; fit() returns the estimator itself.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
).fit(X_train, y_train)

# Predicted class labels for the test set.
y_pred = xgb_model.predict(X_test)

# Accuracy, confusion matrix and classification report.
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)

Multiclass classification

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb

# Iris dataset: feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the classifier and train it; fit() returns the estimator itself.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
).fit(X_train, y_train)

# Predicted class labels for the test set.
y_pred = xgb_model.predict(X_test)

# Accuracy, confusion matrix and classification report.
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)

Ranking

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import ndcg_score

# Load the dataset.
# query_id=True is required: without it load_svmlight_file returns only
# (X, y) and indexing element [2] raises an IndexError.
train_data = load_svmlight_file('mq2008.train', query_id=True)
X_train, y_train, query_train = train_data

test_data = load_svmlight_file('mq2008.test', query_id=True)
X_test, y_test, query_test = test_data

# Convert query ids to group sizes
def query_ids_to_groups(query_ids):
    """Return the size of every run of consecutive, equal query ids.

    Sizes are reported in the order the runs appear in ``query_ids`` so they
    line up with the rows of the feature matrix. Counting with ``np.unique``
    would instead order them by sorted query id, which misaligns the groups
    whenever the ids are not already sorted.

    Parameters
    ----------
    query_ids : array-like
        One query id per row; rows of the same query are expected to be
        adjacent.

    Returns
    -------
    ndarray of int
        Number of rows in each consecutive group (empty for empty input).
    """
    query_ids = np.asarray(query_ids)
    if query_ids.size == 0:
        return np.array([], dtype=int)
    # Positions where the id differs from the previous row start a new group.
    boundaries = np.flatnonzero(query_ids[1:] != query_ids[:-1]) + 1
    edges = np.concatenate(([0], boundaries, [query_ids.size]))
    return np.diff(edges)

# Number of documents per query, for the training and the test data.
train_groups = query_ids_to_groups(query_train)
test_groups = query_ids_to_groups(query_test)

# Create an instance of XGBRanker
xgb_ranker = xgb.XGBRanker(objective='rank:pairwise', n_estimators=100, max_depth=3, learning_rate=0.1)

# Fit the model on the training data
xgb_ranker.fit(X_train, y_train, group=train_groups)

# Make predictions on the test data
y_pred = xgb_ranker.predict(X_test)

# Evaluate the model's performance.
# ndcg_score(y_true, y_score) scores one ranking per row, so NDCG is computed
# separately for every query and then averaged over queries.
ndcg_per_query = []
start = 0
for size in test_groups:
    stop = start + size
    # NDCG is only defined for rankings of at least two documents.
    if size > 1:
        ndcg_per_query.append(
            ndcg_score([y_test[start:stop]], [y_pred[start:stop]])
        )
    start = stop

ndcg = float(np.mean(ndcg_per_query)) if ndcg_per_query else float('nan')
print("Normalized Discounted Cumulative Gain (NDCG):", ndcg)

Feature selection

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Breast-cancer dataset: feature matrix and class labels.
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the classifier and train it; fit() returns the estimator itself.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
).fit(X_train, y_train)

# One importance score per input column.
feature_importances = xgb_model.feature_importances_

# Print every feature index with its score.
for i, importance in enumerate(feature_importances):
    print(f"Feature {i}: {importance}")

# Keep the top_n highest-scoring columns; argsort is ascending, so the
# largest scores sit at the end of the index array.
top_n = 5
most_important_indices = np.argsort(feature_importances)[-top_n:]
X_train_selected = X_train[:, most_important_indices]
X_test_selected = X_test[:, most_important_indices]

Imbalanced datasets

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb

# Synthetic binary dataset in which class 0 is given a weight of 0.99.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=42,
)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ratio of negative to positive training samples, passed to the classifier
# as scale_pos_weight.
positive_class = np.sum(y_train == 1)
negative_class = np.sum(y_train == 0)
scale_pos_weight = negative_class / positive_class

# Build the weighted classifier and train it; fit() returns the estimator.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
).fit(X_train, y_train)

# Predicted class labels for the test set.
y_pred = xgb_model.predict(X_test)

# Accuracy, confusion matrix and classification report.
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)

Ensemble learning

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Breast-cancer dataset: feature matrix and class labels.
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The three member classifiers of the ensemble.
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Soft-voting ensemble over the three members, trained on the training split
# (fit() returns the estimator, so it is chained onto the constructor).
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('logistic', logistic_model),
    ],
    voting='soft',
).fit(X_train, y_train)

# Predicted class labels for the test set.
y_pred = ensemble_model.predict(X_test)

# Accuracy, confusion matrix and classification report of the ensemble.
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)

Time series forecasting

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read the airline-passengers CSV from its URL and convert the 'Month'
# column from text to datetime values.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv"
data = pd.read_csv(url)
data = data.assign(Month=pd.to_datetime(data['Month']))

# Create lag features
def create_lag_features(df, n_lags, column='Passengers'):
    """Return a copy of ``df`` with lagged versions of one column appended.

    For every ``i`` from 1 to ``n_lags`` a column ``lag_i`` is added that
    holds ``column`` shifted down by ``i`` rows, so the first ``i`` entries
    of ``lag_i`` are missing. The input frame is left untouched, which keeps
    the calling cell idempotent and avoids writing into a slice of another
    frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    n_lags : int
        Number of lag columns to create; values below 1 add nothing.
    column : str, default 'Passengers'
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by ``lag_1 .. lag_n``.
    """
    lagged = df.copy()
    for i in range(1, n_lags + 1):
        lagged[f'lag_{i}'] = lagged[column].shift(i)
    return lagged

n_lags = 3
# Add the lag columns, then drop the rows that contain missing values.
data = create_lag_features(data, n_lags).dropna()

# Chronological split: rows before 1958 are for training, the rest for testing.
split_date = '1958-01-01'
train_data = data[data['Month'] < split_date]
test_data = data[data['Month'] >= split_date]

# The remaining columns (everything except the date and the target) are the
# model inputs.
X_train = train_data.drop(columns=['Month', 'Passengers'])
y_train = train_data['Passengers']

X_test = test_data.drop(columns=['Month', 'Passengers'])
y_test = test_data['Passengers']

# Build the regressor and train it; fit() returns the estimator itself.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
).fit(X_train, y_train)

# Forecast the test period.
y_pred = xgb_model.predict(X_test)

# Root mean squared error of the forecast.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# Plot the actual and the predicted series against the month.
for series, label in ((y_test, "Actual"), (y_pred, "Predicted")):
    plt.plot(test_data['Month'], series, label=label)
plt.xlabel("Month")
plt.ylabel("Passengers")
plt.legend()
plt.show()

.Complete Use Case: Predicting House Prices

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import xgboost as xgb  # imported here so the cell does not rely on an earlier cell
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the dataset.
# load_boston was removed in scikit-learn 1.2; the California housing data is
# used as the house-price dataset instead (downloaded and cached on first use).
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of XGBRegressor
xgb_model = xgb.XGBRegressor()

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)