# Loading and Exploring the Data

In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb

warnings.filterwarnings('ignore')
diamonds = sns.load_dataset('diamonds')
diamonds.head()

# in real-world datasets, need to explore, clean, and visualize the dataset first
# here, 5-number summary of the numeric and categorial features built-in to seaborn
diamonds.describe(exclude = np.number)

ModuleNotFoundError: No module named 'xgboost'

# How to Build an XGBoost DMatrix

In [12]:
from sklearn.model_selection import train_test_split

# goal: predict diamond prices using their physical measurements, so target will be the price column
# candidate features are isolated into X and target labels into y

# extract feature and target arrays
X, y = diamonds.drop('price', axis=1), diamonds[['price']]

# this dataset has three categorical columns. normally would encode with ordinal or one-hot encoding
# XGBoost as the ability to internally deal with categoricals by casting to pandas "category" data type

# extract text features
cats = X.select_dtypes(exclude = np.number).columns.tolist()

# convert to pandas category
for col in cats:
    X[col] = X[col].astype('category')

# should get three category features when printing dtypes attribute:
print(X.dtypes)

# split the data into train and test sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# create regression matrices
import xgboost as xgb
print(xgb.__version__)
build_info = xgb.build_info()
for name in sorted(build_info.keys()):
    print(f'{name}: {build_info[name]}')

dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical = True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical = True)

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object
2.1.4
BUILTIN_PREFETCH_PRESENT: True
CLANG_VERSION: [15, 0, 0]
DEBUG: False
MM_PREFETCH_PRESENT: False
USE_CUDA: False
USE_DLOPEN_NCCL: False
USE_FEDERATED: False
USE_NCCL: False
USE_OPENMP: True
USE_RMM: False
libxgboost: /Users/jk755/Library/Python/3.9/lib/python/site-packages/xgboost/lib/libxgboost.dylib


# Python XGBoost Regression

**After building the DMatrices, need to choose a value for the `objective` parameter. This tells XGBoost the machine learning problem to be solved and what metrics or loss functions to use to solve that problem.**

## Training

The chosen objective function and any other hyperparameters of XGBoost should be specified in a dictionary, which by convention should be called params.

Inside these initial `params`, also set `tree_method` to `gpu_hist`, which enables GPU acceleration. If no GPU, can omit the parameter or set it to `hist`.

Then, set another parameter called `num_boost_round`, which stands for number of boosting rounds. Internally, XGBoost minimizes the loss function RMSE in small incremental rounds; this parameter specifies the number of those rounds.

Ideal number of rounds is usually found through hyperparameter tuning.

In [16]:
# define hyperparameters
params = {'objective': 'reg:squarederror', 'tree_method': 'gpu_hist'}

n = 100 
model = xgb.train(
    params = params,
    dtrain = dtrain_reg, 
    num_boost_round = n,
)

XGBoostError: [16:37:05] /Users/runner/work/xgboost/xgboost/src/gbm/../common/common.h:174: XGBoost version not compiled with GPU support.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000282820428 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x00000002829ec3f8 xgboost::gbm::GBTree::Configure(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>> const&) + 1276
  [bt] (2) 3   libxgboost.dylib                    0x0000000282a0b54c xgboost::LearnerConfiguration::Configure() + 1272
  [bt] (3) 4   libxgboost.dylib                    0x0000000282a0b79c xgboost::LearnerImpl::UpdateOneIter(int, std::__1::shared_ptr<xgboost::DMatrix>) + 128
  [bt] (4) 5   libxgboost.dylib                    0x0000000282842b34 XGBoosterUpdateOneIter + 144
  [bt] (5) 6   libffi.dylib                        0x00000001a1a29050 ffi_call_SYSV + 80
  [bt] (6) 7   libffi.dylib                        0x00000001a1a31af8 ffi_call_int + 1208
  [bt] (7) 8   _ctypes.cpython-39-darwin.so        0x00000001053733cc PyInit__ctypes + 25392
  [bt] (8) 9   _ctypes.cpython-39-darwin.so        0x000000010536bed8 _ctypes.cpython-39-darwin.so + 16088

