# Maestría en Maestría en Ciencia de Datos e Inteligencia Artificial
#### 8. Machine Learning and Deep Learning
#### Docente: Msc. Renzo Claure Aracena.

### *XGBoost para regresión*

In [None]:
import os

In [None]:
os.getcwd()

In [None]:
#usaremos el set de datos de California Housing

In [1]:
from sklearn.datasets import fetch_california_housing
california_housing = fetch_california_housing()
print(california_housing.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])


In [2]:
y = california_housing.target

In [3]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [4]:
X = california_housing.data

In [5]:
X.shape

(20640, 8)

In [6]:
california_housing.DESCR

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n:Number of Instances: 20640\n\n:Number of Attributes: 8 numeric, predictive attributes and the target\n\n:Attribute Information:\n    - MedInc        median income in block group\n    - HouseAge      median house age in block group\n    - AveRooms      average number of rooms per household\n    - AveBedrms     average number of bedrooms per household\n    - Population    block group population\n    - AveOccup      average number of household members\n    - Latitude      block group latitude\n    - Longitude     block group longitude\n\n:Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000).\n\nThis dataset was derived from the 1990 U.S

In [7]:
#importamos las librerias necesarias
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_aarch64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-manylinux_2_28_aarch64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m848.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.4
[0m

In [8]:
#XG es una libreria separada de scikit learn
import xgboost as xgb

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
#XGBoost tiene su ´propio formato de estructura de datos

In [11]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [12]:
#Ahora definimos los hiperparámetros
params = {
    'objective': 'reg:squarederror',  #Error cuadrático medio (MSE)
    'eval_metric': 'rmse',  #Raíz del error cuadrático medio (RMSE)
    'max_depth': 4,  #Profundidad de los árboles
    'eta': 0.1,  #Ratiode aprendizaje
    'subsample': 0.8,  #Porcentaje de datos que se usan en cada iteración
    'colsample_bytree': 0.8  #Porcentaje de característicaaas usadas por cada árbol
}

In [13]:
#para entrenar al modelo
model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtest, 'test')], early_stopping_rounds=10)

[0]	test-rmse:1.08539
[1]	test-rmse:1.04617
[2]	test-rmse:1.01359
[3]	test-rmse:0.96428
[4]	test-rmse:0.92251
[5]	test-rmse:0.89603
[6]	test-rmse:0.86197
[7]	test-rmse:0.83060
[8]	test-rmse:0.80544
[9]	test-rmse:0.78337
[10]	test-rmse:0.76261
[11]	test-rmse:0.74615
[12]	test-rmse:0.73296
[13]	test-rmse:0.71907
[14]	test-rmse:0.70719
[15]	test-rmse:0.69434
[16]	test-rmse:0.68455
[17]	test-rmse:0.67849
[18]	test-rmse:0.67005
[19]	test-rmse:0.65789
[20]	test-rmse:0.65183
[21]	test-rmse:0.63647
[22]	test-rmse:0.62264
[23]	test-rmse:0.61805
[24]	test-rmse:0.60733
[25]	test-rmse:0.60431
[26]	test-rmse:0.60081
[27]	test-rmse:0.59225
[28]	test-rmse:0.58967
[29]	test-rmse:0.58221
[30]	test-rmse:0.57822
[31]	test-rmse:0.57525
[32]	test-rmse:0.57367
[33]	test-rmse:0.57038
[34]	test-rmse:0.56807
[35]	test-rmse:0.56435
[36]	test-rmse:0.56242
[37]	test-rmse:0.56048
[38]	test-rmse:0.55727
[39]	test-rmse:0.55632
[40]	test-rmse:0.55358
[41]	test-rmse:0.55251
[42]	test-rmse:0.55199
[43]	test-rmse:0.5500

In [None]:
#Evaluar el modelo

In [14]:
#Prediccion del set de comprobacion
y_pred = model.predict(dtest)

#Calcular métricas de rendimiento
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 0.2535126398685548
Mean Absolute Error (MAE): 0.34542839056713354
R-squared (R2): 0.8082138560250071


### *XGBoost para clasificacion*

In [21]:
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

#Ejmplo de cancer
data = load_breast_cancer()
X = data.data
y = data.target


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Crear el clasificador XGBoost con scikit-learn
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [17]:
# predicciones
y_pred = model.predict(X_test)

#métricas
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Mostrar los resultados
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# Informe de clasificación completo
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Matriz de confusión
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Recall: 0.971830985915493
Precision: 0.9583333333333334
F1 Score: 0.965034965034965

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


Confusion Matrix:
 [[40  3]
 [ 2 69]]


In [None]:
#Podemos usar tb Gridsearch para la optimización de parámetros

In [18]:
#parámetros a buscar
param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

In [19]:
#con GridSearchCV para encontrar los mejores parámetros
modelo = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(estimator = modelo,
                           param_grid=param_grid,
                           scoring='f1',
                           cv=5,
                           verbose=1)

grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [20]:

print("Mejores parámetros: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

#Predecir con el mejor modelo
y_pred_best = best_model.predict(X_test)

# Evaluar el mejor modelo
recall_best = recall_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)

print("\nBest Model Recall:", recall_best)
print("Best Model Precision:", precision_best)
print("Best Model F1 Score:", f1_best)


Mejores parámetros:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}

Best Model Recall: 0.971830985915493
Best Model Precision: 0.9583333333333334
Best Model F1 Score: 0.965034965034965
