# Maestría en Ciencia de Datos e Inteligencia Artificial
#### 8. Machine Learning and Deep Learning
#### Docente: Msc. Renzo Claure Aracena.

### *XGBoost para regresión*

In [None]:
import os

In [None]:
# Show the process's current working directory as the cell output.
os.getcwd()

In [None]:
#usaremos el set de datos de California Housing

In [None]:
# Load the California Housing dataset through scikit-learn's fetch helper.
from sklearn.datasets import fetch_california_housing
california_housing = fetch_california_housing()
# List the fields available on the returned dataset object.
print(california_housing.keys())

In [None]:
# Target values for the regression task.
y = california_housing.target

In [None]:
# Display the target values as the cell output.
y

In [None]:
# Feature matrix for the regression task.
X = california_housing.data

In [None]:
# Display the dimensions of the feature matrix.
X.shape

In [None]:
# DESCR is a long multi-line string. Print it so its line breaks are rendered,
# instead of the escaped repr that a bare expression would display.
print(california_housing.DESCR)

In [None]:
#importamos las librerias necesarias
#pip install xgboost

In [None]:
#XGBoost es una librería separada de scikit-learn
import xgboost as xgb

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Split into train and test partitions. No test size is given, so
# scikit-learn's default proportion applies; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
#XGBoost tiene su propio formato de estructura de datos

In [None]:
# Wrap each partition (features plus labels) in a DMatrix, the data container
# used by XGBoost's native training API.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Hyperparameters passed to xgb.train.
params = {
    'objective': 'reg:squarederror',  # Squared-error regression loss
    'eval_metric': 'rmse',  # Root mean squared error (RMSE), reported during training
    'max_depth': 4,  # Maximum depth of each tree
    'eta': 0.1,  # Learning rate
    'subsample': 0.8,  # Fraction of training rows sampled in each boosting iteration
    'colsample_bytree': 0.8  # Fraction of features sampled for each tree
}

In [None]:
# Train the booster with early stopping.
# Early stopping needs a held-out set to monitor. Monitoring the test set would
# let it decide when training stops and would bias the test metrics reported
# afterwards, so a validation subset is carved out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)
dfit = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)

# Up to 100 boosting rounds; stop once the validation RMSE has not improved
# for 10 consecutive rounds.
model = xgb.train(
    params,
    dfit,
    num_boost_round=100,
    evals=[(dval, 'validation')],
    early_stopping_rounds=10,
)

In [None]:
#Evaluar el modelo

In [None]:
# Predict on the test set.
# xgb.train returns the booster from the last boosting round, and the native
# Booster.predict uses every tree by default. Restrict prediction to the rounds
# up to (and including) the best iteration recorded by early stopping.
y_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

# Compute the performance metrics on the test set.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

### *XGBoost para clasificación*

In [None]:
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Breast cancer example: load the dataset, then take its feature matrix and
# class labels. Note that X and y are rebound here for the classification task.
data = load_breast_cancer()
X, y = data.data, data.target


In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the classifier through XGBoost's scikit-learn compatible wrapper,
# using log loss as its evaluation metric.
model = xgb.XGBClassifier(eval_metric='logloss')

# Fit on the training partition.
model.fit(X_train, y_train)

In [None]:
# Predict class labels for the test partition.
y_pred = model.predict(X_test)

# Score the predictions with each metric, in the order they are reported.
recall, precision, f1 = (
    scorer(y_test, y_pred)
    for scorer in (recall_score, precision_score, f1_score)
)

# Report the individual scores.
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# Full per-class classification report.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix of true versus predicted labels.
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
#También podemos usar GridSearchCV para la optimización de hiperparámetros

In [None]:
# Hyperparameter grid to search: 3 values for each of 3 parameters,
# i.e. 27 candidate combinations.
param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranking the
# candidates by F1 score.
modelo = xgb.XGBClassifier(eval_metric='logloss')
grid_search = GridSearchCV(
    estimator=modelo,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
)

# Run the search on the training partition only.
grid_search.fit(X_train, y_train)


In [None]:
# Report the winning hyperparameter combination.
print("Mejores parámetros: ", grid_search.best_params_)

# Estimator exposed by the search for the best combination found.
best_model = grid_search.best_estimator_

# Predict on the test partition with the best model.
y_pred_best = best_model.predict(X_test)

# Score the best model with the same metrics used for the baseline classifier.
recall_best, precision_best, f1_best = (
    scorer(y_test, y_pred_best)
    for scorer in (recall_score, precision_score, f1_score)
)

print("\nBest Model Recall:", recall_best)
print("Best Model Precision:", precision_best)
print("Best Model F1 Score:", f1_best)
