In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('../data/CTGsmt.csv', header=None)
data = df.to_numpy()
# Features matrix
X = data[:,0:22]
Y = data[:,23]

In [5]:
def experimentar_gbt(num_trees, X, Y):
    """funcion que realiza experimentos de arboles de decision
    num_trees: list[int] lista con el número de arboles usado para el RF
    X: matriz con las caractersiticas
    Y: matriz de numpy con etiquetas
    retorna: dataframe con:
        - numero de arboles usados
        - eficiencia de entrenamiento
        - desviacion de estandar eficiencia de entrenamiento
        - eficiencia de prueba
        - desviacion estandar eficiencia de prueba
    """
    folds = 4
    skf = StratifiedKFold(n_splits=folds)
    resultados = pd.DataFrame()
    idx = 0
    for trees in num_trees:
        ## para almacenar los errores intermedios
        EficienciaTrain = []
        EficienciaVal = []
        for train, test in skf.split(X, Y):
            Xtrain = X[train,:]
            Ytrain = Y[train]
            Xtest = X[test,:]
            Ytest = Y[test]
            #Haga el llamado a la función para crear y entrenar el modelo usando los datos de entrenamiento
            modelo= GradientBoostingClassifier(n_estimators=trees)
            modelo.fit(Xtrain, Ytrain)
            #predecir muestras de entrenamiento
            Ytrain_pred =  modelo.predict(Xtrain)
            #predecir muestras de pruebas
            Yest = modelo.predict(Xtest)
            #Evaluamos las predicciones del modelo con los datos de test
            EficienciaTrain.append(np.mean(Ytrain_pred.ravel() == Ytrain.ravel()))
            EficienciaVal.append(np.mean(Yest.ravel() == Ytest.ravel()))
            # Gmean
            Yest = modelo.predict(Xtest)
            gmean = geometric_mean_score(y_true = Ytest, y_pred=Yest, average="weighted")
            Ypred_proba = modelo.predict_proba(Xtest)
            # ROC
            roc_score = roc_auc_score(Ytest, Ypred_proba, multi_class="ovr", average="weighted")
            # F1
            f1 = f1_score(y_true = Ytest, y_pred=Yest, average = "weighted")

        resultados.loc[idx,'número de arboles'] = trees
        resultados.loc[idx,'GMEAN'] = gmean
        resultados.loc[idx,'F1 Score'] = f1
        resultados.loc[idx,'ROC curve'] = roc_score
        resultados.loc[idx,'desviacion estandar entrenamiento'] = np.std(EficienciaTrain)
        resultados.loc[idx,'eficiencia de prueba'] = np.mean(EficienciaVal)
        resultados.loc[idx,'desviacion estandar prueba'] = np.std(EficienciaVal)
        idx= idx +1
        
    return (resultados)

In [6]:
arboles = [5,10,20,50,100, 150]
resultados_gbt = experimentar_gbt(arboles, X, Y)
resultados_gbt

Unnamed: 0,número de arboles,GMEAN,F1 Score,ROC curve,desviacion estandar entrenamiento,eficiencia de prueba,desviacion estandar prueba
0,5.0,0.867761,0.817409,0.942799,0.010486,0.881771,0.036201
1,10.0,0.875769,0.826838,0.952265,0.008242,0.899294,0.038573
2,20.0,0.882525,0.835262,0.95113,0.00582,0.912586,0.04001
3,50.0,0.889887,0.844572,0.959057,0.003919,0.923665,0.04219
4,100.0,0.892951,0.848245,0.967714,0.002968,0.928096,0.04204
5,150.0,0.894791,0.850542,0.969875,0.001547,0.932124,0.043009
