In [5]:
import pandas as pd
import numpy as np
from IPython.display import display_html
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from utils import Utils
from sklearn.ensemble import GradientBoostingClassifier
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE

In [6]:
ut = Utils()
X, Y = ut.get_matrixes()

In [7]:
#ejercicio de código
def entrenamiento_sin_seleccion_caracteristicas(num_trees, X, Y):
    """
    Función que ejecuta el entrenamiento del modelo sin una selección particular
    de las características

      Parámetros:splits : numero de particiones  a realizar
      Retorna:
      1. El modelo entreando
      2. El vector de errores
      3. El Intervalo de confianza
      4. El tiempo de procesamiento
    """
    #Implemetamos la metodología de validación
    EficienciaTrain = []
    EficienciaVal = []
    f1 = [] 
    gmean =[]
    j = 0
    kf = StratifiedKFold(n_splits=4)
    for train, test in kf.split(X, Y):
        Xtrain, Ytrain, Xtest, Ytest = ut.get_training_test(X=X, Y=Y, train=train, test=test)
        
        #Creamos el clasificador SVM.
        clf = GradientBoostingClassifier(n_estimators=num_trees)
        clf.fit(Xtrain, Ytrain)
        f1_score_current, gmean_current, eficiencia_train_current, eficiencia_test_current = ut.get_metrics(model=clf, X_train=Xtrain, X_test=Xtest, y_train=Ytrain, Ytest=Ytest)
        tiempo_i = time.time()
        EficienciaTrain.append(eficiencia_train_current)
        EficienciaVal.append(eficiencia_test_current)
        gmean.append(gmean_current)           
        f1.append(f1_score_current)

    f1_ = np.mean(f1)
    f1_std = np.std(f1)
    gmean_ = np.mean(gmean)
    gmean_std = np.std(gmean)
    EficienciaVal_ = np.mean(EficienciaVal)
    EficienciaVal_std = np.std(EficienciaVal)
    EficienciaTrain_ = np.mean(EficienciaTrain)
    Eficienciatrain_std = np.std(EficienciaTrain)
    
    print("F1STD= " + str(f1_std))
    
    
    return clf, f1_, f1_std, gmean_, gmean_std, EficienciaVal_, EficienciaVal_std, EficienciaTrain_, Eficienciatrain_std

In [8]:
#ejercicio de código
def recursive_feature_elimination_wrapper(estimator, feature_numbers, X,Y):
    """
    Esta función es un envoltorio del objeto RFE de sklearn

    Parámetros:
    estimator(sklearn.svm.SVC), El estimador SVM
    feature_numbers(int), El número de características a considerar
    X (numpy.array), El arreglo numpy de características
    Y (numpy.array), El vector de etiquetas

    Retorna:
    El modelo entrenado ()
    La máscara de características seleccionada, array [longitud de caracterisitcas de X]
    El rankeo de características, array [longitud de caracterisitcas de X]
    El objeto RFE entrenado sobre el set reducido de características
    El tiempo de ejecución
    """
    rfe = RFE(estimator=estimator, n_features_to_select=feature_numbers, step=1)
    tiempo_i = time.time()
    rfe.fit(X=X, y=Y)
    time_o = time.time()-tiempo_i
    feature_mask = rfe.support_
    features_rank = rfe.ranking_
    estimator = rfe.estimator_
    return rfe, feature_mask, features_rank, estimator, time_o

In [9]:
#ejercicio de código
def experimentar(n_feats, n_sets, X, Y):
    """
    Esta función realiza la comparación del desempeño de RFE utilizando diferente 
    número de feats y particionando el conjunto de datos en diferente número de 
    subconjuntos

    Parámetros:
    X (numpy.array), El arreglo numpy de características
    Y (numpy.array), El vector de etiquetas
    n_feats, Vector de números enteros que indica el número de características
              que debe utilizar el modelo
    n_sets, Vector de números enteros que indica el número de particiones

    Retorna:  
    - DataFrame con las columnas: CON_SEL, NUM_VAR, NUM_SPLITS, ERROR_VALIDACION, IC_STD_VALIDACION, 
    y T_EJECUCION. 

    """
    df = pd.DataFrame()
    idx = 0
    for split_number in n_sets:
        clf, f1_, f1_std, gmean_, gmean_std, EficienciaVal_, EficienciaVal_std, EficienciaTrain_, Eficienciatrain_std = entrenamiento_sin_seleccion_caracteristicas(split_number,X,Y)        
        df.loc[idx,'CON_SEL'] = 'NO'
        df.loc[idx,'NUM_VAR'] = X.shape[1]
        df.loc[idx,'NUM_SPLITS'] = split_number
        df.loc[idx,'F1 Score'] = f1_
        df.loc[idx,'F1 Score IC'] = f1_std
        df.loc[idx,'Gmean'] = gmean_
        df.loc[idx,'Gmean IC'] = gmean_std
        df.loc[idx,'Eficiencia Test'] = EficienciaVal_
        df.loc[idx,'Eficiencia Test IC'] = EficienciaVal_std
        df.loc[idx,'Eficiencia Train'] = EficienciaTrain_
        df.loc[idx,'Eficiencia Train IC'] = Eficienciatrain_std
        idx+=1
    print("termina experimentos sin selección")
    #Con selección de características
    
    for f in n_feats:
        for split_number in n_sets:
            #Implemetamos la metodología de validación 
            EficienciaTrain = []
            EficienciaVal = []
            f1 = [] 
            gmean =[]
            times = np.ones(split_number)
            kf = StratifiedKFold(n_splits=split_number)
            j = 0
            for train_index, test_index in kf.split(X, Y):
                
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = Y[train_index], Y[test_index]
                
                clf =  GradientBoostingClassifier(n_estimators=50)
                
                # se ignorar las otras salidas
                y_pred = rfe.predict(X_test)
                print(y_pred)
                rfe, _, _, _, t = recursive_feature_elimination_wrapper(estimator=clf,feature_numbers=f, X=X_train,Y=y_train)
                
                #clf.fit(X_train, y_train)
                
                
                f1_score_current, gmean_current, eficiencia_train_current, eficiencia_test_current = ut.get_metrics(model=y_pred, X_train=X_train, X_test=X_test, y_train=y_train, Ytest=y_test)
                EficienciaTrain.append(eficiencia_train_current)
                EficienciaVal.append(eficiencia_test_current)
                f1.append(f1_score_current)
                gmean.append(gmean_current)
                times[j] = t
                j+=1

            df.loc[idx,'CON_SEL'] = 'SI'
            df.loc[idx,'NUM_VAR'] = f
            df.loc[idx,'NUM_SPLITS'] = split_number
            df.loc[idx,'F1 Score'] = np.mean(f1)
            df.loc[idx, 'F1 Score IC'] = np.std(f1)
            df.loc[idx, 'Gmean'] = np.mean(gmean)
            df.loc[idx, 'Gmean IC'] = np.std(gmean)
            df.loc[idx, 'Eficiencia Test'] = np.mean(EficienciaVal)
            df.loc[idx, 'Eficiencia Test IC'] = np.std(EficienciaVal)
            df.loc[idx, 'Eficiencia Train'] = np.mean(EficienciaTrain)
            df.loc[idx, 'Eficiencia Train IC'] = np.std(EficienciaTrain)
            idx+=1
    return df


In [10]:
dfr = experimentar(n_feats = [3, 5, 10,15,20], n_sets = [5, 10], X= X, Y=Y)

F1STD= 0.039788385041631354
F1STD= 0.041962503201004184
termina experimentos sin selección


AttributeError: 'numpy.ndarray' object has no attribute 'predict'

In [7]:
dfr

Unnamed: 0,CON_SEL,NUM_VAR,NUM_SPLITS,F1 Score,F1 Score IC,Gmean,Gmean IC,Eficiencia Test,Eficiencia Test IC,Eficiencia Train,Eficiencia Train IC
0,NO,22.0,5.0,0.88,0.039788,0.910633,0.027727,0.881569,0.036516,0.922525,0.010486
1,NO,22.0,10.0,0.897438,0.042359,0.924065,0.029244,0.899294,0.038573,0.935818,0.008242
2,SI,3.0,5.0,0.93072,0.050104,0.949317,0.03509,0.932729,0.046409,0.975579,0.004504
3,SI,3.0,10.0,0.932342,0.062288,0.951118,0.041169,0.935134,0.054313,0.973638,0.00281
4,SI,5.0,5.0,0.93052,0.050158,0.949164,0.035142,0.932528,0.046477,0.975579,0.004504
5,SI,5.0,10.0,0.932337,0.062364,0.951118,0.041236,0.935134,0.054402,0.973683,0.002777
6,SI,10.0,5.0,0.93052,0.050158,0.949164,0.035142,0.932528,0.046477,0.975579,0.004504
7,SI,10.0,10.0,0.932538,0.062494,0.95127,0.04134,0.935335,0.054541,0.973683,0.002777
8,SI,15.0,5.0,0.93052,0.050158,0.949164,0.035142,0.932528,0.046477,0.975579,0.004504
9,SI,15.0,10.0,0.932744,0.062541,0.951421,0.041372,0.935536,0.054582,0.973683,0.002777


In [1]:
Xtrain, Ytrain, Xtest, Ytest = ut.get_training_test(X=X, Y=Y, train=train, test=test)

NameError: name 'ut' is not defined

In [4]:
clf =  GradientBoostingClassifier(n_estimators=50)


In [None]:
rfe, _, _, _, t = recursive_feature_elimination_wrapper(estimator=clf,feature_numbers=f, X=X_train,Y=y_train)