In [1]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import StratifiedKFold
from scipy.spatial.distance import euclidean

In [2]:
df = pd.read_csv('../data/CTGsmt.csv', header=None)
data = df.to_numpy()
# Features matrix
X = data[:,0:22]
Y = data[:,23]
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, stratify=Y, shuffle=True, test_size = 0.25)

In [3]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.259259,0.259259,0.000000,0.000000,0.000000,0.813333,0.044118,0.472527,0.047337,0.000000,...,0.110092,0.034483,0.111111,0.000000,0.472441,0.587156,0.403670,0.271375,1.0,2.0
1,0.481481,0.481481,0.153846,0.000000,0.173913,0.066667,0.279412,0.000000,0.205128,0.125000,...,0.165138,0.655172,0.333333,0.100000,0.637795,0.577982,0.577982,0.044610,0.5,1.0
2,0.500000,0.500000,0.076923,0.000000,0.217391,0.053333,0.279412,0.000000,0.264300,0.125000,...,0.165138,0.655172,0.277778,0.100000,0.637795,0.568807,0.559633,0.048327,0.5,1.0
3,0.518519,0.518519,0.076923,0.000000,0.260870,0.053333,0.323529,0.000000,0.453649,0.125000,...,0.027523,0.413793,0.611111,0.000000,0.606299,0.559633,0.550459,0.048327,1.0,1.0
4,0.481481,0.481481,0.153846,0.000000,0.217391,0.053333,0.323529,0.000000,0.392505,0.000000,...,0.027523,0.413793,0.500000,0.000000,0.606299,0.577982,0.559633,0.040892,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4960,0.777778,0.777778,0.000000,0.002050,0.016753,0.767217,0.014706,0.819942,0.089517,0.000000,...,0.743877,0.263920,0.102446,0.000000,0.708661,0.693713,0.675364,0.000000,1.0,3.0
4961,0.518519,0.518519,0.000000,0.004269,0.000000,0.914561,0.000000,0.555522,0.099061,0.000000,...,0.722899,0.198276,0.111111,0.000000,0.590551,0.568807,0.541284,0.003717,0.5,3.0
4962,0.425926,0.425926,0.000000,0.001773,0.303076,0.719610,0.382353,0.000000,0.000000,0.380483,...,0.000000,0.191420,0.334958,0.005849,0.354331,0.110360,0.303289,0.041110,0.5,3.0
4963,0.500000,0.500000,0.000000,0.001597,0.178238,0.640000,0.388204,0.000000,0.143872,0.118783,...,0.091743,0.370690,0.283304,0.000000,0.309478,0.225708,0.270667,0.166177,0.5,3.0


In [4]:
def kernel_gaussiano(x):
    """Calcula el kernel gaussiano de x
    x: matriz/vector de numpy
    retorna: el valor de de kernel gaussiano
    """
    return np.exp((-0.5)*x**2)

In [5]:
def ErrorClas(Y_lest, Y):
    """funcion que calcula el error de clasificación
    Y_lest: numpy array con la estimaciones de etiqueta
    Y: etiquetas reales
    retorna: error de clasificación (int)
    """
    error = 1 - np.sum(Y_lest == Y)/len(Y)
    
    return error


In [6]:
def ParzenWindow(x,Data,h):
    """"ventana de parzen
    x: vector con representando una sola muestra
    Data: vector de muestras de entrenamiento
    h: ancho de la ventana de kernel
    retorna: el valor de ventana de parzen para una muestra
    """
    h = h
    Ns = Data.shape[0]
    suma = 0
    for k in range(Ns):
        u = euclidean(x,Data[k,:])
        suma += kernel_gaussiano(u/h)
    return suma


In [7]:
#Ejercicio de código
def parzenClass(X_train, Y_train, X_test, h):
    """ Funcion que implementa metodo de ventana de parzen para
        para clasificación
    X_train: es la matriz con las muestras de entrenamiento
    Y_train: es un vector con los valores de salida pra cada una de las muestras de entrenamiento
    X_test: es la matriz con las muestras de validación
    h (float): ancho de h de la ventana
    retorna: - las estimaciones del modelo parzen para el conjunto X_test 
              esta matriz debe tener un shape de [row/muestras de X_test]
             - las probabilidades de la vetana [row/muestras de X_test, numero de clases]  
    """
        
    Yest = np.zeros(X_test.shape[0])
    clases = np.unique(Y_train)
    fds_matrix = np.zeros((X_test.shape[0], len(clases)))
    
    
    ## pista: recuerde el termino que acompaña al sumatoria (N)
    
    idx = 0
    for n, sample in enumerate (X_test):
      max_prob = 0
      est_label = None
      j = 0
      for label in clases:
        indixes = np.argwhere(Y_train == label)
        train_elements = np.take(X_train, indixes.T[0], axis = 0)
        dim = train_elements.shape
        prob = ParzenWindow(sample,train_elements,h)/dim[0]
        fds_matrix[idx][j] = prob
        if prob > max_prob:
          max_prob = prob
          est_label = label
        j += 1
      Yest[idx] = est_label
      idx +=1

    #Debe retornar un vector que contenga las predicciones para cada una de las muestras en X_val, en el mismo orden.  
    return Yest, fds_matrix

In [8]:
#ejercicio de codigo
def experimentarParzen (X, Y, hs):
    """Función que realiza los experimentos con knn usando
       una estrategia de validacion entrenamiento y pruebas
    X: matriz de numpy conjunto con muestras y caracteristicas
    Y: vector de numpy con los valores de las etiquetas
    ks: List[int/float] lista con los valores de k-vecinos a usar
    retorna: dataframe con los resultados, debe contener las siguientes columnas:
        - el ancho de ventana, el error medio de prueba, la desviacion estandar del error
    """
    
    
    # se usa la función para implementar la estrategia de validación.
    skf = StratifiedKFold(n_splits=4)
    resultados = pd.DataFrame()
    idx = 0
    # iteramos sobre los valores de hs
    for h in hs:
        # lista para almacenar los errores de cada iteración
        # de la validación
        gmean = []
        f1 = []
        eficiencia_train = []
        eficiencia_test = []
        for train, test in skf.split(X, Y):

            Xtrain = X[train,:]
            Ytrain = Y[train]
            Xtest = X[test,:]
            Ytest = Y[test]            
            Yest, probabilidades =  parzenClass(Xtrain, Ytrain, Xtest, h)
            # F1
            f1_current = f1_score(y_true = Ytest, y_pred=Yest, average = "weighted")       
            f1.append(f1_current)
            # Gmean
            gmean_current = 
            geometric_mean_score(y_true = Ytest, y_pred=Yest, average="weighted")
            gmean.append(gmean_current)
            eficiencia_train.append(np.mean(Yest.ravel() == y_test.ravel()))
            eficiencia_test.append(np.mean(Yest.ravel() == y_train.ravel()))
        
        resultados.loc[idx,'ancho de ventana'] = h 
        resultados.loc[idx,'GMEAN'] = np.mean(gmean)
        resultados.loc[idx,'GMEAN IC'] = np.std(gmean)
        resultados.loc[idx,'F1 Score'] = np.mean(f1)
        resultados.loc[idx,'F1 Score IC'] = np.std(f1)
        resultados.loc[idx,'Eficiencia Train'] = np.mean(
        )
        resultados.loc[idx,'Eficiencia Train IC'] = np.std(eficiencia_train)
        resultados.loc[idx,'Eficiencia Test'] = np.mean(eficiencia_test)
        resultados.loc[idx,'Eficiencia Test IC'] = np.std(eficiencia_test)
        idx+=1
    return (resultados)

In [9]:
hs = [0.05, 0.1, 0.5, 1, 2, 5, 10]
experimentos_parzen = experimentarParzen(X,Y, hs)
experimentos_parzen

AttributeError: 'function' object has no attribute 'predict_proba'

In [None]:
plot_confusion_matrix(clf,X=X_test, y_true=y_test,normalize='true')

In [None]:
Yest = clf.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, Yest, pos_label=2)
roc_auc = auc(fpr, tpr)

In [None]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
y = label_binarize(Y, classes=[1, 2, 3])
n_classes = y.shape[1]
print(n_classes)

In [None]:
y_score = clf.predict_proba(X_test)
y_test.shape

In [None]:
y = label_binarize(y, classes=[1, 2, 3])
n_classes = y.shape[1]
n_classes

In [None]:
y_test = np.matrix(y_test)
y_test

In [None]:
y_score.shape

In [None]:
y_test = y_test.T
y_test.shape

In [None]:
y_test.shape

In [None]:
y_pred2 = clf.predict_log_proba(X_test)
score = y_pred2[:,0]-y_pred2[:,1]
tpr,fpr,_ = roc_curve(Y, score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
y_test[:, ]

In [None]:
 y_score[:, 0]