In [19]:
from __future__ import division
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
from sklearn.decomposition import PCA
import pandas as pd
import math
from numpy import random
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize as norm
from sklearn.metrics import roc_curve, roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import f1_score

In [12]:
# Read the 'Raw Data' sheet of the CTG workbook. Row 0 supplies the column
# names and file row 1 is skipped.
df = pd.read_excel(
    '../data/CTG.xls',
    sheet_name='Raw Data',
    header=0,
    skiprows=[1],
)
data = df.to_numpy()

# Feature matrix = first 22 columns, target vector = column index 23.
# NOTE(review): column index 22 is used neither as a feature nor as the
# target -- confirm this gap is intentional.
X, Y = data[:, :22], data[:, 23]

In [13]:
# Stratified, shuffled hold-out split with a fixed seed; the test size is
# left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    shuffle=True,
    random_state=0,
)

In [14]:
# Oversample the training split only (the hold-out split is left untouched).
# SMOTE draws random neighbours to synthesise samples; without a fixed
# random_state the resampled set -- and every metric computed from it in the
# cells below -- changes on each run. The seed matches the one used for the
# train/test split.
smt = SMOTE(random_state=0)
X_smt, Y_smt = smt.fit_resample(X_train, y_train)
X_smt.shape

(3723, 22)

In [3]:
def prueba_svm(ker='linear',conf=0.001,gam=0.0):
    """Run a 4-fold stratified cross-validation of an SVC on the notebook's ``X``/``Y``.

    For every fold the features are standardised with a ``StandardScaler``
    fitted on the training part only, an ``SVC`` is trained, and the accuracy
    on the training and validation parts is recorded together with the
    fraction of training samples kept as support vectors.

    Parameters
    ----------
    ker : str
        Kernel name forwarded to ``SVC(kernel=...)``.
    conf : float
        Regularisation constant forwarded to ``SVC(C=...)``.
    gam : float
        Kernel coefficient forwarded to ``SVC(gamma=...)``; it is not passed
        at all when ``ker == 'linear'``.

    Returns
    -------
    tuple
        ``(mean validation accuracy, std of validation accuracy,
        mean training accuracy, std of training accuracy,
        mean support-vector fraction)``. The "ic" values are plain standard
        deviations across folds.

    The two summary lines are also printed.
    """
    n_folds = 4
    random.seed(19680801)
    acc_train = np.zeros(n_folds)
    acc_val = np.zeros(n_folds)
    splitter = StratifiedKFold(n_splits=n_folds)
    sv_fraction = np.zeros(n_folds)

    # gamma is only handed to the estimator for non-linear kernels.
    svc_kwargs = {'kernel': ker, 'C': conf}
    if ker != 'linear':
        svc_kwargs['gamma'] = gam

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, Y)):
        X_fit_raw, y_fit = X[train_idx, :], Y[train_idx]
        X_val_raw, y_val = X[val_idx, :], Y[val_idx]

        # Standardise using statistics of the training part only.
        fold_scaler = preprocessing.StandardScaler().fit(X_fit_raw)
        X_fit = fold_scaler.transform(X_fit_raw)
        X_val = fold_scaler.transform(X_val_raw)

        # Build and train the model on the training part.
        clf = SVC(**svc_kwargs)
        clf.fit(X_fit, y_fit)

        # Predict on both parts and score by plain accuracy.
        fit_pred = clf.predict(X_fit)
        val_pred = clf.predict(X_val)
        acc_train[fold] = np.mean(fit_pred.ravel() == y_fit.ravel())
        acc_val[fold] = np.mean(val_pred.ravel() == y_val.ravel())
        sv_fraction[fold] = len(clf.support_vectors_)/len(X_fit)

    # Aggregate once and reuse the values for both printing and returning.
    eficiencia_train, ic_train = np.mean(acc_train), np.std(acc_train)
    eficiencia_val, ic_val = np.mean(acc_val), np.std(acc_val)
    porc_vect = np.mean(sv_fraction)

    print('Eficiencia durante el entrenamiento = ' + str(eficiencia_train) + '+-' + str(ic_train))
    print('Eficiencia durante la validación = ' + str(eficiencia_val) + '+-' + str(ic_val))

    return eficiencia_val,ic_val,eficiencia_train,ic_train,porc_vect

In [1]:
import pandas as pd
import qgrid

# Hyper-parameter grid: six linear runs (gamma placeholder 0) followed by
# every (C, gamma) combination for the RBF kernel, C varying slowest.
_c_grid = [0.001, 0.01, 0.1, 1, 10, 100]
_gamma_grid = [0.01, 0.1, 1]
_grid_rows = [('linear', c_val, 0) for c_val in _c_grid]
_grid_rows += [('rbf', c_val, g_val) for c_val in _c_grid for g_val in _gamma_grid]
df_types = pd.DataFrame(_grid_rows, columns=['Kernel', 'C', 'gamma'])

# Empty placeholder columns for the results.
# Note: only ONE "Intervalo de confianza" column exists in this table; the
# label is not split into a validation and a training variant.
for _placeholder in (
    "Eficiencia en validacion",
    "Intervalo de confianza",
    "Eficiencia en entrenamiento",
    "% de Vectores de Soporte",
):
    df_types[_placeholder] = ""

# Index the table by the hyper-parameter triple.
df_types = df_types.set_index(['Kernel', 'C', 'gamma'])

In [6]:
# Evaluate every (kernel, C, gamma) combination of the grid and tabulate the
# results.
#
# Fix: the validation and training standard deviations used to be written to
# the same "Intervalo de confianza" column, so the validation value was always
# overwritten by the training one. They now get separate columns. The table is
# rebuilt in one go from the collected rows (same MultiIndex, values kept as
# strings as before) instead of being filled cell by cell through .loc, and
# the unused counter was dropped.
columnas_resultado = [
    "Eficiencia en validacion",
    "Intervalo de confianza validacion",
    "Eficiencia en entrenamiento",
    "Intervalo de confianza entrenamiento",
    "% de Vectores de Soporte",
]

filas_resultado = []
for i in df_types.index:
    print(i)
    kernel, c, gamma = i
    # prueba_svm returns (val mean, val std, train mean, train std, SV fraction),
    # which is exactly the column order declared above.
    metricas = prueba_svm(ker=kernel, conf=c, gam=gamma)
    filas_resultado.append([str(valor) for valor in metricas])
    print(*metricas)

df_types = pd.DataFrame(filas_resultado, columns=columnas_resultado, index=df_types.index)

('linear', 0.001, 0.0)
Eficiencia durante el entrenamiento = 0.8162457766782173+-0.008395656079409227
Eficiencia durante la validación = 0.8137566727553347+-0.02615462018078472
0.8137566727553347 0.02615462018078472 0.8162457766782173 0.008395656079409227 0.4137694056473531
('linear', 0.01, 0.0)
Eficiencia durante el entrenamiento = 0.8958917256325641+-0.012133197174505657
Eficiencia durante la validación = 0.8349032892966881+-0.02246488742738512
0.8349032892966881 0.02246488742738512 0.8958917256325641 0.012133197174505657 0.3264359294061193
('linear', 0.1, 0.0)
Eficiencia durante el entrenamiento = 0.9167451021267056+-0.007922726755760525
Eficiencia durante la validación = 0.8372440635487022+-0.04777071892472698
0.8372440635487022 0.04777071892472698 0.9167451021267056 0.007922726755760525 0.2568217217386516
('linear', 1.0, 0.0)
Eficiencia durante el entrenamiento = 0.9244270245395154+-0.00584971906888737
Eficiencia durante la validación = 0.8391149129886865+-0.055567550581846384
0.8

In [20]:
def train_svm(kernel, C, gamma):
    """Cross-validate a probabilistic SVC on the oversampled set ``X_smt``/``Y_smt``.

    Runs a 4-fold stratified split, trains ``SVC(probability=True)`` on each
    training part and scores the validation part with weighted F1, weighted
    one-vs-rest ROC AUC and weighted geometric mean.

    Parameters
    ----------
    kernel : str
        Kernel name forwarded to ``SVC``.
    C : float
        Regularisation constant forwarded to ``SVC``.
    gamma : float or str
        Kernel coefficient forwarded to ``SVC`` unchanged.

    Returns
    -------
    tuple of str
        ``(mean F1, std F1, mean AUC, std AUC, mean G-mean, std G-mean)``,
        each converted with ``str``; the statistics are taken across folds.

    A completion line with the hyper-parameters is printed.

    NOTE(review): ``X_smt``/``Y_smt`` were oversampled before this split, so
    synthetic samples in a validation fold may have been interpolated from
    samples in the training folds -- the scores may be optimistic; confirm
    this is acceptable.
    """
    Folds = 4
    # Reseed numpy's global RNG on every call -- presumably so the internal
    # probability calibration of SVC is repeatable (no random_state is passed).
    random.seed(19680801)
    f1 = np.zeros(Folds)
    auc_fpr = np.zeros(Folds)
    gmean = np.zeros(Folds)
    skf = StratifiedKFold(n_splits=Folds)

    for fold, (train, test) in enumerate(skf.split(X_smt, Y_smt)):
        Ytrain = Y_smt[train]
        Ytest = Y_smt[test]

        # Normalise the data.
        # NOTE(review): sklearn.preprocessing.normalize rescales each sample
        # (row) to unit norm; it is not the per-feature StandardScaler used in
        # prueba_svm -- confirm this difference is intended.
        Xtrain = norm(X_smt[train, :])
        Xtest = norm(X_smt[test, :])

        # Build and train the model on the training part.
        model = SVC(C=C, kernel=kernel, gamma=gamma, probability=True)
        model.fit(Xtrain, Ytrain)

        # Validation: class probabilities feed the AUC, hard labels feed F1
        # and G-mean. (The previous training-set prediction and the list of
        # fitted models were never used and have been removed.)
        Y_pred = model.predict_proba(Xtest)
        Yest = model.predict(Xtest)

        # Score the predictions on the validation part.
        auc_fpr[fold] = roc_auc_score(Ytest, Y_pred, multi_class="ovr", average="weighted")
        gmean[fold] = geometric_mean_score(y_true=Ytest, y_pred=Yest, average="weighted")
        f1[fold] = f1_score(y_true=Ytest, y_pred=Yest, average="weighted")

    print(f"Finalizado: {kernel} C: {C} gamma: {gamma}")
    return str(np.mean(f1)), str(np.std(f1)), str(np.mean(auc_fpr)), str(np.std(auc_fpr)), str(np.mean(gmean)), str(np.std(gmean))

In [21]:
import qgrid
randn = np.random.randn

# Hyper-parameter grid: six 'lineal' rows (gamma placeholder 0) followed by
# every (C, gamma) pair for the RBF kernel, C varying slowest.
_c_grid = [0.001, 0.01, 0.1, 1, 10, 100]
_gamma_grid = [0.01, 0.1, 1]
df_types = pd.DataFrame({
    'Kernel' : pd.Series(['lineal'] * len(_c_grid) + ['rbf'] * (len(_c_grid) * len(_gamma_grid))),
    'C' : pd.Series(_c_grid + [c_val for c_val in _c_grid for _ in _gamma_grid]),
    'gamma' : pd.Series([0] * len(_c_grid) + _gamma_grid * len(_c_grid))})

# One accumulator per value returned by train_svm, in return order.
f1_ = []
stdf1_ = []
auc_ = []
stdAuc_ = []
gmean_ = []
stdgmean_ = []
_accumulators = (f1_, stdf1_, auc_, stdAuc_, gmean_, stdgmean_)

for kernel_label, c_value, gamma_value in df_types[['Kernel', 'C', 'gamma']].itertuples(index=False, name=None):
    if kernel_label == 'lineal':
        # The table label 'lineal' maps to sklearn's "linear" kernel; the
        # gamma placeholder is replaced by "auto".
        scores = train_svm("linear", c_value, "auto")
    else:
        scores = train_svm(kernel_label, c_value, gamma_value)
    for accumulator, score in zip(_accumulators, scores):
        accumulator.append(score)

# Index by the hyper-parameter triple, then attach the metric columns.
df_types = df_types.set_index(['Kernel', 'C', 'gamma'])
_metric_columns = (
    "f1",
    "Intervalo de confianza f1",
    "AUC",
    "Intervalo de confianza AUC",
    "G-mean",
    "Intervalo de confianza G-mean",
)
for column_name, accumulator in zip(_metric_columns, _accumulators):
    df_types[column_name] = accumulator

Finalizado: linear C: 0.001 gamma: auto
Finalizado: linear C: 0.01 gamma: auto
Finalizado: linear C: 0.1 gamma: auto
Finalizado: linear C: 1.0 gamma: auto
Finalizado: linear C: 10.0 gamma: auto
Finalizado: linear C: 100.0 gamma: auto
Finalizado: rbf C: 0.001 gamma: 0.01
Finalizado: rbf C: 0.001 gamma: 0.1
Finalizado: rbf C: 0.001 gamma: 1.0
Finalizado: rbf C: 0.01 gamma: 0.01
Finalizado: rbf C: 0.01 gamma: 0.1
Finalizado: rbf C: 0.01 gamma: 1.0
Finalizado: rbf C: 0.1 gamma: 0.01
Finalizado: rbf C: 0.1 gamma: 0.1
Finalizado: rbf C: 0.1 gamma: 1.0
Finalizado: rbf C: 1.0 gamma: 0.01
Finalizado: rbf C: 1.0 gamma: 0.1
Finalizado: rbf C: 1.0 gamma: 1.0
Finalizado: rbf C: 10.0 gamma: 0.01
Finalizado: rbf C: 10.0 gamma: 0.1
Finalizado: rbf C: 10.0 gamma: 1.0
Finalizado: rbf C: 100.0 gamma: 0.01
Finalizado: rbf C: 100.0 gamma: 0.1
Finalizado: rbf C: 100.0 gamma: 1.0


In [22]:
# Render the metrics table; as the last expression of the cell it is displayed.
df_types

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1,Intervalo de confianza f1,AUC,Intervalo de confianza AUC,G-mean,Intervalo de confianza G-mean
Kernel,C,gamma,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lineal,0.001,0.0,0.400985396625114,0.0811416816475706,0.3073068919627593,0.1132529538157567,0.5946569636502275,0.041852790770876
lineal,0.01,0.0,0.400985396625114,0.0811416816475706,0.7538385568160542,0.0063281984753065,0.5946569636502275,0.041852790770876
lineal,0.1,0.0,0.6773160378086234,0.0082694959009687,0.8694685631853851,0.0071574070709577,0.748504548866699,0.0070347841946961
lineal,1.0,0.0,0.7944521345530227,0.0126045061302721,0.9313996986803228,0.0056592003512299,0.844321244193277,0.0093001506271908
lineal,10.0,0.0,0.7998211136118669,0.0103444247579444,0.942729066727184,0.0049165362922264,0.849672784074142,0.0077140619496643
lineal,100.0,0.0,0.8488015729844497,0.0040557027209568,0.9580909984928973,0.0028561495749498,0.885160542951299,0.003108566165196
rbf,0.001,0.01,0.4007981013710985,0.0812603556715125,0.2208341953155478,0.0077815330642756,0.5944332665978997,0.0419977363746337
rbf,0.001,0.1,0.4012914412554785,0.0817764386229443,0.2188641510054467,0.0084054504186799,0.5946467987920174,0.0424462163036486
rbf,0.001,1.0,0.4191614974696449,0.0855811681415892,0.2831366401739784,0.1272564374672913,0.6065368633902689,0.0437139629660468
rbf,0.01,0.01,0.4007981013710985,0.0812603556715125,0.2207964959001102,0.0084536012741724,0.5944332665978997,0.0419977363746337
