In [9]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the cleaned penguins table (semicolon-delimited file) and preview
# its first five rows.
csv = pd.read_csv(
    'penguins_clean.csv',
    sep=';',
)
print(csv.head(5))

   Culmen length  Culmen depth  Flipper length  Body mass  Especies
0           36.7          19.3           193.0     3450.0       0.0
1           39.3          20.6           190.0     3650.0       0.0
2           38.9          17.8           181.0     3625.0       0.0
3           39.2          19.6           195.0     4675.0       0.0
4           34.1          18.1           193.0     3475.0       0.0


In [5]:
# Turn the DataFrame into a NumPy array, then separate the feature columns
# (everything except the last column) from the class label (last column,
# 'Especies').
data = np.array(csv)
X, y = data[:, :-1], data[:, -1]

# Show the full label vector.
print(y)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.

In [6]:
# Summarise the dataset: number of rows (instances), number of columns
# (attributes) and the distinct values taken by the class label.
n_instancias, n_atributos = X.shape
print("Tabla de datos: %d instancias y %d atributos" % (n_instancias, n_atributos))
print("Valores de la clase:", set(y))

Tabla de datos: 332 instancias y 4 atributos
Valores de la clase: {0.0, 1.0, 2.0}


In [7]:
# Outer partition: hold-out split keeping 80% of the samples for training
# and 20% for the final test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the partition is reproducible
)

# Per-class sample counts in the test partition.
valores_test, ocur_test = np.unique(y_test, return_counts=True)
print('Test: ', 'clases:', valores_test, ' ocurrencias: ', ocur_test)

# Per-class sample counts in the training partition.
valores_train, ocur_train = np.unique(y_train, return_counts=True)
print('Entrenamiento: ', ' clases:', valores_train, '  ocurrencias:', ocur_train)

Test:  clases: [0. 1. 2.]  ocurrencias:  [33 22 12]
Entrenamiento:   clases: [0. 1. 2.]   ocurrencias: [115  98  52]


In [10]:
# Standardise the features. The scaler statistics are learned from the
# training partition only and then applied to both partitions.
# NOTE: X_train / X_test are overwritten here, so re-running this cell
# re-fits the scaler on already-scaled data.
standardizer = StandardScaler().fit(X_train)
X_train = standardizer.transform(X_train)
X_test = standardizer.transform(X_test)

In [11]:
# Inner cross-validation on the training partition, used to judge a
# hyperparameter configuration before touching the test set.
from sklearn.svm import SVC

svc = SVC(kernel='rbf', C=1, gamma='scale')

# Cross-validation setup:
#   n_splits - number of folds ("bolsas") the training data is divided into.
#   shuffle  - shuffles the sample positions so the validation samples are
#              not picked in their original order.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(svc, X_train, y_train, cv=folds)

# Per-fold scores followed by their mean and standard deviation.
print("Resultados por bolsa: ", results)
print("Accuracy (media +/- desv.): %0.4f +/- %0.4f" % (results.mean(), results.std()))

Resultados por bolsa:  [0.96226415 0.98113208 0.98113208 1.         0.96226415]
Accuracy (media +/- desv.): 0.9774 +/- 0.0141


Después de optimizar los hiperparámetros, se procede a entrenar el modelo definitivo.

In [12]:
# With the hyperparameters chosen through validation, build the final model
# and train it on the complete training partition (the data that was split
# into "train" and "val" folds during cross-validation).
svc = SVC(
    kernel='rbf',
    C=1,
    gamma='scale',
)
svc.fit(X_train, y_train)


In [13]:
# Score the final model on the held-out test partition and report the
# result as a percentage.
test_results = svc.score(X_test, y_test)
porcentaje_test = test_results*100
print('Exactitud en test: ', porcentaje_test, '%')

Exactitud en test:  98.50746268656717 %


In [14]:
# Obtain the individual predictions (rather than only the aggregate score)
# and print them next to the true labels for a side-by-side comparison.
y_pred = svc.predict(X_test)
for titulo, valores in (
    ('Predicciones:     ', y_pred),
    ('Etiquetas reales: ', y_test),
):
    print(titulo, valores)

Predicciones:      [0. 2. 0. 1. 0. 1. 2. 1. 1. 1. 0. 0. 2. 0. 2. 0. 0. 0. 0. 1. 0. 0. 2. 1.
 0. 0. 1. 0. 1. 2. 1. 2. 0. 0. 1. 1. 1. 2. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0.
 0. 2. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 2. 2. 1. 0. 0. 1. 0.]
Etiquetas reales:  [0. 2. 0. 1. 0. 1. 2. 1. 1. 1. 0. 0. 2. 0. 2. 0. 0. 0. 0. 1. 0. 0. 2. 1.
 0. 0. 1. 2. 1. 2. 1. 2. 0. 0. 1. 1. 1. 2. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0.
 0. 2. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 2. 2. 1. 0. 0. 1. 0.]


In [None]:
# Classify one new penguin with the trained model.

# Species name for each numeric class code of the 'Especies' column.
# NOTE(review): confirm this code -> name mapping against the script that
# produced penguins_clean.csv.
gt_labels = {0: 'Adelie', 1: 'Gentoo', 2: 'Chinstrap'}

# Measurements of the new sample, in the same column order as the training
# table: culmen length, culmen depth, flipper length, body mass.
Culmenlength = 46.1
Culmendepth = 13.2  # was `13,2`, which Python parses as the tuple (13, 2)
Flipperlength = 211.0
Bodymass = 4500.0

# One row with all four features (2-D, as the scikit-learn estimators expect).
# The original line used `==` (a comparison on an undefined name) and only
# included the first feature.
sample_test = np.array([[Culmenlength, Culmendepth, Flipperlength, Bodymass]])

# The model was trained on standardised features, so the sample must go
# through the same fitted scaler before prediction.
sample_scaled = standardizer.transform(sample_test)
sample_pred = svc.predict(sample_scaled)

# Labels are stored as floats (0.0, 1.0, 2.0); cast to int for the lookup.
print('Especie predicha: ', gt_labels[int(sample_pred[0])])