In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn_extra.cluster import KMedoids

In [2]:
# Load Data
features = ['area','perimeter','compactness','length','width','asymmetry','groove','class']
df = pd.read_csv('seeds_dataset.txt', header=None, delimiter='\s+', names=features)
df.head()

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry,groove,class
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [3]:
# Obtenemos los valores de los datos y los valores de la clase
data = df.iloc[:,:-1]
data.head()

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry,groove
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175


In [4]:
y = df['class'].values
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int64)

In [5]:
## Preprocessing Data
scalar = StandardScaler()
data_std = scalar.fit_transform(data)

### Kmedias con la elección de baricentros iniciales expuesto en clase, que combina el algoritmo de ward, con un número de baricentros al final de 12.

#### Heuristica de los centroides mas alejados

In [370]:
# K+l baricentros iniciales
K = 6+9
kmeans = KMeans(n_clusters=K, random_state=0)

# Ajuste
kmeans.fit(data_std)

# Etiquetas
etiquetas = kmeans.labels_

# Moda
moda = -1 + np.zeros(K, dtype=int)
for c in range(K):
    moda[c] = np.bincount(y[etiquetas==c]).argmax()
    
# !!Hacemos que los centroides obtenidos sean las muestras con las que trabajara ward
X = kmeans.cluster_centers_

# Los valores objetivos son las modas de cada uno de los clusters
Y = moda
Y

array([3, 2, 1, 3, 1, 1, 1, 3, 2, 1, 2, 1, 3, 2, 3])

In [371]:
# Reduccion de los clusters mediante un aglomerativo (ward)
K = 6
aglo = AgglomerativeClustering(n_clusters=K, linkage='ward')
# Entrenamiento
aglo.fit(X)
# Etiquetas
labels = aglo.labels_

# Computo moda final
modaFinal = -1 + np.zeros(K, dtype=int)
aciertos = 0
for c in range(K):
    modaFinal[c] = np.bincount(Y[labels==c]).argmax()
    aciertos += np.sum(Y[labels==c]==modaFinal[c])
    
accuracy = aciertos/Y.shape[0]
print("Tasa de acierto con KMedias y ward:", accuracy)

Tasa de acierto con KMedias y ward: 1.0


### Kmedoids, también con 12 clusters, usando la variante por defecto y con centros iniciales establecidos con k-medoid++

In [372]:
k2 = 12
Kmedoids = KMedoids(n_clusters=k2, init='k-medoids++', method='alternate', random_state=0) # Variante por defecto: alternate
# Entrenamiento
Kmedoids.fit(data_std)

# Etiquetas
etiquetas = Kmedoids.labels_

# Moda y aciertos
moda2 = -1 + np.zeros(k2, dtype=int)
aciertos = 0
for c in range(k2):
    moda2[c] = np.bincount(y[etiquetas==c]).argmax()
    aciertos += np.sum(y[etiquetas==c]==moda2[c])
    
accuracy = aciertos/y.shape[0]
print("Tasa de acierto con Kmedoids:", accuracy)

Tasa de acierto con Kmedoids: 0.9095238095238095


### DBScan jugando con el valor de eps y mínimo de muestras por clusters para que salgan entorno a los 12 clusters antes de eliminar los puntos aislados.

In [373]:
dbs = DBSCAN(eps=0.8, min_samples=2)
# Entrenamiento
dbs.fit(data_std)

# Etiquetas
etiquetas = dbs.labels_ # Debemos conseguir entorno a 12 clusters antes de eliminar outliers
etiquetas

array([ 0,  0,  0,  0,  0,  0,  0,  0, -1,  1,  0,  0,  2,  0,  0,  2, -1,
        0, -1,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,
        0,  1, -1, -1,  0, -1,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,
       -1,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,  0,  0,  3,  0,  0,  0,
        0,  0,  4,  4,  4,  5,  4,  4,  4,  6, -1, -1,  4,  7,  8,  5,  5,
        5,  5, -1,  9,  9,  5,  5,  5,  7, -1,  4,  5, -1, -1,  5,  4,  5,
        5,  5,  5,  5,  5,  4,  5,  5,  5,  5,  5, -1,  6,  5,  5,  5,  5,
        5,  8,  4,  4,  5,  0,  5,  4,  5,  5, -1,  5,  5,  4,  4,  4,  0,
        4, 10, 10,  4,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 11,
        0,  0,  0, 11,  0,  0], dtype=int64)

In [374]:
print("Numero de clusters =", len(set(etiquetas))-1) # Menos -1 por el -1 de los outliers

Numero de clusters = 12


In [375]:
outliers = (etiquetas==-1).nonzero()
outliers

(array([  8,  16,  18,  30,  36,  37,  39,  51,  59,  60,  61,  78,  79,
         87,  94,  97,  98, 113, 129, 146, 179, 188], dtype=int64),)

In [376]:
outliers = outliers[0].tolist() # Valores a eliminar del conjunto de datos

In [377]:
new_data = data[~data.index.isin(outliers)] # Eliminamos del conjunto
new_data.head()

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry,groove
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175


In [378]:
y_aux = df.iloc[:,-1:]
y_aux.head()

Unnamed: 0,class
0,1
1,1
2,1
3,1
4,1


In [379]:
# Eliminar de las labels
new_y = y_aux[~y_aux.index.isin(outliers)]

In [380]:
y_final = new_y['class'].values
y_final

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int64)

In [381]:
# Nueva normalizacion
scalar = StandardScaler()
new_data_std = scalar.fit_transform(new_data)

In [382]:
# Creacion final del modelo
#dbs_final = DBSCAN(eps=0.8, min_samples=2)
# Train
dbs.fit(new_data_std)

DBSCAN(eps=0.8, min_samples=2)

In [383]:
# Etiquetas
etiquetas = dbs.labels_
etiquetas

array([ 0,  0,  0,  0,  1,  0,  0,  0,  2, -1,  0,  3,  0,  0,  3,  1,  0,
        0,  0,  1,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,
        0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,
        0,  0,  4,  0,  0,  0,  0,  0,  5,  5,  5,  6,  5,  5,  5,  7,  5,
        8,  9,  6,  6,  6,  6, 10, 10,  6,  6,  6,  8,  5,  6,  6,  5,  6,
        6,  6,  6,  6,  6,  5,  6,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,
        9,  5,  5,  6, -1,  6,  5,  6,  6,  6,  6,  5,  5,  5,  0,  5, 11,
       11,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12,  0,  0,  0, 12,  0,
        0], dtype=int64)

In [384]:
# Comprobamos los clusters que tenemos
clusters = set(etiquetas)
clusters
k = len(clusters)-1
k

13

In [385]:
# Moda y aciertoos
moda3 = -1 + np.zeros(k, dtype=int)
aciertos = 0
for c in range(k):
    moda3[c] = np.bincount(y_final[etiquetas==c]).argmax()
    aciertos += np.sum(y_final[etiquetas==c]==moda3[c])
    
accuracy3 = aciertos/new_y.shape[0]
print("Tasa de acierto con DBScan:", accuracy3)

Tasa de acierto con DBScan: 0.723404255319149
