## Clustering - No supervisado

### Dataset de prueba

In [None]:
import pandas as pd

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial import distance

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
X = iris.data  # full feature matrix; no column slicing is applied here
y = iris.target  # labels; not used to fit any of the clusterings in this notebook

In [None]:
# Display the dimensions of the feature matrix and of the label vector.
X.shape, y.shape

In [None]:
# Standardize the features; X is overwritten with its scaled version,
# so every later cell works on the standardized data.
scaler = StandardScaler()
print(scaler.fit(X))  # fit returns the scaler itself, so this prints its repr
X=scaler.transform(X)

## Algoritmos no supervisados
### 1. K no conocido

### 1.1 Subtractive

In [None]:
def set_colors(labels, colors='rgb'):
    """Map each label to the entry of ``colors`` found at that index.

    Args:
        labels: Iterable of values usable as indices into ``colors``.
        colors: Indexable palette; the default string yields 'r', 'g', 'b'.

    Returns:
        A list with one palette entry per label, in input order.
    """
    return [colors[label] for label in labels]

In [None]:
# initialize potentials
def substractive(normalized_data_matrix, ra, Eup=0.5, Edown=0.15, verbose=True):
    """Count the cluster centers found by subtractive clustering.

    Every point gets a "potential" built from its distances to all other
    points. The point with the highest potential is a candidate center; it is
    accepted or rejected by comparing its potential with the initial maximum,
    after which the potentials are reduced around accepted centers and the
    search repeats.

    Args:
        normalized_data_matrix: Sequence of points (rows). A 1-D sequence is
            treated as a column of one-dimensional points.
        ra: Neighbourhood radius used for the potentials. The radius used to
            reduce potentials around an accepted center is ``1.15 * ra``.
        Eup: Accept ratio. A candidate whose potential ratio exceeds this is
            accepted outright.
        Edown: Reject ratio. The search stops once the best remaining ratio is
            not above this value.
        verbose: When True (default) print the same progress lines the
            function has always printed; pass False to run silently.

    Returns:
        int: Number of accepted cluster centers (0 for empty input).

    Notes:
        The full pairwise distance matrix is held in memory (n x n floats).
    """
    data = np.asarray(normalized_data_matrix, dtype=float)
    if data.ndim == 1:
        data = data.reshape(-1, 1)
    size = len(data)
    rb = ra * 1.15

    if size:
        # Pairwise Euclidean distances, computed once and reused below.
        pairwise = distance.cdist(data, data)
        # NOTE(review): subtractive clustering is usually written with the
        # squared distance, exp(-4 * d**2 / ra**2). The expression below keeps
        # this file's existing formula (unsquared distance over (ra / 2)**2)
        # so results do not change -- confirm which form is intended.
        influence = np.exp(-4.0 * pairwise / (ra / 2) ** 2)
        # A point does not contribute to its own potential.
        np.fill_diagonal(influence, 0.0)
        potential = influence.sum(axis=1)
        best = int(np.argmax(potential))  # first index of the maximum
        max_potential_value = current_max_value = potential[best]
    else:
        # Nothing to cluster; the loop below is skipped.
        max_potential_value = current_max_value = 0.0

    center_indices = []
    # Potentials are clamped at zero, so a zero maximum means no candidates.
    while current_max_value > 0:
        potential_ratio = current_max_value / max_potential_value
        if verbose:
            print(f'Maximal_potencial_value: {max_potential_value}')

        if potential_ratio > Eup:
            accepted = True
        elif potential_ratio > Edown:
            if center_indices:
                # Grey zone: accept only if far enough from existing centers.
                dmin = pairwise[best, center_indices].min()
                accepted = (dmin / ra) + potential_ratio >= 1
            else:
                # No center exists yet to measure against (reachable when
                # Eup >= 1); taking the minimum of nothing used to raise.
                accepted = True
        else:
            # Remaining potentials are too low to form another cluster.
            break

        if verbose:
            print(f'Criteria: {1 if accepted else 2}')

        if accepted:
            center_indices.append(best)
            # Subtract the new center's influence from every potential.
            reduction = current_max_value * np.exp(
                -4.0 * pairwise[best] / (rb / 2) ** 2)
            potential = np.maximum(potential - reduction, 0.0)
        else:
            # Rejected candidate: remove it from further consideration.
            potential[best] = 0.0

        best = int(np.argmax(potential))
        current_max_value = potential[best]
        if verbose:
            print(f'Current_max_value: {current_max_value}')

    n_centers = len(center_indices)
    if verbose:
        print('Subtractive cluster centers')
        print(n_centers)
    return n_centers

In [None]:
# Sweep the radius passed to substractive() and keep the cluster count
# returned for each value.
kclus = []
for ra in np.arange(0.3, 2.1, 0.1):
    print(ra)
    kclus.append(substractive(X, ra, Eup=0.5, Edown=0.15))

In [None]:
print(kclus) # cluster counts obtained across the parameter sweep
print("minimo numero de clusters:", min(kclus))
print("maximo numero de clusters:", max(kclus))

### 1.2. AffinityPropagation

In [None]:
from sklearn.cluster import AffinityPropagation

# Sweep the damping factor and record the cluster count for each value,
# taken as the largest assigned label plus one.
kclus2 = []
for damping in np.arange(0.5, 1, 0.05):
    clustering = AffinityPropagation(damping=damping).fit(X)
    kclus2.append(max(clustering.labels_) + 1)

In [None]:
print(kclus2) # cluster counts obtained across the parameter sweep
print("minimo numero de clusters:", min(kclus2))
print("maximo numero de clusters:", max(kclus2))

### 1.3. MeanShift

In [None]:
from sklearn.cluster import MeanShift

# Sweep the MeanShift bandwidth and record the cluster count for each value,
# taken as the largest assigned label plus one.
kclus3 = []
for bandwidth in np.arange(0.1, 3, 0.1):
    clustering = MeanShift(bandwidth=bandwidth).fit(X)
    kclus3.append(max(clustering.labels_) + 1)
clustering  # display the model fitted with the last bandwidth of the sweep

In [None]:
print(kclus3) # cluster counts obtained across the parameter sweep
print("minimo numero de clusters:", min(kclus3))
print("maximo numero de clusters:", max(kclus3))

In [None]:
# Plot the cluster count against the bandwidth values themselves, so the x
# axis matches its "bandwidth" label instead of showing list positions.
# The range must match the sweep that filled kclus3.
plt.plot(np.arange(0.1, 3, 0.1), kclus3)
plt.title("Meanshift")
plt.xlabel("bandwidth")
plt.ylabel("Clusters")

### 2. K conocido

### 2.1. K - Means

In [None]:
# Fit K-Means with three clusters; random_state is fixed so reruns match.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

In [None]:
# Reuse the labels of the model already fitted as `kmeans` rather than fitting
# an identical second model; labels and plotted centroids then come from the
# same fit.
y_pred = kmeans.labels_

In [None]:
centroids=kmeans.cluster_centers_[:,0:2] # keep only the first two feature coordinates of each centroid
centroids

In [None]:
# Scatter the first two feature columns colored by K-Means label, with the
# centroids overlaid as red crosses.
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', marker='X')
# Title names what is actually plotted.
plt.title("K-Means (k=3)")

### 2.2. Fuzzy c-means clustering

In [None]:
#!pip install -U scikit-fuzzy

In [None]:
from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
import skfuzzy as fuzz

The fuzzy partition coefficient (FPC)
The FPC is defined on the range from 0 to 1, with 1 being best. It is a metric which tells us how cleanly our data is described by a certain model. Next we will cluster our set of data - which we know has three clusters - several times, with between 2 and 10 clusters (one model per panel of a 3x3 grid). We will then show the results of each clustering together with its fuzzy partition coefficient. When the FPC is maximized, our data is described best.

In [None]:
# One color per hardened cluster; ten entries cover the largest model below.
colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']

# Cluster on the first two feature columns only, stacked as one row each.
xpts = X[:, 0]
ypts = X[:, 1]
alldata = np.vstack((xpts, ypts))

# 3x3 grid of panels: one fuzzy c-means fit per panel, starting at 2 centers.
fig1, axes1 = plt.subplots(3, 3, figsize=(8, 8))
fpcs = []

for ncenters, ax in enumerate(axes1.ravel(), start=2):
    print(ncenters, ax)
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        alldata, ncenters, 2, error=0.005, maxiter=1000, init=None)

    # Keep this model's fuzzy partition coefficient.
    fpcs.append(fpc)

    # Harden the memberships (largest one wins) and draw each cluster's
    # points in its own color.
    cluster_membership = np.argmax(u, axis=0)
    for j in range(ncenters):
        in_cluster = cluster_membership == j
        ax.plot(xpts[in_cluster], ypts[in_cluster], '.', color=colors[j])

    # Red squares mark the fitted centers.
    for pt in cntr:
        ax.plot(pt[0], pt[1], 'rs')

    ax.set_title(f'Centers = {ncenters}; FPC = {fpc:.2f}')
    ax.axis('off')

fig1.tight_layout()


### Algoritmo individual - Fuzzy c-means

In [None]:
# Single fuzzy c-means fit with three centers on all feature columns.
ncenters=3
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X.T, ncenters, 2, error=0.005, maxiter=1000, init=None) # data passed transposed
# First two feature columns, used as plot coordinates by the later cells.
xpts=X[:,0]
ypts=X[:,1]

In [None]:
# Harden the fuzzy memberships: each sample goes to the cluster with its
# largest membership value.
cluster_membership = np.argmax(u, axis=0)
for j in range(ncenters):
    plt.plot(xpts[cluster_membership == j],
             ypts[cluster_membership == j], '.', color=colors[j])

# Mark the center of each fuzzy cluster (first two coordinates only).
for pt in cntr:
    plt.plot(pt[0], pt[1], 'rs')

plt.title('Centers = {0}; FPC = {1:.2f}'.format(ncenters, fpc))

# Lay out the figure drawn in this cell; fig1 is the earlier 3x3 grid.
plt.tight_layout()

### Prediccion

In [None]:
# Assign the samples to the previously fitted centers.
u, u0, d, jm, p, fpc = fuzz.cluster.cmeans_predict(X.T, cntr, 2, error=0.005, maxiter=1000)

# Hard label per sample: the cluster with the largest membership.
cluster_membership = np.argmax(u, axis=0)

fig3, ax3 = plt.subplots()
ax3.set_title('Predictions')
for j in range(ncenters):
    members = X[cluster_membership == j]
    ax3.plot(members[:, 0], members[:, 1], 'o', label=f'series {j}')
ax3.legend()

plt.show()

### 2.3. AgglomerativeClustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Agglomerative clustering with Ward linkage, cut at three clusters.
model = AgglomerativeClustering(linkage='ward',n_clusters=3)
model.fit(X)

In [None]:
# Display the cluster label assigned to each sample.
model.labels_

In [None]:
# Draw each agglomerative cluster in its own color, using the coordinates
# stored in xpts / ypts.
for j in range(model.n_clusters):
    plt.plot(xpts[model.labels_ == j],
             ypts[model.labels_ == j], '.', color=colors[j])

plt.title('Clusters = {0}'.format(model.n_clusters))

# Lay out the figure drawn in this cell; fig1 is the earlier 3x3 grid.
plt.tight_layout()

### 2.4. SpectralClustering

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
# Spectral clustering into three clusters; random_state is fixed so reruns match.
clustering = SpectralClustering(n_clusters=3,assign_labels="discretize",random_state=0).fit(X)

In [None]:
# Display the cluster label assigned to each sample.
clustering.labels_

In [None]:
# Draw each spectral cluster in its own color, using the coordinates stored
# in xpts / ypts.
for j in range(clustering.n_clusters):
    plt.plot(xpts[clustering.labels_ == j],
             ypts[clustering.labels_ == j], '.', color=colors[j])

plt.title('Clusters = {0}'.format(clustering.n_clusters))

# Lay out the figure drawn in this cell; fig1 is the earlier 3x3 grid.
plt.tight_layout()

### Resultados algoritmos Clustering

In [None]:
# One row per label vector, one column per sample. With the cells run in
# order the rows are: K-Means, fuzzy c-means (hardened), agglomerative, spectral.
pred=pd.DataFrame([y_pred, cluster_membership,model.labels_,clustering.labels_])
pred

In [None]:
# Co-association matrix: hm[j, k] counts how many of the label vectors stored
# as rows of `pred` put samples j and k in the same cluster.
pred_labels = np.asarray(pred)
# m label vectors (rows of pred) and n samples (columns of pred); both are
# read from the data instead of being hard-coded.
m, n = pred_labels.shape
# Compare every pair of samples within each row at once, then add up the
# agreements over the rows. Cast to float to match the dtype produced by
# accumulating into np.zeros.
hm = (pred_labels[:, :, None] == pred_labels[:, None, :]).sum(axis=0).astype(float)

In [None]:
# Wrap the count matrix in a DataFrame (default positional row/column labels).
hm=pd.DataFrame(hm)

In [None]:
# Heatmap of the agreement counts in the original sample order.
fig_hm, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(hm, cmap='Blues', ax=ax)
ax.invert_yaxis()

In [None]:
hm2=hm.sort_values(by=hm.columns.tolist()) # sort the rows using every column as a key, in column order

In [None]:
# Reorder the columns to follow the sorted row order, so rows and columns
# list the samples in the same sequence.
hm2=hm2[hm2.index]
hm2.head()

In [None]:
# Heatmap of the reordered agreement counts, with the color scale pinned to
# the 0-4 range and one colorbar tick per integer.
fig_hm2, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(hm2, cmap='Blues', vmin=0, vmax=4,
            cbar_kws={'ticks': [0, 1, 2, 3, 4]}, ax=ax)
ax.invert_yaxis()