In [1]:
!pip install plotly
!pip install umap-learn



In [2]:
!jupyter labextension install jupyterlab-plotly



Building jupyterlab assets (production, minimized)


In [3]:
!jupyter labextension list

JupyterLab v3.4.8
/opt/conda/share/jupyter/labextensions
        jupyterlab_pygments v0.2.2 [32menabled[0m [32mOK[0m (python, jupyterlab_pygments)
        jupyterlab-plotly v5.13.0 [32menabled[0m [32mOK[0m
        jupyter-matplotlib v0.11.2 [32menabled[0m [32mOK[0m
        @jupyter-widgets/jupyterlab-manager v5.0.3 [32menabled[0m [32mOK[0m (python, jupyterlab_widgets)

Other labextensions (built into JupyterLab)
   app dir: /opt/conda/share/jupyter/lab



In [1]:
import numpy as np
import scipy.stats as stats
from itertools import combinations

import umap

import pandas as pd
from sklearn.datasets import make_blobs


import matplotlib.pyplot as plt # Para crear gráficos con matplotlib
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go

In [2]:
pio.renderers.default = 'iframe'

In [14]:
# Simulación de datos
# ==============================================================================
X, y = make_blobs(
        n_samples    = 600, 
        n_features   = 2, 
        centers      = 3, 
        cluster_std  = 1, 
        shuffle      = True, 
        random_state = 0
       )

In [15]:
fig = go.Figure(data=[go.Scatter(x=X[:, 0], y=X[:, 1],
                                   mode='markers', 
                                   marker=dict(
                                    size=6,
                                    color=y,                # set color to an array/list of desired values
                                    colorscale='picnic',   # choose a colorscale
                                    opacity=0.7)
                                  )])

fig.update_layout(
    autosize=False,
    width=600,
    height=600,
    margin=dict(l=0, r=0, b=0, t=10))
    
fig.show()

In [16]:
class Frontera:
    
    def __init__(self, X, y, percentil_min, percentil_max):
        self.X = X
        self.y = y
        self.percentil_min = percentil_min
        self.percentil_max = percentil_max
        self.dic_categorias = {}
        self.dic_min_dst = {}
        
        self.dic_categorias_UMAP = {}
        self.dic_min_dst_UMAP = {}
        
        self.Frontier_Point = {}
        
    def distance(self,x0,x1):
        
        # X=(200,3) ; Y=(150,3)
        # diag(X*X')              <> (200,3) * (3,200) > diag(200,200) > (200,1)            X^2   (X-Y)^2
        # 2*X*Y'                  <>(200,3) * (3,150) > (200,150)                          -2XY
        # ones(N,1)*(diag(Y*Y'))' <> (200,1) * (150,1)' > (200,1) * (1,150) >(200,150)      Y^2
        # M_distance = diag(X*X')-2*X*Y'+ones(N,1)*(diag(Y*Y'))'  %%MATLAB
        M_distance = np.reshape( 
            np.diag(np.dot(x0,x0.T)),(-1,1)) - 2*np.dot(x0,x1.T) + np.dot( np.ones((x0.shape[0],1)), np.reshape(np.diag( np.dot(x1,x1.T)).T,(1,-1)) )
        return M_distance
       
    def get_frontier(self):

        for i in np.unique(self.y): 
            dic_categorias_aux ={'X_'+str(i):self.X[self.y==i]}
            self.dic_categorias.update(dic_categorias_aux)

        categorias_list =  [key for key in self.dic_categorias]
        comb_categories = combinations(categorias_list, 2)
        
        for categories in list(comb_categories):
            
            dist = self.distance(self.dic_categorias.get(categories[0]), self.dic_categorias.get(categories[1]))

            # row
            row = np.mean(dist, axis=1)
            select_indices_row = np.where( 
                (row > np.percentile(row, self.percentil_min)) & (row < np.percentile(row, self.percentil_max)) )[0]
            min_dst_row = self.dic_categorias.get(categories[0])[select_indices_row]
            dic_min_dst_aux ={categories[0]+'_with_'+categories[1]:min_dst_row} 
            self.dic_min_dst.update(dic_min_dst_aux)

            #Column
            column = np.mean(dist, axis=0)
            select_indices_column = np.where( 
                (column > np.percentile(column, self.percentil_min)) & (column < np.percentile(column, self.percentil_max)) )[0]
            min_dst_column = self.dic_categorias.get(categories[1])[select_indices_column]
            dic_min_dst_aux ={categories[1]+'_with_'+categories[0]:min_dst_column} 
            self.dic_min_dst.update(dic_min_dst_aux)

        

        list_all_frontier = [key for key in self.dic_min_dst]
        list_A_with_B = [nombre for indice, nombre in enumerate(list_all_frontier) if indice%2==0]
        list_B_with_A = [nombre for indice, nombre in enumerate(list_all_frontier) if indice%2==1]
        
        

        for A_with_B, B_with_A in zip(list_A_with_B, list_B_with_A): #obtenemos los valores en cada iteración
                        
            dic_min_dst_copy = self.dic_min_dst.copy()
            Front_Point = {}
            points_matriz = np.zeros(shape=(3, 2))
            
            for i in range(0,2):
                #print(dic_min_dst_copy)
                
                dist = self.distance(dic_min_dst_copy.get(A_with_B), dic_min_dst_copy.get(B_with_A))
                #print(dist)

                #ROW
                min_dist_A_with_B = np.where( dist==np.min(dist) )[0]
                min_A_with_B = dic_min_dst_copy.get(A_with_B)[min_dist_A_with_B]
                #print(dist==np.min(dist),min_A_with_B)
                dic_min_dst_copy.update({ A_with_B: np.delete(dic_min_dst_copy.get(A_with_B), min_dist_A_with_B, axis=0)})

                #COLUMN
                min_dist_B_with_A = np.where( dist==np.min(dist) )[1]
                min_B_with_A = dic_min_dst_copy.get(B_with_A)[min_dist_B_with_A]
                #print(dist==np.min(dist),min_B_with_A)

                dic_min_dst_copy.update({ B_with_A: np.delete(dic_min_dst_copy.get(B_with_A), min_dist_B_with_A, axis=0)})
                
                point_value = (np.mean(min_A_with_B+min_B_with_A,axis=0))/2
                points_matriz[i] = point_value
                
                #print('__________________________________________________________________')

            
            Front_Point ={'Frontier:(' + B_with_A.split('_')[-1] + ',' + A_with_B.split('_')[-1] + ')' : points_matriz}    
            self.Frontier_Point.update(Front_Point)
            
        self.color_list = [0] * len(self.dic_categorias.keys())
        for i in range( len(self.dic_categorias.keys()) ):
            self.color_list[i] = np.random.randint(0, 1000)
            
        #return self.dic_min_dst
    
            
    def plot_muestra_2D(self, col_1, col_2,include_layout=True):
        
        
        door = True
        next_color = 0
        
        
        for key, value in self.dic_categorias.items(): 

            if door:
                fig = go.Figure(data=[go.Scatter(x=value[:, col_1], y=value[:, col_2],
                                                 mode='markers', 
                                                 name= key,
                                                 marker=dict(
                                                     size=6,
                                                     #color=self.color_list[next_color],                # set color to an array/list of desired values
                                                     colorscale='picnic',   # choose a colorscale
                                                     opacity=0.7)
                                                )])
                door = False
                next_color += 1
                
            else:
                
                fig.add_trace(go.Scatter(x=value[:,col_1], y=value[:,col_2],
                                     mode='markers', 
                                     name= key,
                                     marker=dict(
                                         size=6,
                                         #color=self.color_list[next_color],                # set color to an array/list of desired values
                                         colorscale='picnic',   # choose a colorscale
                                         opacity=0.7)
                                    ))
                next_color += 1
              
        if include_layout:
            
            fig.update_layout(
                autosize=False,
                width=600,
                height=600,
                margin=dict(l=0, r=0, b=0, t=10))

            fig.show()
            

    def plot_frontera_2D(self, col_1, col_2):    
        
       
        #self.plot_muestra_2D(col_1, col_2, include_layout=False)
        
        door = True
        next_color = 0
        
        
        for key, value in self.dic_categorias.items(): 

            if door:
                fig = go.Figure(data=[go.Scatter(x=value[:, col_1], y=value[:, col_2],
                                                 mode='markers', 
                                                 name= key,
                                                 marker=dict(
                                                     size=6,
                                                     #color=self.color_list[next_color],                # set color to an array/list of desired values
                                                     colorscale='picnic',   # choose a colorscale
                                                     opacity=0.7)
                                                )])
                door = False
                next_color += 1
                
            else:
                
                fig.add_trace(go.Scatter(x=value[:,col_1], y=value[:,col_2],
                                     mode='markers', 
                                     name= key,
                                     marker=dict(
                                         size=6,
                                         #color=self.color_list[next_color],                # set color to an array/list of desired values
                                         colorscale='picnic',   # choose a colorscale
                                         opacity=0.7)
                                    ))
                next_color += 1
        
        for key, value_dst in self.dic_min_dst.items(): 

            fig.add_trace(go.Scatter(x=value_dst[:,col_1], y=value_dst[:,col_2],
                                     mode='markers', 
                                     name= key,
                                     marker=dict(
                                         symbol=220,
                                         size=14,
                                         color=np.random.randint(100),                # set color to an array/list of desired values
                                         #colorscale='Viridis',   # choose a colorscale
                                         opacity=1)
                                    ))
            
        for key, value_dst in self.Frontier_Point.items(): 

            fig.add_trace(go.Scatter(x=value_dst[:,col_1], y=value_dst[:,col_2],
                                     mode='markers', 
                                     name= key,
                                     marker=dict(
                                         symbol=300,
                                         size=50,
                                         color=np.random.randint(100),                # set color to an array/list of desired values
                                         #colorscale='Viridis',   # choose a colorscale
                                         opacity=1)
                                    ))

        fig.update_layout(
            autosize=True,
            width=800,
            height=600,
            margin=dict(l=10, r=10, b=10, t=20))

        fig.show()
        
    def plot_UMAP(self):    
        
        trans = umap.UMAP(random_state=42).fit(self.X)

        for key, value in self.dic_categorias.items():
            value_UMAP = trans.transform(value)
            dic_categorias_aux ={key:value_UMAP}
            self.dic_categorias_UMAP.update(dic_categorias_aux)

        for key, value in self.dic_min_dst.items():
            value_UMAP = trans.transform(value)
            dic_min_dst_aux ={key:value_UMAP}
            self.dic_min_dst_UMAP.update(dic_min_dst_aux)
        
        door = True
        next_color = 0
        
        
        for key, value in self.dic_categorias_UMAP.items(): 

            if door:
                fig = go.Figure(data=[go.Scatter(x=value[:,0], y=value[:,1],
                                                 mode='markers', 
                                                 name= key,
                                                 marker=dict(
                                                     size=6,
                                                     #color=self.color_list[next_color],                # set color to an array/list of desired values
                                                     colorscale='picnic',   # choose a colorscale
                                                     opacity=0.7)
                                                )])
                door = False
                next_color += 1
                
            else:
                
                fig.add_trace(go.Scatter(x=value[:,0], y=value[:,1],
                                     mode='markers', 
                                     name= key,
                                     marker=dict(
                                         size=6,
                                         #color=self.color_list[next_color],                # set color to an array/list of desired values
                                         colorscale='picnic',   # choose a colorscale
                                         opacity=0.7)
                                    ))
                next_color += 1
        
        for key, value_dst in self.dic_min_dst_UMAP.items(): 

            fig.add_trace(go.Scatter(x=value_dst[:,0], y=value_dst[:,1],
                                     mode='markers', 
                                     name= key,
                                     marker=dict(
                                         symbol=220,
                                         size=14,
                                         color=np.random.randint(100),                # set color to an array/list of desired values
                                         #colorscale='Viridis',   # choose a colorscale
                                         opacity=1)
                                    ))

        fig.update_layout(
            autosize=True,
            width=800,
            height=600,
            margin=dict(l=10, r=10, b=10, t=20))

        fig.show()
        
        
        
     

In [11]:
frontera = Frontera(X,y,2,10)

In [12]:
%%time
frontera.get_frontier()

CPU times: user 3.48 ms, sys: 984 µs, total: 4.47 ms
Wall time: 3.39 ms


In [20]:
frontera.plot_muestra_2D(0,1)

In [21]:
# graph tsne
frontera.plot_UMAP()

In [22]:
frontera.plot_frontera_2D(0,1)

In [230]:
frontera.Frontier_Point

{'Frontier:(0,1)': array([[1.67965135, 2.9108661 ],
        [0.45915817, 2.03250875],
        [1.24757045, 2.60953324]]),
 'Frontier:(0,2)': array([[-0.74550706,  4.49314904],
        [-0.24479135,  3.49139245],
        [-0.29855852,  2.9258399 ]]),
 'Frontier:(1,2)': array([[-0.09482055,  1.53552849],
        [-0.18455952,  1.15409225],
        [ 0.18571869,  2.10326155]])}

In [231]:
def distance(x0,x1):

    # X=(200,3) ; Y=(150,3)
    # diag(X*X')              <> (200,3) * (3,200) > diag(200,200) > (200,1)            X^2   (X-Y)^2
    # 2*X*Y'                  <>(200,3) * (3,150) > (200,150)                          -2XY
    # ones(N,1)*(diag(Y*Y'))' <> (200,1) * (150,1)' > (200,1) * (1,150) >(200,150)      Y^2
    # M_distance = diag(X*X')-2*X*Y'+ones(N,1)*(diag(Y*Y'))'  %%MATLAB
    M_distance = np.reshape( 
        np.diag(np.dot(x0,x0.T)),(-1,1)) - 2*np.dot(x0,x1.T) + np.dot( np.ones((x0.shape[0],1)), np.reshape(np.diag( np.dot(x1,x1.T)).T,(1,-1)) )
    return M_distance

In [88]:
for categories in list(comb_categories):

    dist = self.distance(self.dic_categorias.get(categories[0]), self.dic_categorias.get(categories[1]))

    # row
    row = np.mean(dist, axis=1)
    select_indices_row = np.where( 
        (row > np.percentile(row, self.percentil_min)) & (row < np.percentile(row, self.percentil_max)) )[0]
    min_dst_row = self.dic_categorias.get(categories[0])[select_indices_row]
    dic_min_dst_aux ={categories[0]+'_with_'+categories[1]:min_dst_row} 
    self.dic_min_dst.update(dic_min_dst_aux)

    #Column
    column = np.mean(dist, axis=0)
    select_indices_column = np.where( 
        (column > np.percentile(column, self.percentil_min)) & (column < np.percentile(column, self.percentil_max)) )[0]
    min_dst_column = self.dic_categorias.get(categories[1])[select_indices_column]
    dic_min_dst_aux ={categories[1]+'_with_'+categories[0]:min_dst_column} 
    self.dic_min_dst.update(dic_min_dst_aux)

In [147]:
frontera.dic_min_dst.get(A_with_B).shape

(16, 2)

In [156]:
Frontier_Point = {}

list_all_frontier = [key for key in frontera.dic_min_dst]
list_A_with_B = [nombre for indice, nombre in enumerate(list_all_frontier) if indice%2==0]
list_B_with_A = [nombre for indice, nombre in enumerate(list_all_frontier) if indice%2==1]

for A_with_B, B_with_A in zip(list_A_with_B, list_B_with_A): #obtenemos los valores en cada iteración
    dist = distance(frontera.dic_min_dst.get(A_with_B), frontera.dic_min_dst.get(B_with_A))
    
    #ROW
    min_dist_A_with_B = np.where( dist==np.min(dist) )[0]
    min_A_with_B = frontera.dic_min_dst.get(A_with_B)[min_dist_A_with_B]
    dd = np.delete(frontera.dic_min_dst.get(A_with_B), min_dist_A_with_B, axis=0)
    print(dd.shape)
    #COLUMN
    min_dist_B_with_A = np.where( dist==np.min(dist) )[1]
    min_B_with_A = frontera.dic_min_dst.get(B_with_A)[min_dist_B_with_A]
    
    Point = (np.mean(min_A_with_B+min_B_with_A,axis=0))/2
    Front_Point ={'Frontier:(' + B_with_A.split('_')[-1] + ',' + A_with_B.split('_')[-1] + ')' : Point} 
    Frontier_Point.update(Front_Point)
    
    
    
print(Frontier_Point)


(15, 2)
(15, 2)
(15, 2)
{'Frontier:(0,1)': array([2.2549212 , 2.99383436]), 'Frontier:(0,2)': array([0.25356862, 2.85411382]), 'Frontier:(1,2)': array([-0.40268602,  0.89427824])}


In [132]:
list_B[0]

'X_1_with_X_0'

In [139]:
list_B[0].split('_')[-1]

'0'

In [45]:
for key, value_dst in frontera.dic_min_dst.items():
    print(key)
    

X_0_with_X_1
X_1_with_X_0
X_0_with_X_2
X_2_with_X_0
X_1_with_X_2
X_2_with_X_1


In [36]:
#dic_categorias.get('X_0')

In [37]:
categorias_list = [key for key in dic_categorias]
categorias_list.append('X_')
categorias_list

NameError: name 'dic_categorias' is not defined

In [139]:
combinaciones(categorias_list, 2)

[['X_0', 'X_1'], ['X_0', 'X_'], ['X_1', 'X_']]

In [81]:
dic_categorias ={}

for i in np.unique(y): 
    dic_categorias_aux ={'X_'+str(i):X[y==i]} 
    dic_categorias.update(dic_categorias_aux)

In [82]:
x0 = dic_categorias.get('X_0')

In [83]:
x1 = dic_categorias.get('X_1')

In [84]:
dis = np.reshape(np.diag(np.dot(x0,x0.T)),(-1,1))-2*np.dot(x0,x1.T) + np.dot( np.ones((x0.shape[0],1)), np.reshape(np.diag( np.dot(x1,x1.T)).T,(1,-1)) )

In [94]:
# row
row = np.mean(dis, axis=1)

select_indices_1 = np.where( row < np.percentile(row, 10))[0]
min_dst_x0 = dic_categorias.get('X_0')[select_indices_1]

In [93]:
#Columna
column = np.mean(dis, axis=0)

select_indices_0 = np.where( column < np.percentile(column, 10))[0]
min_dst_x1 = dic_categorias.get('X_1')[select_indices_0]

In [95]:
fig = go.Figure(data=[go.Scatter(x=X[:, 0], y=X[:, 1],
                                 mode='markers',
                                 name='',
                                 marker=dict(
                                     size=6,
                                     color=y,                # set color to an array/list of desired values
                                     colorscale='picnic',   # choose a colorscale
                                     opacity=0.7)
                                )])

fig.add_trace(go.Scatter(x=min_dst_x0[:,0], y=min_dst_x0[:,1],
                         mode='markers', 
                         name="X1 Limit",
                         marker=dict(
                             symbol=12,
                             size=14,
                             color='orange',                # set color to an array/list of desired values
                             #colorscale='Viridis',   # choose a colorscale
                             opacity=1)
                        ))

fig.add_trace(go.Scatter(x=min_dst_x1[:,0], y=min_dst_x1[:,1],
                         mode='markers', 
                         name="X2 Limit",
                         marker=dict(
                             symbol=10,
                             size=14,
                             color='green',                # set color to an array/list of desired values
                             #colorscale='Viridis',   # choose a colorscale
                             opacity=1)
                        ))
fig.update_layout(
    autosize=True,
    width=900,
    height=700,
    margin=dict(l=10, r=10, b=10, t=20))
    
fig.show()

## With Pandas

In [35]:
df = pd.DataFrame(dict(x1=X[:,0], x2=X[:,1], label=y))

In [36]:
df_label_0 = df[df.label == 0]
df_label_1 = df[df.label == 1]

In [37]:
#dist = numpy.linalg.norm(a-b)
data = {'label_0_x1': [], 'label_0_x2': [], 'label_1_x1': [], 'label_1_x2': [], 'distance':[]}  
df_distances = pd.DataFrame(data)  

In [38]:
%%time

index = 0
for index_0, row_label_0 in df_label_0.iterrows():
    for index_1, row_label_1 in df_label_1.iterrows():
        a = row_label_0[:2]
        b = row_label_1[:2]
        distance = np.linalg.norm(a-b)
        df_distances.loc[index] = [row_label_0['x1'], row_label_0['x2'], 
                                   row_label_1['x1'], row_label_1['x2'],
                                   distance]
        index += 1



CPU times: user 51.4 s, sys: 24.4 ms, total: 51.4 s
Wall time: 51.4 s


In [25]:
df_distances

Unnamed: 0,label_0_x1,label_0_x2,label_1_x1,label_1_x2,distance
0,-0.316587,4.570838,2.810663,1.765071,4.201431
1,-0.316587,4.570838,1.925161,0.991617,4.223299
2,-0.316587,4.570838,2.855832,0.975924,4.794544
3,-0.316587,4.570838,2.280876,1.347376,4.139749
4,-0.316587,4.570838,2.619276,-0.386889,5.761801
...,...,...,...,...,...
39995,1.908118,4.643752,0.453431,0.010483,4.856264
39996,1.908118,4.643752,2.129854,-0.179435,4.828282
39997,1.908118,4.643752,2.984773,1.479888,3.342038
39998,1.908118,4.643752,2.775251,-0.205243,4.925918


In [144]:
min_dst_label_0 = df_distances[['label_0_x1','label_0_x2','distance']].groupby(by=['label_0_x1','label_0_x2'], dropna=False). \
	mean().sort_values(by='distance',ascending=True).head(15).reset_index()

In [145]:
min_dst_label_1 = df_distances[['label_1_x1','label_1_x2','distance']].groupby(by=['label_1_x1','label_1_x2'], dropna=False). \
	mean().sort_values(by='distance',ascending=True).head(15).reset_index()

In [194]:
fig = go.Figure(data=[go.Scatter(x=X[:, 0], y=X[:, 1],
                                 mode='markers',
                                 name='',
                                 marker=dict(
                                     size=6,
                                     color=y,                # set color to an array/list of desired values
                                     colorscale='picnic',   # choose a colorscale
                                     opacity=0.7)
                                )])

fig.add_trace(go.Scatter(x=min_dst_label_0.label_0_x1, y=min_dst_label_0.label_0_x2,
                         mode='markers', 
                         name="X1 Limit",
                         marker=dict(
                             symbol=12,
                             size=14,
                             color='orange',                # set color to an array/list of desired values
                             #colorscale='Viridis',   # choose a colorscale
                             opacity=1)
                        ))

fig.add_trace(go.Scatter(x=min_dst_label_1.label_1_x1, y=min_dst_label_1.label_1_x2,
                         mode='markers', 
                         name="X2 Limit",
                         marker=dict(
                             symbol=10,
                             size=14,
                             color='green',                # set color to an array/list of desired values
                             #colorscale='Viridis',   # choose a colorscale
                             opacity=1)
                        ))
fig.update_layout(
    autosize=True,
    width=1000,
    height=800,
    margin=dict(l=10, r=10, b=10, t=20))
    
fig.show()

### 1. Buscar forma matricial
### 2. Función: hasta la selección de puntos limites
### 3. Probar otras formas ej: Espiral suiza con solapamiento - 2D
### 4. Mas dimenciones visualizando con UMAP