In [82]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#EUCLIDEA

class ClassCentroidIdentifier:
    def __init__(self):
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            self.centroids[i, :] = np.median(class_data, axis=0)
        return self.centroids

    def distances_sqr_euclidean(self, X1, X2):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        X1 (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        X2 (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        
        Returns:
        np.array: A 1D array of length n, representing the distances between X1 and X2.
        """
        distances = np.reshape(np.diag((np.dot(X1, X1.T)) - (2 * (np.dot(X1, X2.T))) + (np.ones((X2.shape[1], 1)) * (np.diag(np.dot(X2, X2.T)).T))),(1,-1))
        return distances
    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            # class_data = np.expand_dims(class_data, axis=1)
            centroid = np.expand_dims(self.centroids[i, :], axis=0)
            distances = abs(self.distances_sqr_euclidean(centroid,class_data))#np.linalg.norm(class_data - centroid, axis=-1).flatten()
            furthest_point_index = int(np.percentile(np.argsort(distances), 90))
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def assign_nearby_points(self, data, class_var):
        """
        Function to assign the nearest points to each centroid.
        Points are considered nearest if they are closer to the centroid than the furthest point in the 90th percentile
        for the same class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point after re-assignment.
        np.array: A 1D array of length n, representing the data points that were not re-assigned to any class.
        """
        # Get the furthest points for each class
        furthest_points = self.get_furthest_points(data, class_var)
        # Initialize arrays to store the re-assigned class labels and unassigned data points
        re_assigned_class = np.zeros_like(class_var)
        unassigned_points = []
 

        # Loop through the data points
        for i, d in enumerate(data):
            # Initialize a flag to check if the current data point has been re-assigned to a class
            re_assigned = False
            # Loop through the classes
            for j, c in enumerate(self.centroids):
                # Check if the current data point does not belong to the class
                if class_var[i] != j:
                    # Calculate the Euclidean distance between the data point and the centroid of the current class
                    c = np.expand_dims(c, axis=0)
                    distance = self.distances_sqr_euclidean(c,d)#abs(np.linalg.norm(c - d)) #expandir dimenciones Centroide, y sacar funcion de robin
                    # Calculate the Euclidean distance between the furthest point and the centroid of the current class
                    furthest_distance = abs(self.distances_sqr_euclidean(c,furthest_points[j]))#abs(np.linalg.norm(c - furthest_points[j]))
                    #distance = distance.append(distance)
                    # Check if the distance between the data point and the centroid is less than the distance between the furthest point and the centroid
                    if distance < furthest_distance:
                        # Assign the data point to the class
                        re_assigned_class[i] = re_assigned_class[i]+j
                        re_assigned = True
                        break
            # If the data point was not re-assigned to a class, add it to the list of unassigned points
            if not re_assigned:
                unassigned_points.append(i)
        return re_assigned_class, np.array(unassigned_points)


In [95]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#EUCLIDEA

class ClassCentroidIdentifier:
    def __init__(self):
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            self.centroids[i, :] = np.median(class_data, axis=0)
        return self.centroids

    def distances_sqr_euclidean(self, X1, X2):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        X1 (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        X2 (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        
        Returns:
        np.array: A 1D array of length n, representing the distances between X1 and X2.
        """
        distances = np.reshape(np.diag((np.dot(X1, X1.T)) - (2 * (np.dot(X1, X2.T))) + (np.ones((X2.shape[1], 1)) * (np.diag(np.dot(X2, X2.T)).T))),(1,-1))
        return distances
    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            # class_data = np.expand_dims(class_data, axis=1)
            centroid = np.expand_dims(self.centroids[i, :], axis=0)
            distances = abs(self.distances_sqr_euclidean(centroid,class_data))#np.linalg.norm(class_data - centroid, axis=-1).flatten()
            furthest_point_index = int(np.percentile(np.argsort(distances), 90))
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def assign_nearby_points(self, data, class_var):
        """
        Function to assign the nearest points to each centroid.
        Points are considered nearest if they are closer to the centroid than the furthest point in the 90th percentile
        for the same class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point after re-assignment.
        np.array: A 1D array of length n, representing the data points that were not re-assigned to any class.
        """
        # Get the furthest points for each class
        furthest_points = self.get_furthest_points(data, class_var)
        # Initialize arrays to store the re-assigned class labels and unassigned data points
        re_assigned_class = np.zeros_like(class_var)
        unassigned_points = []

        # Loop through the data points
        for i, d in enumerate(data):
            re_assigned = False
            for j, c in enumerate(self.centroids):
                # Get data points for the current class
                class_data = data[class_var == c]
                # Calculate the distances between the current data point and the centroid for the current class
                c = np.expand_dims(c, axis=0)
                distances = self.distances_sqr_euclidean(class_data, c)#np.linalg.norm(class_data - c, axis=-1).flatten()
                furthest_distance =  self.distances_sqr_euclidean(furthest_points[j],c)
                if np.min(distances) < furthest_distance:
                    # Assign the data point to the class
                    re_assigned_class[i] = c
                    re_assigned = True
                    break
            # If the data point was not re-assigned to a class, add it to the list of unassigned points
            if not re_assigned:
                unassigned_points.append(i)
        return re_assigned_class, np.array(unassigned_points)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Mahalanobis

class ClassCentroidIdentifier:
    def __init__(self):
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            self.centroids[i, :] = np.median(class_data, axis=0)
        return self.centroids
    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            class_data = np.expand_dims(class_data, axis=1)
            centroid = np.expand_dims(self.centroids[i, :], axis=0)
            distances = np.linalg.norm(class_data - centroid, axis=-1).flatten()
            furthest_point_index = int(np.percentile(np.argsort(distances), 90))
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def assign_nearby_points(self, data, class_var):
        """
        Function to assign the nearest points to each centroid.
        Points are considered nearest if they are closer to the centroid than the furthest point in the 90th percentile
        for the same class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point after re-assignment.
        np.array: A 1D array of length n, representing the data points that were not re-assigned to any class.
        """
        # Get the furthest points for each class
        furthest_points = self.get_furthest_points(data, class_var)
        # Initialize arrays to store the re-assigned class labels and unassigned data points
        re_assigned_class = np.zeros_like(class_var)
        unassigned_points = []
 

        # Loop through the data points
        for i, d in enumerate(data):
            # Initialize a flag to check if the current data point has been re-assigned to a class
            re_assigned = False
            # Loop through the classes
            for j, c in enumerate(self.centroids):
                # Check if the current data point does not belong to the class
                if class_var[i] != j:
                    # Calculate the Euclidean distance between the data point and the centroid of the current class
                    distance = abs(np.linalg.norm(c - d)) #expandir dimenciones Centroide, y sacar funcion de robin
                    # Calculate the Euclidean distance between the furthest point and the centroid of the current class
                    furthest_distance = abs(np.linalg.norm(c - furthest_points[j]))
                    #distance = distance.append(distance)
                    # Check if the distance between the data point and the centroid is less than the distance between the furthest point and the centroid
                    if distance < furthest_distance:
                        # Assign the data point to the class
                        re_assigned_class[i] = re_assigned_class[i]+j
                        re_assigned = True
                        break
            # If the data point was not re-assigned to a class, add it to the list of unassigned points
            if not re_assigned:
                unassigned_points.append(i)
        return re_assigned_class, np.array(unassigned_points)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#COSINE

class ClassCentroidIdentifier:
    def __init__(self):
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            self.centroids[i, :] = np.mean(class_data, axis=0)
        return self.centroids


    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            class_data = np.expand_dims(class_data, axis=1) #X
            centroid = np.expand_dims(self.centroids[i, :], axis=0) #Y 
            distances = abs(np.linalg.norm(class_data - centroid, axis=-1).flatten()) 
            furthest_point_index = int(np.percentile(np.argsort(distances), 90))
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def assign_nearby_points(self, data, class_var):
        """
        Function to assign the nearest points to each centroid.
        Points are considered nearest if they are closer to the centroid than the furthest point in the 90th percentile
        for the same class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point after re-assignment.
        np.array: A 1D array of length n, representing the data points that were not re-assigned to any class.
        """
        # Get the furthest points for each class
        furthest_points = self.get_furthest_points(data, class_var)
        # Initialize arrays to store the re-assigned class labels and unassigned data points
        re_assigned_class = np.zeros_like(class_var)
        unassigned_points = []
        
        # Loop through the data points
        for i, d in enumerate(data):
            # Initialize a flag to check if the current data point has been re-assigned to a class
            re_assigned = False
            # Loop through the classes
            for j, c in enumerate(self.centroids):
                # Check if the current data point belongs to the class
                if class_var[i] != j:
                    #re_assigned_class[i] = class_var[i]
                    # Calculate the Euclidean distance between the data point and the centroid of the current class
                    distance = abs(np.dot(d, c) / (np.linalg.norm(d) * np.linalg.norm(c))) #
                    # Calculate the cosine distance between the furthest point and the centroid of the current class
                    furthest_distance = abs(np.dot(furthest_points[j], c) / (np.linalg.norm(furthest_points[j]) * np.linalg.norm(c)))
                    # If the distance from the data point to the centroid is less than the distance from the furthest point
                    # to the centroid, assign the class label of the current class to the data point
                    if distance < furthest_distance: 
                        re_assigned_class[i] = re_assigned_class[i]+(j)
                        re_assigned = True
                        break
            # If the data point was not re-assigned to any class, add it to the list of unassigned points
            if not re_assigned:
                unassigned_points.append(d)
        return re_assigned_class, np.array(unassigned_points) 

In [None]:
# agregar en el codigo de robin, tres distancias.

In [5]:
import numpy as np
import pandas as pd
import time

In [96]:
import numpy as np
import pandas as pd

# Load the iris dataset
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header=None)

# Get the data and class labels
data = df.iloc[:, :-1].values
class_var = df.iloc[:, -1].values

tic = time.time()
# Create an instance of the ClassCentroi class
cci = ClassCentroidIdentifier()

# Set the centroids for each class
centroids = cci.set_centroids(data, class_var)
c = cci.get_furthest_points(data, class_var)
# Assign nearby points to each centroid
re_assigned_class, unassigned_points = cci.assign_nearby_points(data, class_var)
c = cci.get_furthest_points(data, class_var)
toc = time.time()
# Print the number of unassigned points
print(f"Number of unassigned points: {len(unassigned_points)}")
print(f"Number of re_assigned_class points: {len(re_assigned_class)}")
print(centroids)
print(c)
print("Vectorized version:" + str(1000*(toc-tic))+"ms")


elementwise comparison failed; this will raise an error in the future.



ValueError: shapes (0,150,4) and (4,150,0) not aligned: 4 (dim 2) != 150 (dim 1)

kdc 

In [None]:
df_test = pd.DataFrame(data)
df_test['y'] = np.where((df.iloc[:, -1].values)=='Iris-setosa',0,np.where((df.iloc[:, -1].values)=='Iris-versicolor',1,2))
df_test['near'] = re_assigned_class
# df_test['distance'] = distance
df_test['val'] = df_test['y']==df_test['near']
df_test[df_test.val!=True]

In [None]:
df_test['near'].unique()

# 1

In [None]:
pip install umap

In [7]:
import numpy as np
import scipy.stats as stats
from itertools import combinations

import umap

import pandas as pd
from sklearn.datasets import make_blobs


import matplotlib.pyplot as plt # Para crear gráficos con matplotlib
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go

In [46]:
# Simulación de datos
# ==============================================================================
X, y = make_blobs(n_samples = 150,n_features = 2, centers = 3, cluster_std = 1.5, shuffle = True, random_state = 0)

df = pd.DataFrame(X, columns=['feature_1', 'feature_2'])
df['target'] = y
print(df.head())



   feature_1  feature_2  target
0   3.704757   1.880559       1
1  -0.355409   1.332593       0
2   0.454402   4.538311       0
3  -0.366930   4.884141       0
4   3.745221  -0.722234       1


In [51]:
fig = go.Figure(data=[go.Scatter(x=df['feature_1'],  # specify the data for the x-axis                                 
                        y=df['feature_2'],  # specify the data for the y-axis                                 
                        mode='markers',  # set the mode to 'markers' to display individual markers for each data point                                 
                        marker=dict(size=6,  # specify the size of the markers                                             
                        color=df['target'],  # specify the data for the marker color                                             
                        colorscale='picnic',  # choose a colorscale for the marker color                                             
                        opacity=0.7)  # specify the opacity of the markers                                
                        )])


fig.update_layout(autosize=False,  # set autosize to False to manually set the dimensions                  
width=600,  # set the width of the plot                  
height=600,  # set the height of the plot                  
margin=dict(l=0, r=0, b=0, t=10))  # set the margins of the plot# Show the figure
fig.show()

In [52]:
unique_classes = np.unique(y)
print(unique_classes)
centroids = np.zeros((len(unique_classes), X.shape[1]))
for i, c in enumerate(unique_classes):
    class_data = X[y == c]
    centroids[i, :] = np.median(class_data, axis=0)
print(centroids)


[0 1 2]
[[ 0.96300842  4.49178077]
 [ 1.7804256   1.16361524]
 [-1.72852672  2.93363589]]


In [53]:
class_data = X[y == 1]
centroid  = np.expand_dims(centroids[1], axis=0)
distance = np.diag((np.dot(centroid, centroid.T)) - 2 * (np.dot(centroid, class_data.T)) + (np.ones((class_data.shape[1], 1)) * (np.diag(np.dot(class_data, class_data.T)).T)))
# print(distance)
furthest_point_index = int(np.percentile(np.argsort(distance), 90))
print(furthest_point_index)
furthest_points= class_data[furthest_point_index, :].flatten()
furthest_points #clase 0

0


array([3.70475692, 1.88055926])

In [54]:
furthest_points  = np.expand_dims(furthest_points, axis=0)
furthest_points

array([[3.70475692, 1.88055926]])

In [55]:
re_assigned_class = np.zeros_like(y)
unassigned_points = []
no_class_data = X[y != 1]
distance_no = np.diag((np.dot(centroid, centroid.T)) - 2 * (np.dot(centroid, no_class_data.T)) + (np.ones((no_class_data.shape[0], 1)) * (np.diag(np.dot(no_class_data, no_class_data.T)).T)))
distance_further = np.diag((np.dot(centroid, centroid.T)) - 2 * (np.dot(centroid, furthest_points.T)) + (np.ones((furthest_points.shape[0], 1)) * (np.diag(np.dot(furthest_points, furthest_points.T)).T)))
re_assigned_class = np.where(distance_no<distance_further,1,0)
re_assigned_class

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
distance_further

In [None]:
distance_no