In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#EUCLIDEA

class ClassCentroidIdentifier:
    def __init__(self):
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            self.centroids[i, :] = np.median(class_data, axis=0)
        return self.centroids
    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            class_data = np.expand_dims(class_data, axis=1)
            centroid = np.expand_dims(self.centroids[i, :], axis=0)
            distances = np.linalg.norm(class_data - centroid, axis=-1).flatten()
            furthest_point_index = int(np.percentile(np.argsort(distances), 90))
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def assign_nearby_points(self, data, class_var):
        """
        Function to assign the nearest points to each centroid.
        Points are considered nearest if they are closer to the centroid than the furthest point in the 90th percentile
        for the same class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point after re-assignment.
        np.array: A 1D array of length n, representing the data points that were not re-assigned to any class.
        """
        # Get the furthest points for each class
        furthest_points = self.get_furthest_points(data, class_var)
        # Initialize arrays to store the re-assigned class labels and unassigned data points
        re_assigned_class = np.zeros_like(class_var)
        unassigned_points = []
 

        # Loop through the data points
        for i, d in enumerate(data):
            # Initialize a flag to check if the current data point has been re-assigned to a class
            re_assigned = False
            # Loop through the classes
            for j, c in enumerate(self.centroids):
                # Check if the current data point does not belong to the class
                if class_var[i] != j:
                    # Calculate the Euclidean distance between the data point and the centroid of the current class
                    distance = abs(np.linalg.norm(c - d)) #expandir dimenciones Centroide, y sacar funcion de robin
                    # Calculate the Euclidean distance between the furthest point and the centroid of the current class
                    furthest_distance = abs(np.linalg.norm(c - furthest_points[j]))
                    #distance = distance.append(distance)
                    # Check if the distance between the data point and the centroid is less than the distance between the furthest point and the centroid
                    if distance < furthest_distance:
                        # Assign the data point to the class
                        re_assigned_class[i] = re_assigned_class[i]+j
                        re_assigned = True
                        break
            # If the data point was not re-assigned to a class, add it to the list of unassigned points
            if not re_assigned:
                unassigned_points.append(i)
        return re_assigned_class, np.array(unassigned_points)


In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#COSINE

class ClassCentroidIdentifier:
    def __init__(self):
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            self.centroids[i, :] = np.mean(class_data, axis=0)
        return self.centroids
    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            class_data = np.expand_dims(class_data, axis=1) #X
            centroid = np.expand_dims(self.centroids[i, :], axis=0) #Y 
            distances = abs(np.linalg.norm(class_data - centroid, axis=-1).flatten()) #diag(dot(X*X.T))-(2(dot(X*Y.T)))+dot((NX1)==1)*diag((Y*Y.T).T))
            furthest_point_index = int(np.percentile(np.argsort(distances), 90))
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def assign_nearby_points(self, data, class_var):
        """
        Function to assign the nearest points to each centroid.
        Points are considered nearest if they are closer to the centroid than the furthest point in the 90th percentile
        for the same class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point after re-assignment.
        np.array: A 1D array of length n, representing the data points that were not re-assigned to any class.
        """
        # Get the furthest points for each class
        furthest_points = self.get_furthest_points(data, class_var)
        # Initialize arrays to store the re-assigned class labels and unassigned data points
        re_assigned_class = np.zeros_like(class_var)
        unassigned_points = []
        
        # Loop through the data points
        for i, d in enumerate(data):
            # Initialize a flag to check if the current data point has been re-assigned to a class
            re_assigned = False
            # Loop through the classes
            for j, c in enumerate(self.centroids):
                # Check if the current data point belongs to the class
                if class_var[i] != j:
                    #re_assigned_class[i] = class_var[i]
                    # Calculate the Euclidean distance between the data point and the centroid of the current class
                    distance = abs(np.dot(d, c) / (np.linalg.norm(d) * np.linalg.norm(c))) #
                    # Calculate the cosine distance between the furthest point and the centroid of the current class
                    furthest_distance = abs(np.dot(furthest_points[j], c) / (np.linalg.norm(furthest_points[j]) * np.linalg.norm(c)))
                    # If the distance from the data point to the centroid is less than the distance from the furthest point
                    # to the centroid, assign the class label of the current class to the data point
                    if distance < furthest_distance: 
                        re_assigned_class[i] = re_assigned_class[i]+(j)
                        re_assigned = True
                        break
            # If the data point was not re-assigned to any class, add it to the list of unassigned points
            if not re_assigned:
                unassigned_points.append(d)
        return re_assigned_class, np.array(unassigned_points) 

In [None]:
# agregar en el codigo de robin, tres distancias.

In [10]:
import numpy as np
import pandas as pd

# Load the iris dataset
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header=None)

# Get the data and class labels
data = df.iloc[:, :-1].values
class_var = df.iloc[:, -1].values

# Create an instance of the ClassCentroi class
cci = ClassCentroidIdentifier()

# Set the centroids for each class
centroids = cci.set_centroids(data, class_var)

# Assign nearby points to each centroid
re_assigned_class, unassigned_points = cci.assign_nearby_points(data, class_var)
c = cci.get_furthest_points(data, class_var)
# Print the number of unassigned points
print(f"Number of unassigned points: {len(unassigned_points)}")
print(f"Number of re_assigned_class points: {len(re_assigned_class)}")
print(centroids)
print(c)


Number of unassigned points: 87
Number of re_assigned_class points: 150
[[5.   3.4  1.5  0.2 ]
 [5.9  2.8  4.35 1.3 ]
 [6.5  3.   5.55 2.  ]]
[[5.1 3.8 1.9 0.4]
 [5.6 2.7 4.2 1.3]
 [6.7 3.3 5.7 2.5]]


kdc 

In [11]:
df_test = pd.DataFrame(data)
df_test['y'] = np.where((df.iloc[:, -1].values)=='Iris-setosa',0,np.where((df.iloc[:, -1].values)=='Iris-versicolor',1,2))
df_test['near'] = re_assigned_class
# df_test['distance'] = distance
df_test['val'] = df_test['y']==df_test['near']
df_test[df_test.val!=True]

Unnamed: 0,0,1,2,3,y,near,val
50,7.0,3.2,4.7,1.4,1,0,False
51,6.4,3.2,4.5,1.5,1,0,False
52,6.9,3.1,4.9,1.5,1,0,False
53,5.5,2.3,4.0,1.3,1,0,False
54,6.5,2.8,4.6,1.5,1,0,False
...,...,...,...,...,...,...,...
141,6.9,3.1,5.1,2.3,2,0,False
142,5.8,2.7,5.1,1.9,2,0,False
144,6.7,3.3,5.7,2.5,2,0,False
146,6.3,2.5,5.0,1.9,2,0,False


In [12]:
df_test['near'].unique()

array([0, 1, 2], dtype=object)