In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

class ClassCentroidIdentifier:
    def __init__(self):
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            self.centroids[i, :] = np.mean(class_data, axis=0)
        return self.centroids
    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        unique_classes = np.unique(class_var)
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        for i, c in enumerate(unique_classes):
            class_data = data[class_var == c]
            class_data = np.expand_dims(class_data, axis=1)
            centroid = np.expand_dims(self.centroids[i, :], axis=0)
            distances = np.linalg.norm(class_data - centroid, axis=-1).flatten()
            furthest_point_index = int(np.percentile(np.argsort(distances), 90))
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def get_class_var(self, data, centroids):
        """
        Function to get the class label for each data point based on the nearest centroid.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        centroids (np.array): A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point.
        """
        distances = np.zeros((data.shape[0], centroids.shape[0]))
        for i, c in enumerate(centroids):
            distances[:, i] = np.linalg.norm(data - c, axis=1)
        self.nearest_indices = np.argmin(distances, axis=1)
        return self.nearest_indices

    def assign_to_nearest_class(self, data):
        distances = np.zeros((data.shape[0], self.centroids.shape[0]))
        for i, c in enumerate(self.centroids):
            distances[:, i] = np.linalg.norm(data - c, axis=1)
        nearest_class = np.argmin(distances, axis=1)
        for i, nc in enumerate(nearest_class):
            if distances[i, nc] < self.furthest_points[nc]:
                self.nearest_indices = np.concatenate((self.nearest_indices, [nc]))
        return nearest_class


In [8]:
import numpy as np
import pandas as pd

# Load the iris dataset
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header=None)
data = df.values[:, :-1]
class_var = df.values[:, -1]

# Initialize the class centroid identifier
cc = ClassCentroidIdentifier()

# Set the centroids for each class
centroids = cc.set_centroids(data, class_var)

# Get the furthest data point in the 90th percentile for each class
furthest_points = cc.get_furthest_points(data, class_var)
cc.furthest_points = furthest_points

# Assign each data point to its nearest class
nearest_class = cc.assign_to_nearest_class(data)

print("Nearest class:", nearest_class)
print("Nearest indices:", cc.nearest_indices)


TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

class ClassCentroidIdentifier:
    def __init__(self):
        # Initialize the centroids and nearest indices as None
        self.centroids = None
        self.nearest_indices = None
    
    def set_centroids(self, data, class_var):
        """
        Function to set the centroids for each class.
        The centroid of a class is calculated as the mean of the data points belonging to that class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        
        # Get unique classes
        unique_classes = np.unique(class_var)
        
        # Initialize centroids with zeros with the shape of (number of unique classes, number of features)
        self.centroids = np.zeros((len(unique_classes), data.shape[1]))
        
        # Loop through each unique class
        for i, c in enumerate(unique_classes):
            
            # Get the data points for the current class
            class_data = data[class_var == c]
            
            # Calculate the mean of the data points for the current class and set it as the centroid for that class
            self.centroids[i, :] = np.mean(class_data, axis=0)
        
        # Return the centroids
        return self.centroids
    
    def get_furthest_points(self, data, class_var):
        """
        Function to get the furthest data point in the 90th percentile for each class.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        class_var (np.array): A 1D array of length n, representing the class label for each data point.
        
        Returns:
        np.array: A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        """
        # Get unique classes from the class_var
        unique_classes = np.unique(class_var)
        # Initialize an array to store the furthest points for each class
        furthest_points = np.zeros((len(unique_classes), data.shape[1]))
        # Loop through the unique classes
        for i, c in enumerate(unique_classes):
            # Get data points for the current class
            class_data = data[class_var == c]
            # Expand the dimension of class data so that it can be subtracted from the centroid
            class_data = np.expand_dims(class_data, axis=1)
            # Expand the dimension of the centroid of the current class
            centroid = np.expand_dims(self.centroids[i, :], axis=0)
            # Calculate the Euclidean distances between the data points and the centroid
            distances = np.linalg.norm(class_data - centroid, axis=-1).flatten()
            # Find the index of the data point with the 90th percentile distance from the centroid
            furthest_point_index = np.argsort(distances)[-int(0.9*len(distances))]
            # Store the furthest data point for the current class
            furthest_points[i, :] = class_data[furthest_point_index, :].flatten()
        return furthest_points

    def get_class_var(self, data, centroids):
        """
        Function to get the class label for each data point based on the nearest centroid.
        
        Parameters:
        data (np.array): A 2D array of shape (n, m) where n is the number of data points and m is the number of features.
        centroids (np.array): A 2D array of shape (k, m) where k is the number of unique classes and m is the number of features.
        
        Returns:
        np.array: A 1D array of length n, representing the class label for each data point.
        """
        # Initialize distances matrix to store the distances of each data point to each centroid
        distances = np.zeros((data.shape[0], centroids.shape[0]))
        
        # Loop over each centroid to calculate the distances from each data point to that centroid
        for i, c in enumerate(centroids):
            distances[:, i] = np.linalg.norm(data - c, axis=1)
        
        # Get the indices of the nearest centroid for each data point
        self.nearest_indices = np.argmin(distances, axis=1)
        
        # Return the class label for each data point based on the nearest centroid
        return self.nearest_indices
        
    def assign_to_nearest_class(self, data):
        # Initialize distances matrix to store the distances of each data point to each centroid
        distances = np.zeros((data.shape[0], self.centroids.shape[0]))
        
        # Loop over each centroid to calculate the distances from each data point to that centroid
        for i, c in enumerate(self.centroids):
            distances[:, i] = np.linalg.norm(data - c, axis=1)
        
        # Get the indices of the nearest centroid for each data point
        nearest_class = np.argmin(distances, axis=1)
        
        # Loop over each data point and check if its distance to its nearest centroid is less than the furthest distance
        # of that class. If so, add the data point to the nearest indices of that class.
        for i, nc in enumerate(nearest_class):
            if distances[i, nc] < self.furthest_points[nc]:
                self.nearest_indices = np.concatenate((self.nearest_indices, [nc]))
        
        # Return the indices of the nearest centroid for each data point
        return nearest_class