In [25]:
#############################
import numpy as np

def set_centroids(data, class_var):
    """
    Function to set the centroids for each class
    """
    unique_classes = np.unique(class_var)
    centroids = np.zeros((len(unique_classes), data.shape[1]))
    for i, c in enumerate(unique_classes):
        class_data = data[class_var == c]
        centroids[i, :] = np.mean(class_data, axis=0)
    return centroids

def evaluate_distance(data, centroids):
    """
    Function to evaluate the Euclidean distance between each data point and centroids
    """
    distances = np.zeros((len(data), centroids.shape[0]))
    for i in range(centroids.shape[0]):
        diff = data - centroids[i]
        distances[:, i] = np.sqrt(np.sum(diff**2, axis=1))
    return distances

def identify_25_percent(distances):
    """
    Function to identify the 25% nearest data points for each class
    """
    nearest_indices = np.argsort(distances, axis=0)[:int(0.25*len(distances))]
    return nearest_indices



In [11]:
# Input data
data = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])

# Input class variable
class_var = np.array([0, 0, 1, 1, 1, 0])

# Set the centroids for each class
centroids = set_centroids(data, class_var)
print(centroids)
# Evaluate the distances between each data point and centroids
distances = evaluate_distance(data, centroids)
print(distances)
# Identify the 25% nearest data points for each class
nearest_indices = identify_25_percent(distances)
print(nearest_indices)

[[3. 4.]
 [4. 5.]]
[[2.82842712 4.24264069]
 [1.41421356 2.82842712]
 [0.         1.41421356]
 [1.41421356 0.        ]
 [2.82842712 1.41421356]
 [4.24264069 2.82842712]]
[[2 3]]


In [8]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_iris
iris = load_iris()
df_iris = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])
df_iris.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [30]:
# Set the centroids for each class
X = df_iris.drop('target', axis=1)
y = df_iris['target']
centroids = set_centroids(df_iris, df_iris.target)
print(centroids)
# Evaluate the distances between each data point and centroids
distances = evaluate_distance(df_iris, centroids)
print(distances)
# Identify the 25% nearest data points for each class
nearest_indices = identify_25_percent(distances)
print(nearest_indices)

[[5.006 3.428 1.462 0.246 0.   ]
 [5.936 2.77  4.26  1.326 1.   ]
 [6.588 2.974 5.552 2.026 2.   ]]
[[0.14135063 3.41749499 5.2023264 ]
 [0.44763825 3.40200999 5.23673562]
 [0.4171091  3.5696039  5.38789384]
 [0.52533799 3.4320944  5.2650736 ]
 [0.18862662 3.46590133 5.24280459]
 [0.67703767 3.1583654  4.87345873]
 [0.4151867  3.5233609  5.32464083]
 [0.06618157 3.34042991 5.14355908]
 [0.80745278 3.5810155  5.41905896]
 [0.37627118 3.36197442 5.1955173 ]
 [0.4824728  3.3222992  5.06866846]
 [0.25373214 3.32127566 5.13137408]
 [0.50077939 3.46786274 5.30529924]
 [0.91322505 3.90103473 5.72410692]
 [1.01409073 3.62806725 5.28656788]
 [1.20481534 3.49108465 5.08769103]
 [0.6542018  3.49045441 5.19540181]
 [0.1441527  3.38586355 5.16807508]
 [0.82436642 3.11815202 4.82574347]
 [0.38933276 3.38083895 5.12778705]
 [0.46344363 3.08312698 4.8749359 ]
 [0.3286031  3.3224798  5.07974409]
 [0.64029681 3.92036631 5.69201195]
 [0.38259639 3.03573253 4.84633882]
 [0.48701129 3.08623914 4.90405954]


In [14]:
def set_centroids(data, class_var):
    """
    Function to set the centroids for each class
    """
    unique_classes = np.unique(class_var)
    centroids = np.zeros((len(unique_classes), data.shape[1]))
    for i, c in enumerate(unique_classes):
        class_data = data[class_var == c, :]
        centroids[i, :] = np.mean(class_data, axis=0)
    return centroids

In [22]:
df_iris[df_iris.target == c, :]

TypeError: '(0       True
1       True
2       True
3       True
4       True
       ...  
145    False
146    False
147    False
148    False
149    False
Name: target, Length: 150, dtype: bool, slice(None, None, None))' is an invalid key

In [24]:
unique_classes = np.unique(y)
centroids = np.zeros((len(unique_classes), df_iris.shape[1]))
centroids
for i, c in enumerate(unique_classes):
    class_data = df_iris[df_iris.target == c]
    centroids[i, :] = np.mean(class_data, axis=0)
centroids

array([[5.006, 3.428, 1.462, 0.246, 0.   ],
       [5.936, 2.77 , 4.26 , 1.326, 1.   ],
       [6.588, 2.974, 5.552, 2.026, 2.   ]])

In [None]:
    unique_classes = np.unique(y)
    centroids = np.zeros((len(unique_classes), data.shape[1]))
    for i, c in enumerate(unique_classes):
        class_data = df_iris[y == c, :]
        centroids[i, :] = np.mean(class_data, axis=0)
    return centroids