In [1]:
#import necessary libraries
import numpy as np
import pandas as pd

In [2]:
class matrix:
    """Numeric dataset read from a header-less CSV file and standardised column by column."""

    # Create Constructor
    def __init__(self,path):
        """Load and standardise the CSV found at `path`.

        Parameters
        ----------
        path : anything accepted by ``pandas.read_csv`` as its first argument.
        """
        # Cluster labels. Defined up front so get_count_frequency() can be
        # called before any clustering has stored labels on the instance.
        self.S = None
        self.load_from_csv(path)

    # Loading the Dataset
    def load_from_csv(self, path):
        """Read `path` with no header row into ``self.array_2d`` and standardise it."""
        data = pd.read_csv(path,header = None)
        self.array_2d = data.to_numpy()
        # Standardise the 2d numpy array
        self.standardise()

    def standardise(self):
        """Replace ``self.array_2d`` with ``(value - column mean) / (column max - column min)``.

        Columns whose max equals their min are set to 0. The result is always a
        float array: scaling in place inside an integer array would truncate
        every scaled value.
        """
        values = self.array_2d.astype(float)
        col_mean = values.mean(axis=0)
        col_range = values.max(axis=0) - values.min(axis=0)
        # Only divide where the range is non-zero; constant columns stay at 0.
        varying = col_range != 0
        scaled = np.zeros_like(values)
        scaled[:, varying] = (values[:, varying] - col_mean[varying]) / col_range[varying]
        self.array_2d = scaled

    def get_distance(self,other_matrix,row_i):
        """Return the Euclidean distance: sqrt of the sum of all squared differences.

        The sum runs over every element of ``other_matrix - row_i``, so the
        result is a single scalar.
        """
        squared_diff = (other_matrix - row_i)** 2
        distance = np.sqrt(np.sum( squared_diff))
        return distance

    def get_weighted_distance(self, other_matrix,weights,row_i):
        """Return the weighted Euclidean distance from each row of `other_matrix` to `row_i`.

        Squared differences are multiplied by `weights` and summed along
        axis 1, giving one distance per row of `other_matrix`.
        """
        weighted_squared_diff = ((other_matrix - row_i)** 2 )* weights
        weighted_distance = np.sqrt(np.sum(weighted_squared_diff, axis=1))
        return weighted_distance

    def get_count_frequency(self):
        """Return ``{label: number of entries in self.S equal to label}``.

        Returns 0 when no labels have been stored (``self.S`` is None).
        """
        if self.S is None:
            return 0
        # One entry per distinct cluster label found in self.S
        return {label: (self.S == label).sum() for label in np.unique(self.S)}

In [3]:
def get_initial_weights(m):
    """Return `m` random weights, drawn with ``np.random.rand`` and scaled to sum to 1."""
    raw = np.random.rand(m)
    # Normalise so the weights add up to one
    return raw / raw.sum()

In [4]:
def get_centroids(matrix,S,K):
    """Return a (K, n_columns) array whose row k is the column-wise mean of the rows labelled k.

    Rows of `matrix` are selected where `S` equals the cluster id. A cluster
    with no members keeps an all-zero centroid.
    """
    n_columns = np.shape(matrix)[1]
    centroids = np.zeros((K, n_columns))
    for k in range(K):
        # Row positions whose label equals k
        members = np.nonzero(S == k)[0]
        if len(members) == 0:
            continue
        centroids[k] = matrix[members, :].mean(axis=0)
    return centroids

In [5]:
def selecting_centroids(matrix,K):
    """Pick K distinct rows of `matrix` at random.

    Returns a tuple ``(rows, indices)``: the selected rows and the row
    indices they were taken from (sampled without replacement).
    """
    n_rows = matrix.shape[0]
    chosen = np.random.choice(n_rows, size=K, replace=False)
    return matrix[chosen, :], chosen

In [6]:
def get_separation_within(matrix, centroids,S,K):
    """Return an array shaped like `matrix`, filled with the accumulated within-cluster distance.

    For every cluster k that has at least one member in `S`, the Euclidean
    distance between `matrix` (all of its elements) and ``centroids[k]`` is
    added once per member of that cluster. The scalar total is broadcast over
    an array with the shape of `matrix`.

    The distance is computed inline, so the function does not depend on a
    module-level ``m`` object existing.

    NOTE(review): the distance is taken over the whole `matrix` argument, not
    over the rows that belong to cluster k — confirm this is the intended
    definition of within-cluster separation.
    """
    a = np.zeros(np.shape(matrix))
    for k in range(K):
        members = np.count_nonzero(S == k)
        if members:
            # The distance does not depend on the individual member, so it is
            # computed once and scaled by the cluster size.
            a += members * np.sqrt(np.sum((matrix - centroids[k]) ** 2))
    return a

In [7]:
def get_separation_between(matrix, centroids,S,K):
    """Return an array shaped like `matrix`, filled with the accumulated between-cluster term.

    For each cluster k the Euclidean distance between ``centroids[k]`` and the
    mean of that centroid's own components is multiplied by the number of
    entries of `S` equal to k; the scalar total is broadcast over an array
    with the shape of `matrix`. Only the shape of `matrix` is used.

    The distance is computed inline, so the function does not depend on a
    module-level ``m`` object existing.

    NOTE(review): the reference point is the mean of the centroid's own
    components, not a mean over the dataset — confirm this is intended.
    """
    b = np.zeros(np.shape(matrix))
    for k in range(K):
        centroid = centroids[k]
        spread = np.sqrt(np.sum((centroid - np.mean(centroid)) ** 2))
        b += np.count_nonzero(S == k) * spread
    return b

In [8]:
def get_new_weights(matrix,centroids,old_weights,S,K):
    """Return updated weights, one per entry of `old_weights`.

    For each index i in ``range(len(old_weights))`` the ratio
    ``get_separation_between(matrix[i], ...) / get_separation_within(matrix[i], ...)``
    is computed once. The new weight is
    ``0.5 * (old_weights[i] + ratio_i / sum of all ratios)``.

    Both separation helpers return constant-valued arrays, so only the first
    component of each ratio is needed for the result.

    NOTE(review): `matrix[i]` selects row i of `matrix` while i runs over the
    number of weights, so only the first ``len(old_weights)`` rows are used —
    confirm whether a per-column computation was intended.
    """
    n_weights = len(old_weights)

    # Evaluate every between/within ratio exactly once and keep a running total.
    ratios = []
    total = np.zeros((n_weights,))
    for i in range(n_weights):
        ratio = get_separation_between(matrix[i],centroids,S,K) / get_separation_within(matrix[i],centroids,S,K)
        ratios.append(ratio)
        total += ratio

    # Average the old weight with the normalised ratio.
    new_weights = np.zeros(n_weights)
    for i in range(n_weights):
        new_weights[i] = 0.5 * (old_weights[i] + ratios[i][0] / total[0])
    return new_weights

In [9]:
def _assign_to_nearest(matrix, centroids, weights):
    """Return an (n_rows, 1) float array with the index of the nearest centroid for each row."""
    distance = np.array([m.get_weighted_distance(matrix, weights, c) for c in centroids])
    # First minimum per row, ignoring NaNs and with no magic upper bound on
    # the distance; raises ValueError if a row has only NaN distances.
    nearest = np.nanargmin(distance, axis=0)
    return nearest.reshape(-1, 1).astype(float)


def get_groups(matrix,K):
    """Cluster the rows of `matrix` into K groups and return their labels.

    Starts from random weights and K randomly selected rows as centroids,
    then alternates between recomputing centroids and weights and reassigning
    every row to its nearest centroid (weighted distance). Stops when an
    assignment pass leaves the labels unchanged.

    Parameters
    ----------
    matrix : 2-D array of data rows.
    K : number of clusters.

    Returns
    -------
    (n_rows, 1) float array of cluster indices in ``range(K)``.

    Side effects
    ------------
    Uses the module-level ``m`` (created by ``run_test``) for the weighted
    distance and stores the final labels on it as ``m.S``.
    """
    n_features = np.shape(matrix)[1]
    weights = get_initial_weights(n_features)

    # Select initial centroids; their source row numbers are not needed.
    centroids, _ = selecting_centroids(matrix,K)
    S = _assign_to_nearest(matrix, centroids, weights)

    while True:
        # Update centroids and weights from the current assignment ...
        centroids = get_centroids(matrix,S,K)
        weights = get_new_weights(matrix,centroids,weights,S,K)

        # ... then reassign every row to the nearest updated centroid.
        new_S = _assign_to_nearest(matrix, centroids, weights)

        # If the cluster assignments have not changed, return the result
        if np.array_equal(new_S, S):
            m.S = new_S
            return new_S
        S = new_S
    
    

In [10]:
def run_test(path='data.csv', k_values=range(2,11), repeats=20):
    """Cluster the dataset repeatedly and print the cluster sizes of every run.

    Parameters
    ----------
    path : CSV file to load (default ``'data.csv'``).
    k_values : iterable of cluster counts to try (default 2..10).
    repeats : number of runs per cluster count (default 20).

    Side effects
    ------------
    Rebinds the module-level ``m`` to the loaded ``matrix``; the clustering
    functions read it. Prints one ``K={label: count, ...}`` line per run.
    """
    global m
    m = matrix(path)
    for k in k_values:
        for _ in range(repeats):
            # get_groups stores the labels on m; the return value is not needed here.
            get_groups(m.array_2d,k)
            print(str(k)+'='+str(m.get_count_frequency()))

In [11]:
# Execute the experiment; run_test() prints one line of cluster sizes per run.
run_test()

2={0.0: 70, 1.0: 108}
2={0.0: 89, 1.0: 89}
2={0.0: 72, 1.0: 106}
2={0.0: 85, 1.0: 93}
2={0.0: 109, 1.0: 69}
2={0.0: 108, 1.0: 70}
2={0.0: 109, 1.0: 69}
2={0.0: 89, 1.0: 89}
2={0.0: 70, 1.0: 108}
2={0.0: 109, 1.0: 69}
2={0.0: 110, 1.0: 68}
2={0.0: 69, 1.0: 109}
2={0.0: 86, 1.0: 92}
2={0.0: 69, 1.0: 109}
2={0.0: 108, 1.0: 70}
2={0.0: 89, 1.0: 89}
2={0.0: 89, 1.0: 89}
2={0.0: 89, 1.0: 89}
2={0.0: 89, 1.0: 89}
2={0.0: 70, 1.0: 108}
3={0.0: 63, 1.0: 51, 2.0: 64}
3={0.0: 63, 1.0: 51, 2.0: 64}
3={0.0: 54, 1.0: 64, 2.0: 60}
3={0.0: 55, 1.0: 60, 2.0: 63}
3={0.0: 54, 1.0: 63, 2.0: 61}
3={0.0: 60, 1.0: 63, 2.0: 55}
3={0.0: 63, 1.0: 60, 2.0: 55}
3={0.0: 64, 1.0: 52, 2.0: 62}
3={0.0: 61, 1.0: 54, 2.0: 63}
3={0.0: 60, 1.0: 55, 2.0: 63}
3={0.0: 64, 1.0: 60, 2.0: 54}
3={0.0: 55, 1.0: 60, 2.0: 63}
3={0.0: 60, 1.0: 64, 2.0: 54}
3={0.0: 64, 1.0: 63, 2.0: 51}
3={0.0: 55, 1.0: 60, 2.0: 63}
3={0.0: 61, 1.0: 55, 2.0: 62}
3={0.0: 62, 1.0: 51, 2.0: 65}
3={0.0: 51, 1.0: 65, 2.0: 62}
3={0.0: 66, 1.0: 61, 2.0: 51

9={0.0: 28, 1.0: 14, 2.0: 23, 3.0: 7, 4.0: 21, 5.0: 16, 6.0: 20, 7.0: 36, 8.0: 13}
10={0.0: 21, 1.0: 21, 2.0: 19, 3.0: 1, 4.0: 22, 5.0: 41, 6.0: 10, 7.0: 20, 8.0: 5, 9.0: 18}
10={0.0: 36, 1.0: 21, 2.0: 19, 3.0: 3, 4.0: 18, 5.0: 17, 6.0: 17, 7.0: 16, 8.0: 22, 9.0: 9}
10={0.0: 14, 1.0: 30, 2.0: 19, 3.0: 11, 4.0: 20, 5.0: 21, 6.0: 17, 7.0: 16, 8.0: 26, 9.0: 4}
10={0.0: 23, 1.0: 17, 2.0: 27, 3.0: 16, 4.0: 17, 5.0: 14, 6.0: 2, 7.0: 22, 8.0: 19, 9.0: 21}
10={0.0: 2, 1.0: 8, 2.0: 15, 3.0: 20, 4.0: 7, 5.0: 40, 6.0: 32, 7.0: 40, 8.0: 13, 9.0: 1}
10={0.0: 54, 1.0: 13, 2.0: 22, 3.0: 4, 4.0: 11, 5.0: 15, 6.0: 23, 7.0: 15, 8.0: 11, 9.0: 10}
10={0.0: 52, 1.0: 13, 2.0: 15, 3.0: 11, 4.0: 23, 5.0: 4, 6.0: 11, 7.0: 13, 8.0: 21, 9.0: 15}
10={0.0: 25, 1.0: 7, 2.0: 7, 3.0: 20, 4.0: 25, 5.0: 28, 6.0: 23, 7.0: 13, 8.0: 7, 9.0: 23}
10={0.0: 17, 1.0: 10, 2.0: 36, 3.0: 17, 4.0: 19, 5.0: 14, 6.0: 28, 7.0: 6, 8.0: 9, 9.0: 22}
10={0.0: 18, 1.0: 19, 2.0: 20, 3.0: 36, 4.0: 10, 5.0: 38, 6.0: 11, 7.0: 4, 8.0: 15, 9.0: