From 50fdb9b43229e54e5d2780facde65245e0e15cbf Mon Sep 17 00:00:00 2001
From: Jere Lavikainen <61044352+jerela@users.noreply.github.com>
Date: Wed, 29 Nov 2023 14:56:32 +0200
Subject: [PATCH] Added column vector util func, fixed clustering

- added column() function to utils to create a column vector from a 1D list
- fixed typos in density clustering equations
- made hard k-means algorithm also return the cluster label for each data point
- improved type hints and docstrings
---
 documentation/mola.clustering.html | 18 +++++-----------
 documentation/mola.matrix.html     |  4 ++--
 documentation/mola.regression.html |  4 ++--
 documentation/mola.utils.html      | 15 ++++++++++++--
 mola/clustering.py                 | 33 ++++++++++++++++++------------
 mola/matrix.py                     |  2 +-
 mola/regression.py                 |  4 ++--
 mola/utils.py                      | 29 ++++++++++++++++++++++----
 tests/clustering_test.py           |  2 +-
 9 files changed, 71 insertions(+), 40 deletions(-)

diff --git a/documentation/mola.clustering.html b/documentation/mola.clustering.html
index 1d68d87..61f1ee6 100644
--- a/documentation/mola.clustering.html
+++ b/documentation/mola.clustering.html
@@ -41,7 +41,7 @@ Arguments:
 p1 -- list: the first point
 p2 -- list: the second point
-find_c_means(data: mola.matrix.Matrix, num_centers=2, max_iterations=100, distance_function=<function distance_euclidean_pow at 0x0000023FFD4614C0>, initial_centers=None)
-Return the cluster centers and the membership matrix of points using soft k-means clustering (also known as fuzzy c-means).
+find_c_means(data: mola.matrix.Matrix, num_centers=2, max_iterations=100, distance_function=<function distance_euclidean_pow at 0x000002B30AB56670>, initial_centers=None)
+Return the cluster centers and the membership matrix of points using soft k-means clustering (also known as fuzzy c-means).
 
 Fuzzy c-means clustering is an iterative algorithm that finds the cluster centers by first assigning each point to each cluster center with a certain membership value (0 to 1) and then updating the cluster centers to be the weighted mean of the points assigned to them. This process is repeated for a set number of iterations or until the cluster centers converge. The initial cluster centers are either randomized or given by the user.
 A major difference between hard k-means clustering and fuzzy c-means clustering is that in fuzzy c-means clustering, the points may belong partially to several clusters instead of belonging completely to one cluster, like in hard k-means clustering. Therefore, this algorithm is well-suited to cluster data that is not clearly separable into distinct clusters (e.g., symmetric distribution of data points).
@@ -59,9 +59,9 @@ Arguments:
 data -- Matrix: the data containing the points to be clustered
 num_centers -- int: the number of cluster centers to be found (default 2)
-beta -- float: the width of the Gaussian function (default 0.5)
-sigma -- float: the width of the Gaussian function (default 0.5)
-find_k_means(data: mola.matrix.Matrix, num_centers=2, max_iterations=100, distance_function=<function distance_euclidean_pow at 0x0000023FFD4614C0>, initial_centers=None)
-Return the cluster centers using hard k-means clustering.
+beta -- float: the width of the Gaussian function (default 0.5) used to destruct the mountain function
+sigma -- float: the width of the Gaussian function (default 0.5) used to construct the mountain function
+find_k_means(data: mola.matrix.Matrix, num_centers=2, max_iterations=100, distance_function=<function distance_euclidean_pow at 0x000002B30AB56670>, initial_centers=None) -> mola.matrix.Matrix
+Return the cluster centers using hard k-means clustering.
 
 K-means clustering is an iterative algorithm that finds the cluster centers by first assigning each point to the closest cluster center and then updating the cluster centers to be the mean of the points assigned to them. This process is repeated for a set number of iterations or until the cluster centers converge. The initial cluster centers are either randomized or given by the user.
 
@@ -74,13 +74,5 @@ distance_function -- function: the distance function to be used (default Euclidean distance); options are squared Euclidean distance (distance_euclidean_pow) and taxicab distance (distance_taxicab)
 initial_centers -- Matrix: the initial cluster centers; if not specified, they are initialized randomly (default None)
 random() method of random.Random instance
 random() -> x in the interval [0, 1).
-Data
-INFINITE = 4294967295
-INFINITY = inf
+
\ No newline at end of file
diff --git a/documentation/mola.matrix.html b/documentation/mola.matrix.html
index beac1fa..9851e36 100644
--- a/documentation/mola.matrix.html
+++ b/documentation/mola.matrix.html
@@ -270,7 +270,7 @@
 
 make_identity(self) -> None
 Set all diagonal elements of the matrix to 1 and all non-diagonal elements to 0.
-norm_Euclidean(self)
-Return the Euclidean norm of the matrix.
+norm_Euclidean(self) -> float
+Return the Euclidean norm of the matrix.
 row_is_zeros(self, r: int) -> bool
 Return true if all elements in the row are zero-valued. Otherwise, return false.
 
@@ -493,7 +493,7 @@
 make_identity(self) -> None
 Set all diagonal elements of the matrix to 1 and all non-diagonal elements to 0.
-norm_Euclidean(self)
-Return the Euclidean norm of the matrix.
+norm_Euclidean(self) -> float
+Return the Euclidean norm of the matrix.
 print(self, precision=4)
 Print a string that describes the matrix.
 Rows are delimited by semicolons and newlines. Elements in a single row are delimited by commas.
diff --git a/documentation/mola.regression.html b/documentation/mola.regression.html
index d295317..8f91bc0 100644
--- a/documentation/mola.regression.html
+++ b/documentation/mola.regression.html
@@ -17,7 +17,7 @@ Functions
-fit_nonlinear(independent_values, dependent_values, h, J, initial=None, max_iters=100)
-Return the estimated parameters of a nonlinear model using the Gauss-Newton iteration algorithm.
+fit_nonlinear(independent_values: mola.matrix.Matrix, dependent_values: mola.matrix.Matrix, h: mola.matrix.Matrix, J: mola.matrix.Matrix, initial=None, max_iters=100)
+Return the estimated parameters of a nonlinear model using the Gauss-Newton iteration algorithm.
 
 The algorithm uses Gauss-Newton iteration to find the parameters that minimize the least squares criterion ||y-h(theta)||^2, where y is the vector of dependent values, h is the model function, and theta is the vector of the function's parameters. The estimates are improved iteratively by evaluating the gradient of the least squares criterion and using that gradient to update the parameter estimates in small steps. The gradient is approximated by Jacobian matrices.
 
@@ -28,7 +28,7 @@ J -- Matrix: the Jacobian matrix of the model function
 initial -- Matrix: the initial guess of the parameters (default None, in which case they are randomized)
 max_iters -- int: the maximum number of iterations (default 100)
-fit_univariate_polynomial(independent_values, dependent_values, degrees=[1], intercept=True, weights=None, regularization_coefficient=None)
-Return the parameters of an nth-order polynomial in a tuple.
+fit_univariate_polynomial(independent_values: mola.matrix.Matrix, dependent_values: mola.matrix.Matrix, degrees=[1], intercept=True, weights=None, regularization_coefficient=None)
+Return the parameters of an nth-order polynomial in a tuple.
 The algorithm uses least squares regression to minimize the term ||y-H*theta||^2, where y is the vector of dependent values, H is the observation matrix, and theta is the vector of parameters.
 The parameters are the coefficients of the polynomial function.
 Optional arguments allow including intercept in the parameters, weighting certain data points over others, and L2 (Tikhonov) regularization.
diff --git a/documentation/mola.utils.html b/documentation/mola.utils.html
index e8b77ca..fa9a4ba 100644
--- a/documentation/mola.utils.html
+++ b/documentation/mola.utils.html
@@ -25,7 +25,14 @@ Functions
-equals_approx(left, right, precision=1e-12) -> bool
-Return true if the compared objects are roughly equal elementwise. Otherwise, return false.
+column(data: list) -> mola.matrix.Matrix
+Return a column vector Matrix object constructed from a one-dimensional list.
+This is the same as calling Matrix(data).get_transpose() with a check to make sure the list is one-dimensional.
+
+Arguments:
+data -- list: the 1D list to be used as the data of the matrix
+
+Raises an exception if the list is multidimensional.
+equals_approx(left, right, precision=1e-12) -> bool
+Return true if the compared objects are roughly equal elementwise. Otherwise, return false.
 
 Arguments:
 left -- Matrix, list, tuple, or a single value: the object on the left side of the comparison
@@ -45,7 +52,11 @@ cols -- unsigned integer: width of the matrix (default None)
 
 If 'cols' is not specified, the matrix is assumed to have the same number of columns as the number of rows.
-norm(data)
+norm(data: mola.matrix.Matrix) -> float
+Return the Euclidean norm of a matrix.
+You could also just call data.norm_Euclidean() directly, but this is a wrapper function for convenience.
+
+Arguments:
+data -- Matrix: the matrix whose Euclidean norm is to be returned
 ones(height: int, width: int) -> mola.matrix.Matrix
 Return a matrix where all elements are 1.
 
 Arguments:
diff --git a/mola/clustering.py b/mola/clustering.py
index 7cea26b..0d39d13 100644
--- a/mola/clustering.py
+++ b/mola/clustering.py
@@ -119,7 +119,7 @@ def find_k_means(data: Matrix, num_centers = 2, max_iterations = 100, distance_f
         if iteration == max_iterations-1:
             print("WARNING: k-means centers did not converge in " , str(max_iterations), " iterations. Consider increasing the maximum number of iterations or using fuzzy k-means.")
 
-    return centers
+    return centers, closest_center
@@ -219,8 +219,8 @@ def find_density_clusters(data: Matrix, num_centers = 2, beta = 0.5, sigma = 0.5
     Arguments:
     data -- Matrix: the data containing the points to be clustered
     num_centers -- int: the number of cluster centers to be found (default 2)
-    beta -- float: the width of the Gaussian function (default 0.5)
-    sigma -- float: the width of the Gaussian function (default 0.5)
+    beta -- float: the width of the Gaussian function (default 0.5) used to destruct the mountain function
+    sigma -- float: the width of the Gaussian function (default 0.5) used to construct the mountain function
     """
 
     # get the number of data points (samples) and the dimension of each data point
@@ -232,26 +232,30 @@ def find_density_clusters(data: Matrix, num_centers = 2, beta = 0.5, sigma = 0.5
     mountain_func = [0 for x in range(n_samples)]
 
     # construct mountain function value for each data sample
-    # iterate through centers
+    # calculate the sum of Gaussian functions centered at each data point
     for i in range(n_samples):
-        # iterate through data points
         for k in range(n_samples):
-            mountain_func[i] = mountain_func[i] + math.exp( - ( pow(distance_euclidean(data[i,:],data[k,:]),2) ) / (2*sigma**sigma) )
+            mountain_func[i] += math.exp( - ( pow(distance_euclidean(data[i,:],data[k,:]),2) ) / (2*sigma*sigma) )
 
     # select cluster centers and destruct mountain functions
-    mountain_func_new = deepcopy(mountain_func)
+    mountain_func_prev = deepcopy(mountain_func)
+    mountain_func_current = deepcopy(mountain_func)
     c_subtractive = zeros(num_centers,dim)
     # iterate through the number of labels (assumption is that there are 2 clusters)
     for k in range(num_centers):
+
+        #mountain_func_current = deepcopy(mountain_func_prev)
+
+        # select cluster centers
         peak = 0;
         peak_i = 0;
         for i in range(n_samples):
-            if mountain_func_new[i] > peak:
-                print('For cluster ' + str(k) + ' found peak ' + str(mountain_func_new[i]) + ' at ' + str(data[i,0]) + ',' + str(data[i,1]))
-                peak = mountain_func_new[i]
+            if mountain_func_current[i] > peak:
+                #print('For cluster ' + str(k) + ' found peak ' + str(mountain_func_current[i]) + ' at ' + str(data[i,0]) + ',' + str(data[i,1]))
+                peak = mountain_func_current[i]
                 peak_i = i;
@@ -260,9 +264,12 @@ def find_density_clusters(data: Matrix, num_centers = 2, beta = 0.5, sigma = 0.5
         # save cluster centers
         c_subtractive[k,:] = data[peak_i,:]
 
-        # destruct mountain functions
+        print('For cluster ' + str(k) + ' found peak ' + str(mountain_func_current[peak_i]) + ' at ' + str(data[peak_i,0]) + ',' + str(data[peak_i,1]))
+
+        # destruct mountain functions at the current cluster center (peak of highest mountain function)
         for i in range(n_samples):
-            mountain_func_new[i] = mountain_func_new[i] - mountain_func_new[peak_i] * math.exp( - ( pow(distance_euclidean(data[i,:],c_subtractive[k,:]),2)) / (2*beta**beta) )
+            mountain_func_current[i] -= math.exp( - ( pow(distance_euclidean(data[i,:],c_subtractive[k,:]),2)) / (2*beta*beta) )
+            #mountain_func_current[i] -= mountain_func_current[k]*math.exp( - ( pow(distance_euclidean(data[i,:],c_subtractive[k,:]),2)) / (2*beta*beta) )
 
     # assign all data points to a cluster depending on the distance
     labeled_subtractive = [0 for x in range(n_samples)]
@@ -275,5 +282,5 @@ def find_density_clusters(data: Matrix, num_centers = 2, beta = 0.5, sigma = 0.5
                 cluster = k;
         labeled_subtractive[i] = cluster
 
-    return labeled_subtractive
+    return c_subtractive, labeled_subtractive
diff --git a/mola/matrix.py b/mola/matrix.py
index 871b713..d9a1dfc 100644
--- a/mola/matrix.py
+++ b/mola/matrix.py
@@ -962,7 +962,7 @@ def __type_two_row_operation(self,operable_row,scalar):
 #        for j in cols_list:
 #            self.data[i][j] = matrix[i-rows_first][j-cols_first]
 
-    def norm_Euclidean(self):
+    def norm_Euclidean(self) -> float:
         """Return the Euclidean norm of the matrix."""
         norm = 0
         for i in range(self.n_rows):
diff --git a/mola/regression.py b/mola/regression.py
index b56014c..b31ce25 100644
--- a/mola/regression.py
+++ b/mola/regression.py
@@ -24,7 +24,7 @@ def linear_least_squares(H: Matrix, z: Matrix, W=None):
 
     return th_tuple
 
-def fit_univariate_polynomial(independent_values, dependent_values, degrees=[1], intercept=True, weights = None, regularization_coefficient = None):
+def fit_univariate_polynomial(independent_values: Matrix, dependent_values: Matrix, degrees=[1], intercept=True, weights = None, regularization_coefficient = None):
     """
     Return the parameters of an nth-order polynomial in a tuple.
     The algorithm uses least squares regression to minimize the term ||y-H*theta||^2, where y is the vector of dependent values, H is the observation matrix, and theta is the vector of parameters.
@@ -69,7 +69,7 @@ def fit_univariate_polynomial(independent_values, dependent_values, degrees=[1],
     return th_tuple
 
 # fit nonlinear function parameters using Gauss-Newton iteration
-def fit_nonlinear(independent_values, dependent_values, h, J, initial=None, max_iters = 100):
+def fit_nonlinear(independent_values: Matrix, dependent_values: Matrix, h: Matrix, J: Matrix, initial=None, max_iters = 100):
     """
     Return the estimated parameters of a nonlinear model using the Gauss-Newton iteration algorithm.
diff --git a/mola/utils.py b/mola/utils.py
index a878f38..426fc6b 100644
--- a/mola/utils.py
+++ b/mola/utils.py
@@ -178,8 +178,6 @@ def transpose_list(data: list) -> list:
             data_transposed.append(new_row)
     return data_transposed
 
-
-
 # return the unique rows of a matrix or a list
 def uniques(data):
     """
@@ -203,6 +201,29 @@ def uniques(data):
                 unique_rows.append(row)
     return unique_rows
 
+# return the Euclidean norm of a matrix
+def norm(data: Matrix) -> float:
+    """
+    Return the Euclidean norm of a matrix.
+    You could also just call data.norm_Euclidean() directly, but this is a wrapper function for convenience.
+
+    Arguments:
+    data -- Matrix: the matrix whose Euclidean norm is to be returned
+    """
+    return data.norm_Euclidean()
+
+# construct a Matrix that represents a column vector from a one-dimensional list
+def column(data: list) -> Matrix:
+    """
+    Return a column vector Matrix object constructed from a one-dimensional list.
+    This is the same as calling Matrix(data).get_transpose() with a check to make sure the list is one-dimensional.
+
+    Arguments:
+    data -- list: the 1D list to be used as the data of the matrix
+
+    Raises an exception if the list is multidimensional.
+ """ + if isinstance(list[0],list): + raise Exception("exception in utils.column(): list is multidimensional") -def norm(data): - return data.norm_Euclidean() \ No newline at end of file + return Matrix(data).get_transpose() \ No newline at end of file diff --git a/tests/clustering_test.py b/tests/clustering_test.py index 25e49e9..68c23f4 100644 --- a/tests/clustering_test.py +++ b/tests/clustering_test.py @@ -7,7 +7,7 @@ def test_k_means_clustering(): """Test hard k-means clustering.""" initial_centers = Matrix([[0,0],[20,0]]) symmetric_points = Matrix([[-1,0],[-2,2],[1,1],[20,2],[18,0],[22,-1],[23,-1]]) - centers = clustering.find_k_means(data=symmetric_points,num_centers=2,initial_centers=initial_centers) + centers = clustering.find_k_means(data=symmetric_points,num_centers=2,initial_centers=initial_centers)[0] assert(utils.equals_approx(centers,Matrix([[-0.6667, 1.0],[20.75, 0.0]]),precision = 1e-4)) def test_c_means_clustering():