Added statistic measures and new regression wrapper

Matrix: - defined iterability for the Matrix class - added method get_absolute_matrix() for returning the matrix where elements are absolute values of the original elements utils: - added methods for calculating covariance, variance, standard deviation and Pearson linear correlation coefficient regression: - added a method for parameter estimation using iteratively reweighted least squares method
jerela · Apr 4, 2024 · 6b19fe0 · 6b19fe0
1 parent ec07272
commit 6b19fe0
Show file tree

Hide file tree

Showing 4 changed files with 210 additions and 13 deletions.
diff --git a/main.py b/main.py
@@ -122,20 +122,19 @@
 
 
 # TEST GAUSS-NEWTON ITERATION
-h = Matrix([lambda a,x: pow(a,x)])
-independents = Matrix([1, 2, 3]).get_transpose()
-y = Matrix([2, 4, 8]).get_transpose()
-# let J be the Jacobian of h(x)
-J = Matrix([lambda a,x: x*pow(a,x-1)])
-
-theta = regression.fit_nonlinear(independents, y, h, J, initial=Matrix([0.5]))
-print(theta)
+#h = Matrix([lambda a,x: pow(a,x)])
+#independents = Matrix([1, 2, 3]).get_transpose()
+#y = Matrix([2, 4, 8]).get_transpose()
+## let J be the Jacobian of h(x)
+#J = Matrix([lambda a,x: x*pow(a,x-1)])
 
+#theta = regression.fit_nonlinear(independents, y, h, J, initial=Matrix([0.5]))
+#print(theta)
 
 # TEST K-MEANS CLUSTERING
 initial_centers = Matrix([[0,0],[20,0]])
 symmetric_points = Matrix([[-1,0],[-2,2],[1,1],[20,2],[18,0],[22,-1],[23,-1]])
-centers = clustering.find_k_means(data=symmetric_points,num_centers=2,initial_centers=initial_centers)
+centers = clustering.find_k_means(data=symmetric_points,num_centers=2,initial_centers=initial_centers)[0]
 
 
 print(centers)
@@ -146,3 +145,35 @@
 density_points = Matrix([[1,2], [0.5,1.5], [0,1], [0,0.5], [0,0], [0,-0.5], [0,-1], [0.5,-1.5], [1,-2], [2,0], [2.5,-0.5], [3,-1], [3,-1.5], [3,-2], [3,-2.5], [3,-3], [2.5,-3.5], [2,-4]])
 mountaintops = clustering.find_density_clusters(data=density_points, num_centers=2, beta = 0.25, sigma = 0.25)
 print(mountaintops)
+
+
+
+x = [1, 2, 4, 7]
+c = utils.cov(x)
+sd = utils.std(x)
+vari = utils.var(x)
+print(x)
+print(c)
+print(sd)
+print(vari)
+X = Matrix([[1, 2], [4, 7]])
+C = utils.cov(X)
+Cm = utils.covmat(X)
+print(X)
+print(C)
+print(Cm)
+
+x = Matrix([1, 2, 3, 4]).get_transpose()
+y = Matrix([2, 4, 6, 8]).get_transpose()
+R = utils.pearson(x,y)
+print(R)
+
+
+# define the observation matrix
+H = Matrix([[2,1],[4,1],[6,1]])
+# define the measurements
+y = Matrix([[0],[1],[2]])
+
+theta = regression.irls(H, y, threshold = 1e-12)
+
+print(theta)
diff --git a/mola/matrix.py b/mola/matrix.py
@@ -24,7 +24,6 @@ def __init__(self, *args):
         elif len(args) == 3:
             self.__construct_by_dimensions(args[0], args[1], args[2])
 
-
     # construct a matrix with r rows, c columns, and some initial value (default 0)
     def __construct_by_dimensions(self,r,c,value=0):
         """
@@ -76,6 +75,18 @@ def __construct_from_lists(self,lists):
             self.n_cols = len(lists)
             self.data = [lists]
 
+    # define iterability for the matrix
+    def __iter__(self):
+        """
+        Return an iterator of the matrix.
+        For a single-column matrix, the iterator loops through the elements of the column.
+        For other matrices, the iterator loops through the columns as lists.
+        """
+        if self.n_cols == 1:
+            return iter(self.get_column(0,as_list=True))
+        else:
+            return iter(self.data)
+
     def __abs__(self):
         """
         Return the absolute value of a 1x1 matrix (i.e., a matrix with just one element).
@@ -987,7 +998,20 @@ def get_dominant_eigenvector(self):
             err = (eigenvector-eigenvector_prev).norm_Euclidean()
 
         return eigenvector 
-
+
+    def get_absolute_matrix(self):
+        """
+        Return a matrix where the elements are the absolute values of the original matrix.
+        """
+
+        n = self.get_height()
+        m = self.get_width()
+        mat = Matrix(n,m,0)
+
+        for i in range(n):
+            for j in range(m):
+                mat[i,j] = abs(self[i,j])
+        return mat
 
 
 class LabeledMatrix(Matrix):

diff --git a/mola/regression.py b/mola/regression.py
@@ -1,5 +1,5 @@
 from mola.matrix import Matrix
-from mola.utils import identity, ones, zeros, randoms
+from mola.utils import identity, ones, zeros, randoms, covmat, diag
 from copy import deepcopy
 
 def linear_least_squares(H: Matrix, z: Matrix, W=None):
@@ -23,6 +23,38 @@ def linear_least_squares(H: Matrix, z: Matrix, W=None):
     th_tuple = (th.get(0,0), th.get(1,0))
     return th_tuple
 
+def irls(H: Matrix, z: Matrix, threshold = 1e-6):
+    """
+    Return the iteratively reweighted least squares (IRLS) estimate of the parameters of a model defined by observation matrix H and dependent values z.
+    
+    Arguments:
+    H -- Matrix: the observation matrix of the linear system of equations
+    z -- Matrix: the observed or dependent values depicting the right side of the linear system of equations
+    """
+
+    # estimate the parameters using ordinary least squares
+    th = ((H.get_transpose())*H).get_inverse() * H.get_transpose() * z
+
+
+    difference = float('inf')
+
+    while difference > threshold:
+
+        # calculate absolute values of residuals
+        residuals = (H*th-z).get_absolute_matrix()
+
+        # estimate sample variance-covariance matrix V using the OLS residuals
+        #V = covmat(residuals)
+        W = diag(residuals)
+        print("W: ", W)
+
+        # re-estimate the parameters using generalized least squares
+        th = ((H.get_transpose())*W*H).get_inverse() * H.get_transpose() * W * z
+
+        difference = residuals.norm_Euclidean()
+
+    th_tuple = tuple(th.get_column(0))
+    return th_tuple
 
 def fit_univariate_polynomial(independent_values: Matrix, dependent_values: Matrix, degrees=[1], intercept=True, weights = None, regularization_coefficient = None):
     """

diff --git a/mola/utils.py b/mola/utils.py
@@ -1,4 +1,5 @@
 import random
+import statistics
 from mola.matrix import Matrix
 
 
@@ -226,4 +227,113 @@ def column(data: list) -> Matrix:
     if isinstance(list[0],list):
         raise Exception("exception in utils.column(): list is multidimensional")
 
-    return Matrix(data).get_transpose()
+    return Matrix(data).get_transpose()
+
+# calculate the variance of a vector of values
+def var(X):
+    """
+    Return the variance of the input.
+    """
+    return statistics.variance(X)     
+
+# calculate the covariance of two vectors
+def cov(X,Y = None) -> float:
+    """
+    Return the covariance of a random variable or between two random variables.
+    
+    Note that the result is the "sample covariance" that is normalized by N-1 rather than N, where N is the number of samples.
+    Use covmat() if you want to calculate the covariance matrix.
+    
+    Arguments:
+    X -- list or Matrix: the values representing samples from the first random variable or column-organized samples from several random variables
+    Y -- list or Matrix: the values representing samples from the second random variable (optional)
+    """
+
+    # if X is a list and Y is not defined, calculate the covariance of X with itself, i.e., variance
+    if isinstance(X,list) and Y is None:
+        return var(X)
+
+    # if X is a matrix and Y is not defined, try to calculate the covariance between the columns of X
+    elif isinstance(X,Matrix) and Y is None:        
+        n_cols = X.get_width()
+        # if X has more than 2 columns, calculate the covariance matrix
+        if n_cols > 2:
+            return covmat(X)
+        # if X has 2 columns, calculate the covariance between them
+        if n_cols == 2:
+            return cov(X[:,0],X[:,1])
+        # if X has 1 column, calculate its covariance with itself (variance)
+        if X.get_width() == 1:
+            return var(X.get_column(0,as_list=True))
+
+    # if X and Y are both lists, calculate their sample covariance
+    elif isinstance(X,list) and isinstance(Y,list):
+        n = len(X)
+        mX = statistics.mean(X)
+        mY = statistics.mean(Y)
+        c = 0
+        for x,y in zip(X,Y):
+            c += (x-mX) * (y-mY)
+        c = c/(n-1)
+        return c
+
+    # if X and Y are both matrices but have 1 column each, calculate their covariance
+    elif isinstance(X,Matrix) and isinstance(Y,Matrix):
+        if X.get_width() == 1 and Y.get_width() == 1:
+            return cov(X.get_column(0,as_list=True),Y.get_column(0,as_list=True))
+        else:
+            raise Exception("Invalid arguments for cov()")
+
+
+
+
+
+def std(X):
+    """
+    Return the standard deviation of the input.
+    """
+    return statistics.stdev(X)
+
+def covmat(X: Matrix) -> Matrix:
+    """
+    Return the covariance matrix for the input matrix.
+    The diagonal of the covariance matrix contains variances.
+    Note that the result is the "sample covariance" that is normalized by N-1 rather than N, where N is the number of samples.
+    
+    Arguments:
+    X -- Matrix: a matrix where the vectors are organized into columns
+    """
+    if isinstance(X,Matrix):
+
+        n = X.get_height()
+        m = X.get_width()
+
+        c = zeros(m,m)
+
+        for i in range(m):
+            for j in range(m):
+                c[i,j] = cov(X.get_column(i,as_list=True),X.get_column(j,as_list=True))
+
+        return c
+    else:
+        raise Exception("Input to covmat() must be a Matrix object.")
+
+
+def pearson(X,Y):
+    return cov(X,Y)/(std(X)*std(Y))
+
+def diag(x):
+    if isinstance(x,Matrix):
+        if x.get_height() == 1:
+            v = x.get_row(0,as_list = True)
+        elif x.get_width() == 1:
+            v = x.get_column(0,as_list = True)
+        else:
+            raise Exception("Input argument to diag() is invalid")
+    n = len(v)
+    mat = identity(n)
+    for i in range(n):
+        mat[i,i] = v[i]
+    return mat
+
+# ITERATIVELY REWEIGHTED GENERALIZED LEAST SQUARES