Improved regression algorithms
visualize_regression.py
- added a demonstration of using a weights matrix

matrix.py
- updated get() so that vector-shaped matrices can be accessed with a single index argument

utils.py
- added a function to construct a correlation matrix for regression tasks, and a function for calculating RMSE
jerela committed Apr 9, 2024
1 parent 1b0a9a0 commit c9f3847
Showing 3 changed files with 98 additions and 4 deletions.
40 changes: 38 additions & 2 deletions examples/visualize_regression.py
@@ -27,9 +27,43 @@
data = LabeledMatrix({'num_items': n_items, 'cost': prices_total})
print(data)





# Next, let's fit a first-order polynomial with an intercept to the data. We treat the number of items as the independent variable and the final cost as the dependent variable.
# A first-order polynomial with an intercept is just a function of the form y = a*x + b, where y is the dependent variable, x is the independent variable, and a and b are the parameters we estimate by fitting the function.
-params = regression.fit_univariate_polynomial(independent_values=data.get_column('num_items',as_list=False), dependent_values=data.get_column('cost',as_list=False), degrees=[1], intercept=True)
+params = regression.fit_univariate_polynomial(independent_values=data.get_column('num_items',as_list=False), dependent_values=data.get_column('cost',as_list=False), degrees=[1], intercept=True, weights=None)

# A more mathematical way to fit the polynomial is by using linear algebra as below.
H = (data.get_column('num_items', as_list=False)).concatenate(utils.ones(100,1))
y = data.get_column('cost', as_list=False)
# We save the fitted coefficients into "theta" instead of "params" as earlier.
theta = (H.get_transpose()*H).get_inverse() * H.get_transpose() * y
# Note that although the values in params and theta are equal, params is a tuple and theta is a matrix.

print("Theta: ", theta)
print("Parameters (should be same as theta): ", params)

# To demonstrate the use of a weights matrix, we construct it from the residuals (errors between fitted values and observations).
estimates = H*theta
residuals = data.get_column('cost',as_list=False)-estimates
# The actual weights matrix can be constructed as a correlation matrix using a power variance assumption.
W = utils.correlation_matrix(n=residuals.get_height(), x=residuals)
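# For intuition (an illustrative sketch): with the default arguments of
# utils.correlation_matrix() (sigma=0.5, delta=1.1, epsilon=1e-3) and no process,
# each diagonal entry of W is sigma**2 * (epsilon + abs(residual)**delta)**2.
# Checking the first entry by hand, using the single-index get() added in this commit:
r0 = residuals.get(0)
print('W[0,0] by hand: ', 0.5**2 * (1e-3 + abs(r0)**1.1)**2, ' vs. W.get(0,0): ', W.get(0,0))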

# Let's estimate the fitted coefficients while incorporating the assumed weight matrix.
theta_weighted = (H.get_transpose()*W*H).get_inverse() * H.get_transpose() * W * y
print('Theta weighted: ', theta_weighted)
estimates_weighted = H*theta_weighted
residuals_weighted = data.get_column('cost',as_list=False)-estimates_weighted

# We can evaluate how well the two polynomials fit this data by calculating their RMSE (root mean square error) against the observed costs, y.
rmse_linear = utils.rmse(estimates, y)
rmse_linear_weighted = utils.rmse(estimates_weighted, y)

print('RMSE of linear fit: ', rmse_linear)
print('RMSE of weighted linear fit: ', rmse_linear_weighted)
# In this case, incorporating the weight matrix increased the RMSE. This was expected: the weight matrix assumes non-constant error variance, but we generated the data for this example without modeling such variance.
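# Equivalently (an illustrative sketch), the RMSE of the unweighted fit is the square
# root of the mean squared residual, which we can compute directly from the residuals.
import math  # imported here in case the example's header (not shown) does not already import it
residual_list = residuals.get_column(0, as_list=True)
print('RMSE by hand: ', math.sqrt(sum(r**2 for r in residual_list) / len(residual_list)))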

# To demonstrate nonlinear regression, let's fit a nonlinear function to the data. This function is of the form a*x + c*sin(b*x) + d, where a, b, c, and d are parameters that we estimate by fitting the function. Here a is the slope of the linear term, b is the frequency of the sine term, c is the amplitude of the sine term, and d is the intercept.
# First, we define the function in a form that can be used by the regression algorithm. We use a lambda function, where theta contains the parameters and x contains the independent variables. The function takes the list of parameters and the list of independent variables as input and returns the value of the function at those points.
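# Purely as a hypothetical sketch (the actual definitions of h_mat and its Jacobian
# J_mat live in the elided lines below and may differ), a lambda matching the form
# a*x + c*sin(b*x) + d could look like this, with theta = [a, b, c, d]:
h_sketch = lambda theta, x: theta[0]*x[0] + theta[2]*math.sin(theta[1]*x[0]) + theta[3]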
@@ -46,12 +80,14 @@
params_nl = regression.fit_nonlinear(independent_values=data.get_column('num_items',as_list=False), dependent_values=data.get_column('cost',as_list=False), h = h_mat, J = J_mat, initial=initial_guess)

# Also, print the parameters.
print("Parameters of nonlinear fit: " + str(params))
print("Parameters of linear fit: " + str(params))
print("Parameters of nonlinear fit: " + str(params_nl))

# Let's plot the data and the fitted function (first-order polynomial with an intercept).
plt.plot(n_items, prices_total, 'b.', label='data points')
plt.plot([0,30],[params[1],params[1]+30*params[0]],'r-', label=f'linear fit y={round(params[0],2)}x+{round(params[1],2)}')
# We also plot the weighted linear fit with an intercept.
plt.plot([0,30],[theta_weighted.get(1),theta_weighted.get(1)+30*theta_weighted.get(0)],'r:', label=f'weighted linear fit y={round(theta_weighted.get(0),2)}x+{round(theta_weighted.get(1),2)}')

# Let's also plot the fitted nonlinear function.
x = [i/100 for i in range(3000)]
7 changes: 6 additions & 1 deletion mola/matrix.py
@@ -408,8 +408,13 @@ def set(self, i: int, j: int, value):
        self.data[i][j] = value

    # get a single value in a given index
-    def get(self, i: int, j: int):
+    def get(self, i: int, j: int = None):
        """Get the element at specified position."""
+        if self.n_rows == 1 and j is None:
+            j = i
+            i = 0
+        if self.n_cols == 1 and j is None:
+            j = 0
        return self.data[i][j]
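    # An illustrative usage sketch (the constructor form below is an assumption for
    # this sketch, not taken from this diff):
    #   row = Matrix([[10, 20, 30]])
    #   row.get(2)      # -> 30 ; for a row vector, the single index picks the column
    #   col = Matrix([[1], [2], [3]])
    #   col.get(1)      # -> 2  ; for a column vector, the single index picks the row
    #   col.get(1, 0)   # two-index access works as before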

    # define what happens when the matrix is converted to a string, such as when print(Matrix) is called
55 changes: 54 additions & 1 deletion mola/utils.py
@@ -1,5 +1,6 @@
import random
import statistics
+import math
from mola.matrix import Matrix


@@ -336,4 +337,56 @@ def diag(x):
        mat[i,i] = v[i]
    return mat

# ITERATIVELY REWEIGHTED GENERALIZED LEAST SQUARES
def correlation_matrix(n, x=None, sigma=0.5, delta=1.1, epsilon=1e-3, phi=1.1, process=None):
    """
    Return a square correlation matrix with the given assumptions.

    Arguments:
    n -- int: number of rows and columns in the matrix to return
    x -- list or Matrix: a vector defining the variance function, such as a vector of residuals (default None)
    sigma -- float: assumed standard deviation of the constant part of model errors (default 0.5)
    delta -- float: shape parameter of the assumed power function of error variance (default 1.1)
    epsilon -- float: a factor used to prevent the power function from approaching zero (default 1e-3)
    phi -- float: the assumed correlation between successive observations when assuming an AR(1) process (default 1.1)
    process -- string: 'AR(1)' or None (default None)

    Use this function to construct a correlation matrix when assuming non-constant error variance, correlation between residual errors, or a combination of both.
    """
    W = identity(n)

    if process == 'AR(1)' and x is None:
        for i in range(n):
            for j in range(n):
                W[i,j] = phi**abs(i-j)
    elif process == 'AR(1)' and x is not None:
        for i in range(n):
            for j in range(n):
                W[i,j] = sigma**2 * abs(x[i]*x[j])**delta * phi**abs(i-j)
    elif process is None and x is not None:
        for i in range(n):
            W[i,i] = sigma**2 * (epsilon+abs(x[i])**delta)**2
    else:
        raise Exception('Invalid parameters in correlation_matrix()')

    return W
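# Illustrative example (a sketch; the values follow directly from the AR(1) branch above):
#   correlation_matrix(n=3, process='AR(1)') fills entry (i,j) with phi**abs(i-j),
#   so with the default phi=1.1 it returns
#   [[1.0, 1.1, 1.21],
#    [1.1, 1.0, 1.1],
#    [1.21, 1.1, 1.0]]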

def rmse(x1,x2):
    """
    Return the root mean square error of two vectors.

    Arguments:
    x1 -- list or Matrix: the first vector
    x2 -- list or Matrix: the vector to compare against

    Raises an exception if the vectors are not of equal length.
    """
    if isinstance(x1,Matrix) and isinstance(x2,Matrix):
        x1 = x1.get_column(0,as_list=True)
        x2 = x2.get_column(0,as_list=True)

    n = len(x1)
    if len(x1)!=len(x2):
        raise Exception('Arguments must be of equal length in rmse()')

    return math.sqrt(sum([(x-y)**2 for x,y in zip(x1,x2)])/n)
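# Illustrative example (a sketch):
#   rmse([1, 2, 3], [1, 2, 5]) == math.sqrt(((1-1)**2 + (2-2)**2 + (3-5)**2) / 3)
#   which is sqrt(4/3), approximately 1.1547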
