In [1]:
import numpy as np

In [2]:
# You can add additional common utils(functions that you want to use across multiple questions) in this cell
# This cell will be run for every question

## Cost Function in Ridge Regression
Implement the `compute_cost_ridge_regression()` function, which computes <br>the  cost value in Ridge regression ($L_2$ regularized linear regression) using vectorization.

**Arguments:**

* **`X`** : Design Matrix.
  * A 2D numpy array of shape (num of instances, num of features)

* **`Y`** : Target values corresponding to each training instance in $X$.
  * A 2D numpy array of shape (num of instances, 1)

* **`theta`** : Parameters ($\theta_0, \theta_1,..,\theta_n$)
  * A 2D numpy array of shape (num of features + 1, 1)

* **`Lambda`** : Regularization parameter ($\lambda$)
  * A float value

**Returns:**

* $L_2$ regularized cost value with the given parameters and data
<br><br>$\hspace{20mm}J(\theta) = \frac{1}{2m}(X\theta - Y)^T(X\theta-Y) + \frac{\lambda}{2m}\theta^TI'\theta\\[0.5pt]$
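<br>$\hspace{2cm}$(where $I'$ is the identity matrix with its first diagonal entry set to 0, so that the intercept $\theta_0$ is not penalized, and $X$ in the formula is the design matrix with a leading column of ones prepended)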


In [3]:
def compute_cost_ridge_regression(X, Y, theta, Lambda):
  """Return the L2-regularized (Ridge) cost J(theta) of a linear model.

  A column of ones is prepended to ``X`` so that ``theta[0]`` acts as the
  intercept term; the intercept is left out of the penalty.

  Args:
    X: Design matrix, 2D array of shape (num of instances, num of features).
    Y: Target values, 2D array of shape (num of instances, 1).
    theta: Parameters, 2D array of shape (num of features + 1, 1).
    Lambda: Regularization parameter (float).

  Returns:
    The cost value with singleton dimensions removed by ``np.squeeze``.
  """
  X_aug = np.insert(X, 0, 1, axis=1)
  n_samples = X_aug.shape[0]
  residual = X_aug.dot(theta) - Y

  # I': identity matrix whose top-left entry is zeroed so theta_0 is unpenalized.
  penalty_mask = np.identity(len(theta))
  penalty_mask[0, 0] = 0

  fit_cost = (1 / (2 * n_samples)) * residual.T.dot(residual)
  ridge_penalty = (Lambda / (2 * n_samples)) * theta.T.dot(penalty_mask).dot(theta)
  return np.squeeze(fit_cost + ridge_penalty)


In [4]:
#SAMPLE TEST CASE
# Design matrix literal: 3 rows (instances) by 4 columns (features).
X = np.array([[5, 2, 7.3, 6],
             [6, 1, 0.2, 4],
             [10, 12.1, 1.4, 4]])
# One target value per row of X.
Y = np.array([[0.2], [1.1], [2.3]])
# Five parameters: one more than the number of feature columns in X.
theta = np.array([[0.1], [0.3], [-0.5], [-0.2], [0.4]])
Lambda = 0.1
# Print the Ridge cost rounded to 3 decimal places.
print(np.round(compute_cost_ridge_regression(X, Y, theta, Lambda),3))

3.459


**Expected Output:**
```
3.459
```

## Cost Function in Lasso Regression
Implement the `compute_cost_lasso_regression()` function, which computes<br> the cost value in Lasso regression ($L_1$ regularized linear regression).

**Arguments:**

* **`X`** : Design Matrix.
  * A 2D numpy array of shape (num of instances, num of features)

* **`Y`** : Target values corresponding to each training instance in $X$.
  * A 2D numpy array of shape (num of instances, 1)

* **`theta`** : Parameters ($\theta_0, \theta_1,..,\theta_n$)
  * A 2D numpy array of shape (num of features + 1, 1)

* **`Lambda`** :  Regularization parameter($\lambda$)
  * A float value

**Returns:**

* $L_1$ regularized cost value with the given parameters and data
<br><br>$\hspace{20mm}J(\theta) = \frac{1}{2m}(X\theta - Y)^T(X\theta-Y) + \frac{\lambda}{2m}\sum_{j=1}^n|\theta_j|$


In [11]:
from os import XATTR_CREATE
def compute_cost_lasso_regression(X, Y, theta, Lambda):
  """Return the L1-regularized (Lasso) cost J(theta) of a linear model.

  A column of ones is prepended to ``X`` so that ``theta[0]`` acts as the
  intercept term; the penalty sums ``|theta_j|`` over every entry of
  ``theta`` except the first.

  Args:
    X: Design matrix, 2D array of shape (num of instances, num of features).
    Y: Target values, 2D array of shape (num of instances, 1).
    theta: Parameters, 2D array of shape (num of features + 1, 1).
    Lambda: Regularization parameter (float).

  Returns:
    The cost value with singleton dimensions removed by ``np.squeeze``.
  """
  X_aug = np.insert(X, 0, 1, axis=1)
  n_samples = X_aug.shape[0]
  residual = X_aug.dot(theta) - Y

  fit_cost = (1 / (2 * n_samples)) * residual.T.dot(residual)
  # Skip theta[0]: the intercept does not contribute to the L1 penalty.
  l1_norm = np.abs(theta[1:]).sum()
  lasso_penalty = (Lambda / (2 * n_samples)) * l1_norm
  return np.squeeze(fit_cost + lasso_penalty)



In [12]:
#SAMPLE TEST CASE
# Design matrix literal: 3 rows (instances) by 4 columns (features).
X = np.array([[5, 2, 7.3, 6],
             [6, 1, 0.2, 4],
             [10, 12.1, 1.4, 4]])
# One target value per row of X.
Y = np.array([[0.2], [1.1], [2.3]])
# Five parameters: one more than the number of feature columns in X.
theta = np.array([[0.1], [0.3], [-0.5], [-0.2], [0.4]])
Lambda = 0.1
# Print the Lasso cost rounded to 3 decimal places.
print(np.round(compute_cost_lasso_regression(X, Y, theta, Lambda),3))


3.473


**Expected Output:**
```
3.473
```

## Gradient Descent in Ridge Regression
Implement the `gradient_descent_ridge_regression()` function, which computes the optimal parameter values using gradient descent in Ridge regression.

**Arguments:**

* **`X`** : Design Matrix.
  * A 2D numpy array of shape (num of instances, num of features)

* **`Y`** : Target values corresponding to each training instance in $X$.
  * A 2D numpy array of shape (num of instances, 1)

* **`theta`** : Parameters ($\theta_0, \theta_1,..,\theta_n$)
  * A 2D numpy array of shape (num of features + 1, 1)

* **`Lambda`** :  Regularization parameter($\lambda$)
  * A float value

* **`cost_diff_threshold`** : threshold of the absolute cost difference to stop iterating in gradient descent
  * A float value

* **`learning_rate`** :  Learning rate($\alpha$)
  * A float value

**Returns:**

* Optimal parameters with the given data <br><br>$\hspace{20mm}\theta = \theta -  \frac{\alpha}{m}[X^T(X\theta-Y) + \lambda I'\theta]\\[0.1pt]$
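<br>$\hspace{2cm}$(where $I'$ is the identity matrix with its first diagonal entry set to 0, so that the intercept $\theta_0$ is not penalized, and $X$ in the formula is the design matrix with a leading column of ones prepended)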


**NOTE:**
* Gradient descent is considered to have converged when the absolute value of the difference between the costs of two consecutive iterations is less than the given threshold.
* Stop iterating when the gradient descent starts to diverge.
* You can call `compute_cost_ridge_regression()` implemented in the previous question to compute the cost value for convergence check.

In [40]:
def gradient_descent_cost_term(X, Y, theta, Lambda):
  """Return the gradient of the Ridge cost with respect to ``theta``.

  Computes ``(1/m) * [X^T (X theta - Y) + Lambda * I' theta]`` where ``X``
  has a leading column of ones prepended and ``I'`` is the identity matrix
  with its first diagonal entry zeroed, so the intercept ``theta[0]`` is
  not penalized.

  Args:
    X: Design matrix, 2D array of shape (num of instances, num of features).
    Y: Target values, 2D array of shape (num of instances, 1).
    theta: Parameters, 2D array of shape (num of features + 1, 1).
    Lambda: Regularization parameter (float).

  Returns:
    The gradient as a float array; for inputs of the shapes above it has
    the same shape as ``theta``. ``theta`` itself is not modified.
  """
  X = np.insert(X, 0, 1, axis=1)
  m = len(X)
  diff = np.dot(X, theta) - Y

  # I' @ theta is simply theta with its first row zeroed; build that on a
  # float copy instead of materializing an (n+1) x (n+1) identity matrix.
  penalized_theta = np.array(theta, dtype=float)
  penalized_theta[0] = 0

  return (1 / m) * (np.dot(X.T, diff) + Lambda * penalized_theta)

def gradient_descent_ridge_regression(X, Y, theta, Lambda, cost_diff_threshold, learning_rate):
  """Run batch gradient descent for Ridge regression and return theta.

  Each iteration applies ``theta = theta - learning_rate * gradient`` and
  compares the new cost with the previous one. Iteration stops when the
  absolute cost change is no longer greater than ``cost_diff_threshold``
  or when the cost increases.

  Args:
    X: Design matrix, 2D array of shape (num of instances, num of features).
    Y: Target values, 2D array of shape (num of instances, 1).
    theta: Initial parameters, 2D array of shape (num of features + 1, 1).
    Lambda: Regularization parameter (float).
    cost_diff_threshold: Threshold on the absolute cost difference (float).
    learning_rate: Step size alpha (float).

  Returns:
    The parameters after the last update that was performed. If the loop
    stopped because the cost increased, this is the theta produced by that
    final cost-increasing step.
  """
  # Only the most recent cost is needed for the convergence check, so keep
  # a single scalar rather than an ever-growing history list.
  previous_cost = compute_cost_ridge_regression(X, Y, theta, Lambda)
  # Seed above the threshold so the loop condition holds on first entry.
  cost_diff = cost_diff_threshold + 1

  while abs(cost_diff) > cost_diff_threshold:
    d_theta = gradient_descent_cost_term(X, Y, theta, Lambda)
    theta = theta - (learning_rate * d_theta)
    current_cost = compute_cost_ridge_regression(X, Y, theta, Lambda)
    cost_diff = current_cost - previous_cost
    if cost_diff > 0:
      # The cost went up: gradient descent has started to diverge.
      break
    previous_cost = current_cost

  return theta

In [41]:
#SAMPLE TEST CASE
# Design matrix literal: 3 rows (instances) by 4 columns (features).
X = np.array([[5, 2, 7.3, 6],
             [6, 1, 0.2, 4],
             [10, 12.1, 1.4, 4]])
# One target value per row of X.
Y = np.array([[0.2], [1.1], [2.3]])
# Starting parameters passed to gradient descent.
theta = np.array([[0.8], [0.1], [0.1], [-0.1], [-0.1]])

Lambda = 0.9
cost_diff_threshold =1e-11
learning_rate = 0.01

# Round the returned parameters to 3 decimals and print them space-separated on one line.
optimal_theta = np.round(gradient_descent_ridge_regression(X, Y, theta, Lambda, cost_diff_threshold, learning_rate),3)
print(*(optimal_theta.flatten()))

0.801 0.061 0.099 -0.118 -0.037


**Expected Output:**
```
0.801 0.061 0.099 -0.118 -0.037
```

## Normal Equation method in Ridge Regression
Implement the `normal_equation_ridge_regression()` function, which computes the optimal parameter values using the Normal Equation method in Ridge regression.

**Arguments:**

* **`X`** : Design Matrix.
  * A 2D numpy array of shape (num of instances, num of features)

* **`Y`** : Target values corresponding to each training instance in $X$.
  * A 2D numpy array of shape (num of instances, 1)


* **`Lambda`** : Regularization parameter($\lambda$)
  * A float value


**Returns:**

* Optimal parameters with the given data
<br><br>$\hspace{20mm}\theta = (X^TX+\lambda I')^{-1} X^TY\\[0.5pt]$
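<br>$\hspace{2cm}$(where $I'$ is the identity matrix with its first diagonal entry set to 0, so that the intercept $\theta_0$ is not penalized, and $X$ in the formula is the design matrix with a leading column of ones prepended)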


In [58]:
def normal_equation_ridge_regression(X, Y, Lambda):
  """Return the closed-form Ridge regression parameters.

  Solves ``(X^T X + Lambda * I') theta = X^T Y`` where ``X`` has a leading
  column of ones prepended and ``I'`` is the identity matrix with its first
  diagonal entry zeroed, so the intercept is not penalized.

  Args:
    X: Design matrix, 2D array of shape (num of instances, num of features).
    Y: Target values, 2D array of shape (num of instances, 1).
    Lambda: Regularization parameter (float).

  Returns:
    The parameters theta; for inputs of the shapes above this is a 2D
    array of shape (num of features + 1, 1).

  Raises:
    numpy.linalg.LinAlgError: If ``X^T X + Lambda * I'`` is singular.
  """
  X = np.insert(X, 0, 1, axis=1)
  n_params = X.shape[1]

  penalty = Lambda * np.identity(n_params)
  penalty[0, 0] = 0

  # Solve the linear system directly instead of forming the explicit
  # inverse and multiplying: same solution, less work, better conditioned.
  return np.linalg.solve(np.dot(X.T, X) + penalty, np.dot(X.T, Y))
  

In [59]:
#SAMPLE TEST CASE
# Design matrix literal: 3 rows (instances) by 4 columns (features).
X = np.array([[5, 2, 7.3, 6],
             [6, 1, 0.2, 4],
             [10, 12.1, 1.4, 4]])
# One target value per row of X.
Y = np.array([[0.2], [1.1], [2.3]])
Lambda = 0.9

# Flatten the returned parameters, round to 3 decimals and print them space-separated.
print(*np.round(normal_equation_ridge_regression(X, Y, Lambda).flatten(),3))

0.801 0.061 0.099 -0.118 -0.037


**Expected Output:**
```
0.801 0.061 0.099 -0.118 -0.037
```