In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

# Loading data

data_matrix_train, COP_train, data_matrix_test, COP_test, names = np.load('data_center_data_matrix.npy', allow_pickle=True)

# Constructing matrices for min_w ||A w - b||_2**2

matrix_mean = np.mean(data_matrix_train, axis=0)
M = data_matrix_train - matrix_mean
matrix_std = np.std(M, axis=0)
M = M / matrix_std

A = np.hstack([M, np.ones((M.shape[0],1)), -(M.T * COP_train[:,3]).T])
b = COP_train[:,3]

# Constructing matrices for the test set

M_test = (data_matrix_test - matrix_mean) / matrix_std
A_test = np.hstack([M_test, np.ones((M_test.shape[0],1)), -(M_test.T * COP_test[:,3]).T])
b_test = COP_test[:,3]


# Loading raw data
import pandas as pd
data = pd.read_csv('Raw_Dataset_May.csv')

def name_to_subcategory_and_details(col_name):
    if np.isreal(col_name):
        col_name = names[col_name]
    indices = np.nonzero((data['NAME'] == col_name).values)[0]
    if len(indices) > 0:
        subcategory = data['SUBCATEGORY'].iloc[[indices[0]]].values[0]
        details = data['DETAILS'].iloc[[indices[0]]].values[0]
        return subcategory, details
    else:
        print('unknown name')

$\textbf{Question 3.1:}$



With have: $\forall t, (Aw)_t=b_t$ hence $(A w)_t=\tilde{x}(t)^{\top} w_1+w_0-y(t) \times \tilde{x}(t)^{\top} w_2=y(t)$

By factorization and division, we get: 

$$
y(t)=\frac{w_1^{\top} \tilde{x}(t)+w_0}{w_2^{\top} \tilde{x}(t)+1} .
$$


$\textbf{Question 3.2:}$

In [3]:
w_train = np.linalg.lstsq(A, b, rcond=None)[0]

print("The solution for the least squares problem is: \n\n",w_train)

The solution for the least squares problem is: 

 [-0.00927821  0.08309371 -0.03672704 ...  0.01980595 -0.03057174
 -0.01188614]


$\textbf{Question 3.3:}$

In [4]:
y_predicted = A_test @ w_train

norm2=np.linalg.norm(y_predicted-b_test, ord=2)

print("The quality of the solution using norm 2 is:",norm2)

The quality of the solution using norm 2 is: 530.9466555560931


$\textbf{Question 3.4:}$

In [5]:
lambda_param = 100
lambda_I = np.sqrt(lambda_param) * np.identity(A.shape[1])

A_augmented = np.vstack([A, lambda_I])
b_augmented = np.concatenate([b, np.zeros(A.shape[1])])

w_ridge= np.linalg.lstsq(A_augmented, b_augmented, rcond=None)[0]

y_predicted_ridge = A_test @ w_ridge

norm2_ridge=np.linalg.norm(y_predicted_ridge-b_test)

print("Mean Absolute Error with Ridge regularization:", norm2_ridge)

Mean Absolute Error with Ridge regularization: 329.66770078662984


$\textbf{Question 3.5:}$

$\forall w, \nabla f_1(w)=A^T(Aw-b)+\lambda w$

$f_1$ is convex as a sum of two convex functions. 







$\textbf{Question 3.6:}$

$|| \nabla f_1(w_1)-\nabla f_1(w_2) ||=||A^T(Aw_1-b)+\lambda w_1 - A^T(Aw_2-b)-\lambda w_2||=||(w_1-w_2)(A^TA+\lambda)||\leq|||A^TA+\lambda I_n|||*||w_1-w_2||$

Hence $\nabla f_1$ is $L$-Lipschitz with $L=|||A^TA+\lambda I_n|||$. We can take for the step size: $\gamma = 1/L$ 

In [6]:
L = np.linalg.norm(A.T.dot(A)+lambda_param*np.identity(A.shape[1]), ord=2)

def gradient_f(w):
    return (A.T.dot(A).dot(w)-A.T.dot(b)+lambda_param * w)

step=1/L

def gradient_descent(A, step, epochs):
    m, n = A.shape
    x = np.random.rand(n)
    gradient_f_history = []
    for epoch in range(epochs):
        gradient = gradient_f(x)
        x = x - step *gradient 
        gradient_f_history.append(np.linalg.norm(gradient))

    return x, gradient_f_history




$\textbf{Question 4.1:}$

$F_2=f_2+g_2$ with $\forall w, f_2(w)=\frac{1}{2}||Aw-b||^2$ and $g_2(w)=\lambda ||w||_1$

$prox_{g_2}(x)=argmin_y(g_2(y)+\frac{1}{2}||x-y||^2)$

$\forall w, \nabla f_2(w)=A^T(Aw-b)$


$\textbf{Question 4.2:}$


In [7]:
L = np.linalg.norm(A.T.dot(A)+lambda_param*np.identity(A.shape[1]), ord=2)

step=1/L

def prox_g_2(x, step):
    return np.sign(x) * np.maximum(np.abs(x) - step, 0.)

def proximal_gradient_descent(A, step, epochs):
    m,n=A.shape
    x = np.random.rand(n)
    gradient_f_history = []
    
    for epoch in range(epochs):
        gradient=gradient_f(x)
        x = prox_g_2(x-step*gradient, step)
        gradient_f_history.append(np.linalg.norm(gradient))

    return x, gradient_f_history

proximal_gradient_descent(A, 2*step, 1000)



    
    

(array([ 0.39273513,  0.36118052, -0.12455115, ...,  0.24016322,
        -0.12827975,  0.25459819]),
 [19241300.78895369,
  18864267.432160076,
  18736921.51881801,
  18688889.690239936,
  18668655.81834893,
  18659179.87411243,
  18654328.93062344,
  18651675.309246924,
  18650141.213657483,
  18649215.279516347,
  18648626.808846634,
  18648237.433179148,
  18647962.311999813,
  18647761.52673302,
  18647603.5013641,
  18647477.87659605,
  18647370.095898185,
  18647279.401277192,
  18647196.901454676,
  18647125.001809377,
  18647057.274115115,
  18646996.85922612,
  18646938.77679305,
  18646886.22376448,
  18646834.722569264,
  18646787.84072,
  18646741.250175346,
  18646698.702291343,
  18646655.640332088,
  18646616.3701204,
  18646576.356636364,
  18646539.75188188,
  18646502.14702774,
  18646467.695714954,
  18646432.07877293,
  18646399.448458087,
  18646365.539110806,
  18646333.99513156,
  18646301.413300484,
  18646271.19657262,
  18646239.754633807,
  18646210.6452071,
