# Implementation of the Lasso With High Missing Rate.

The goal of this notebook is to implement HMLasso, a Lasso variant designed for data with a high missing rate, as described in [this IJCAI 2019 paper](https://www.ijcai.org/proceedings/2019/0491.pdf).

## Imports

In [2]:
# Imports
import numpy as np
import pandas as pd

import cvxpy as cp
from sklearn.preprocessing import StandardScaler

In [None]:
# Interactive documentation lookup: prints the help text of StandardScaler.
# NOTE(review): StandardScaler is not used in any other cell visible here —
# presumably a leftover exploration cell; confirm before keeping it.
help(StandardScaler)

## HMLasso

In [44]:
class HMLasso():
  """
  Lasso regularization that performs well with high missing rate.

  Missing entries of X are encoded as NaN. `fit` currently computes only the
  pairwise statistics of the data (S_pair, rho_pair, R); the optimization
  steps that use them are not part of this class.

  Parameters
  ----------
  mu : float, default=1
    Hyperparameter of the model. It is stored on the instance and not used
    by any method of this class.

  Attributes
  ----------
  S_pair : ndarray of shape (p, p) or None
    Pairwise covariance matrix of X; None until `fit` is called.
  rho_pair : ndarray of shape (p, 1) or None
    Pairwise covariance vector of X and y; None until `fit` is called.
  R : ndarray of shape (p, p) or None
    Matrix of weights, R[j, k] = n_jk / n, where n_jk is the number of rows
    in which features j and k are both observed; None until `fit` is called.
  """

  def __init__(self, mu=1):
    self.mu = mu
    self.S_pair = None
    self.rho_pair = None
    self.R = None

  def fit(self, X, y):
    """
    Fit the HMLasso on (X, y).

    X must be a 2-D numpy array whose columns are mean-centered over their
    observed (non-NaN) entries. y must be a 1-D numpy array with one value
    per row of X.

    Stores S_pair, rho_pair and R on the instance, prints "Model fitted" and
    returns None.

    Raises AssertionError on malformed inputs and Exception when a column of
    X is not centered.
    """
    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)
    assert X.ndim == 2, "X must be a 2-D array"
    assert X.shape[0] == y.shape[0]
    assert len(y.shape) == 1
    assert X.shape[0] > 0, "X must contain at least one sample"

    self.__verify_centering__(X)  # Verifying whether X is well centered.

    self.S_pair, self.rho_pair, self.R = self.__impute_params__(X, y)

    print("Model fitted")

  def __verify_centering__(self, X, tolerance=1e-8):
    """
    Raise an Exception if a column of X has a mean (computed over its
    non-NaN entries) whose absolute value exceeds `tolerance`.
    """
    for col in range(X.shape[1]):
      column = X[:, col]
      # A plain .mean() is NaN as soon as one entry is missing, and
      # `abs(nan) > tolerance` is False, which would let any column with
      # missing values pass the check. Use the observed entries only.
      observed = column[~np.isnan(column)]
      if observed.size == 0:
        continue  # Nothing observed: there is no mean to check.
      current_mean = observed.mean()
      if abs(current_mean) > tolerance:
        raise Exception(f"Data is not centered: column {col} has mean of {current_mean}")

  def __impute_params__(self, X, y):
    """
    Compute the pairwise statistics of (X, y), skipping missing entries of X.

    Returns
    -------
    S_pair : ndarray of shape (p, p)
      S_pair[j, k] is the average of X[i, j] * X[i, k] over the rows i where
      both features are observed, or 0 when no such row exists.
    rho_pair : ndarray of shape (p, 1)
      rho_pair[j, 0] is the average of X[i, j] * y[i] over the rows i where
      feature j is observed, or 0 when the feature is never observed.
      Missing values in y are not handled specially.
    R : ndarray of shape (p, p)
      R[j, k] = n_jk / n.

    Raises ValueError if X has no rows.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    n, p = X.shape
    if n == 0:
      raise ValueError("X must contain at least one sample")

    observed = ~np.isnan(X)
    indicator = observed.astype(float)
    X_filled = np.where(observed, X, 0.0)  # Missing entries contribute 0 to sums.

    # counts[j, k] = n_jk, the number of rows where j and k are both observed.
    counts = indicator.T @ indicator

    # Matrix of weights
    R = counts / n

    # Pairwise covariance matrix of X. Entries with n_jk == 0 stay at 0
    # instead of becoming 0/0 = NaN; their weight R[j, k] is 0 anyway.
    S_pair = np.zeros((p, p))
    np.divide(X_filled.T @ X_filled, counts, out=S_pair, where=counts > 0)

    # Pairwise covariance vector of X and y, summed over the observed rows of
    # each feature only.
    n_jj = np.diag(counts)
    xy_sums = np.where(observed, X * y[:, None], 0.0).sum(axis=0)
    rho_pair = np.zeros(p)
    np.divide(xy_sums, n_jj, out=rho_pair, where=n_jj > 0)

    return S_pair, rho_pair.reshape(p, 1), R

# Smoke test: fit on a tiny, fully observed design matrix whose columns are
# already centered (each column sums to zero).
X = np.array([[1, 2], [-1, -2]])
y = np.array([3, 5])

lasso = HMLasso(mu=2)
lasso.fit(X, y)



Model fitted


In [9]:
# Test: solve the two optimization problems on small hand-written inputs.
# Note: rho_pair is 1-D here, and S_pair is not symmetric — toy values only.
rho_pair = np.array([1, 2])
R = np.array([[1, 2], [2, 4]])
S_pair = np.array([[3, 6], [7, 5]])
mu = 1

## First problem: find the positive semidefinite matrix closest to S_pair,
## with the entrywise error weighted by R.

# Variable to optimize. PSD=True already makes Sigma symmetric positive
# semidefinite, so no additional `Sigma >> 0` constraint is needed.
n = R.shape[0]
print(n)
Sigma = cp.Variable((n, n), PSD=True)

# Objective to minimize
obj = cp.Minimize(cp.sum_squares(cp.multiply(R, Sigma - S_pair)))

# Solve the optimization problem
prob = cp.Problem(obj)
prob.solve()
if prob.status not in (cp.OPTIMAL, cp.OPTIMAL_INACCURATE):
    raise RuntimeError(f"First problem was not solved: status={prob.status}")

# Symmetrize to remove any round-off asymmetry before reusing the matrix as a
# constant quadratic form.
Sigma_opt = (Sigma.value + Sigma.value.T) / 2

print(Sigma_opt)
print(np.linalg.eig(Sigma_opt))

## Second problem: lasso on the estimated covariance.
beta = cp.Variable(n)
obj2 = cp.Minimize(0.5 * cp.quad_form(beta, Sigma_opt) - rho_pair @ beta + mu * cp.norm1(beta))

# Solving. This problem is unconstrained: the PSD constraint belongs to the
# first problem and does not involve beta.
prob2 = cp.Problem(obj2)
prob2.solve()
if prob2.status not in (cp.OPTIMAL, cp.OPTIMAL_INACCURATE):
    raise RuntimeError(f"Second problem was not solved: status={prob2.status}")

beta_opt = beta.value

print(Sigma_opt, beta_opt)

2
[[6.11942845 5.65628764]
 [5.65628764 5.22819903]]
(array([1.13476275e+01, 3.78141962e-10]), array([[ 0.73434966, -0.67877138],
       [ 0.67877138,  0.73434966]]))
[[6.11942845 5.65628764]
 [5.65628764 5.22819903]] [6.60623935e-07 1.91269526e-01]


In [24]:
# Overwrites one off-diagonal entry of Sigma_opt with NaN, in place.
# NOTE(review): after this cell Sigma_opt no longer holds the solver result;
# presumably a scratch experiment with missing values — confirm or remove.
Sigma_opt[0, 1]=np.nan

In [35]:
# Rebinds y to a 1-D integer array with two elements.
y = np.array([3, 5])

In [40]:
# Number of dimensions of y.
y.ndim

1