### SimpleMKL Training

- In scikit-learn we utilized a single kernel (radial basis function) with cross validated bandwidth (via grid search)

- We now want to extend to the case of multiple kernels (linear combination of a basis set)

- Inspiration in our original implementation from here 

     - https://github.com/qintian0321/SimpleMKL_python

In [1]:
import numpy as np
import pandas as pd
from sklearn import svm

# SimpleMKL optimization objective

### Primal problem 


### Dual Problem

$$\max_\alpha \ \frac{-1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j \sum_m d_mK_m(x_i,x_j) +\sum_i \alpha_i$$

s.t. $$\sum_i \alpha_i y_i=0$$ and $$C \geq \alpha_i \geq 0$$



- Coefficient vector $ \alpha$ is importance of observed features on classification problem




### Abstract Kernel Class 

In [2]:
class Kernel():
    def __init__(self,kernel_type,order=None):
        self.kernel_type=kernel_type
        self.order=order
    
    def compute_kernel(self,X):
        
        if self.kernel_type=='linear':
            return np.dot(X,X.T)
        
        if self.kernel_type=='gaussian':
            return np.exp(X)
        
        if self.kernel_type=='polynomial':
            return np.dot(X**self.order,X.T**self.order)
        

### Functions for Multiple Kernel Optimization

In [3]:
def compute_dual(X,y,kernel_list,d_m,constant,compute_gap=True):
    """ Compute dual objective value
    """
    kernel=compose_kernels(X,kernel_list,d_m)
    single_kernel=svm.SVC(C=constant,kernel='precomputed')
   
    single_kernel.fit(kernel,y)
    
    alpha=np.empty(len(y))
    alpha[single_kernel.support_]=np.abs(single_kernel.dual_coef_[0]) 
    alpha[alpha==None]=0
    
    
    J=0.5*np.dot(np.dot(alpha,kernel*y),alpha.T)+np.sum(alpha)
    
    if compute_gap:
        kernel_eval=[np.dot(np.dot(np.dot(alpha,alpha.T),np.dot(y,y.T)),k_i.compute_kernel(X)) for k_i in kernel_list]
        duality_gap=J-np.sum(alpha)+0.5*np.max(kernel_eval)
        
        return J,duality_gap,alpha
    
    return J

def compose_kernels(X,kernel_list,weights):
    """ Compute positive linear combination of kernels 
    """
    return np.sum(np.array([weights[ct]*k_i.compute_kernel(X) for ct,k_i in enumerate(kernel_list)]),axis=0)
    
def compute_gradient(kernel,X,y,alpha):
    """ Compute gradient of MKL objective closed form ; vector
    """
    kernel_mat=kernel.compute_kernel(X)
    gradient_obj=np.sum(np.dot(-0.5*np.dot(alpha.reshape(-1,1), np.dot(np.dot(alpha.reshape(1,-1),y.reshape(-1,1)) , y.reshape(1,-1))),kernel_mat))

    return gradient_obj


def descent_direction(m,d_m,mu,grad_dm,grad_mu,D):
    """ Compute direction of gradient descent ; vector 
    """
    if d_m==0 and grad_dm-grad_mu>0:
      
        return 0
    
    elif d_m>0 and m!=mu:
      
        return -grad_dm+grad_mu
    
    elif m==mu:
        a=np.arange(0,len(D))
        b=np.array(mu)
        grad_index=np.setdiff1d(a,b)
        
        
        if len(grad_index)>0:
            direction=np.sum(D[grad_index]-grad_mu)
        else:
            direction=-grad_mu
        return direction
    else:
        return 0
    
def line_search(X,y,kernel_list,D,d_m,gamma_max,disc):
    """ Selects step size to minimize obj value;  
    
        Update from heuristic to exact Armijo's rule 
    """

    if gamma_max==0:
        return gamma_max
    
    # grid of step size begins bigger than 0
    grid=np.arange(0+gamma_max/disc,gamma_max,gamma_max/disc)
    
    min_gamma,min_obj_val=None,10e8
    for gamma_i in grid:
        d_i=d_m+gamma_i*D
        dual_obj_val=compute_dual(X,y,kernel_list,d_m,constant=100,compute_gap=False)
        
        if abs(dual_obj_val)<abs(min_obj_val):
            min_obj_val=dual_obj_val
            min_gamma=gamma_i
        
    
    return min_gamma

def primal_dual_opt(X,y,m,kernel_type,order,gap=10e-4,weight_threshold=0.01,maxiter=250, verbose=True):
    """ X feature set, y are class outcomes
        d_m is weight vector  on kernels, alpha is coefficient vector
    """
    
    duality_gap=1
    C=0.01# penalization param
    line_search_steps=25
    n=len(y)
    counter=0
    gamma_max=0
    
    # optimziation init 
    d_m=np.ones(m)/m
    D=np.ones(m)
    mu=0
    nu=0
    
    
    kernel_list=[Kernel(kernel_type,i) for i in range(1,order+1)]

    
    # stopping criteria
    while duality_gap>gap:
        old_gap=duality_gap
        if counter>maxiter:
            return d_m
        counter+=1
        kernel=compose_kernels(X,kernel_list,d_m)
     
    
        # compute svm objective
        J_d,duality_gap,alpha=compute_dual(X,y,kernel_list,d_m,C) 
        if verbose:
            print("Duality",duality_gap)
         
        if abs(duality_gap-old_gap)<gap:
            return d_m
        
        # gradient wrt each kernel
        gradient_j=[compute_gradient(i,X,y,alpha) for i in kernel_list] 
        grad_mu=gradient_j[mu]
   
        # Need to investigate if normalization  occurs
        D=np.array([descent_direction(i,d_m[i],mu,gradient_j[i],grad_mu,D) for i in range(0,len(gradient_j))])
        norm_D=np.sqrt(D.dot(D))
        mu=np.argmax(d_m)
        D=D/norm_D
        
        J_hat=0
        d_hat=d_m
        D_hat=D
        
        
        # descent direction update
        ### Error -> Weight vector goes negative -> ratio goes negative -> step size goes negative
        
        
        while J_hat<J_d: 
            
            ratio_update=-d_hat[D_hat<0]/D_hat[D_hat<0]
            if len(ratio_update)==0:
                break
            else:
                ratio=ratio_update
            
            nu=np.argmin(ratio)

            D=D_hat
            d_m=d_hat

            gamma_max=-d_m[nu]/D[nu]
            
           
          
            d_hat=d_m+gamma_max*D
            d_hat[d_hat<weight_threshold]=0
            
            D_hat[mu]=D[mu]-D[nu]
            D_hat[nu]=0
          
            J_hat=compute_dual(X,y,kernel_list,d_hat,C,compute_gap=False)
            
        # line search in descent direction  
        gamma_step=line_search(X,y,kernel_list,D,d_m,gamma_max,disc=line_search_steps)
        
        d_m=(d_m+gamma_step*D)
        
        # normalize and drop threshold
        d_m[d_m<weight_threshold]=0
        d_m=d_m/np.sum(d_m)
        
        print(d_m)
       
        
    return d_m
    

In [5]:
def test_mkl(verbose=True):
    
    n=100 # data set size
    
    m= 5# num kernels
    
    y=np.sign(np.random.uniform(-1,1,size=n)) # sample class labels 1,-1
    x=np.random.rand(n,10) # # sample features
    kernel_type='polynomial'
    
  
    d_m=np.round(primal_dual_opt(x,y,m,kernel_type,order=m,verbose=verbose),2)
    
    
    return d_m 

test_mkl()

Duality 2682.368110808049
[0.24047638 0.18981959 0.18802484 0.19497584 0.18670335]
Duality 2683.4348995650707
[0.27896883 0.18013816 0.17663663 0.19019797 0.17405842]
Duality 2991.602059711026
[0.31561989 0.17091987 0.1657932  0.18564865 0.16201839]
Duality 2988.0514268714815
[0.3505586  0.16213227 0.15545639 0.18131188 0.15054087]
Duality 2991.3059777475733
[0.38390223 0.15374584 0.14559148 0.1771731  0.13958734]
Duality 3002.00904172969
[0.41575758 0.14573375 0.1361669  0.17321905 0.12912272]
Duality 3623.3806713815616
[0.44622211 0.13807147 0.12715379 0.16943764 0.11911499]
Duality 3008.196578121404
[0.47538496 0.13073657 0.1185258  0.1658178  0.10953487]
Duality 3631.6525083760675
[0.50332781 0.12370853 0.11025874 0.16234939 0.10035553]
Duality 3329.139716098166
[0.53012565 0.11696847 0.10233045 0.15902311 0.09155232]
Duality 3332.454963174247
[0.55584745 0.11049906 0.09472051 0.15583039 0.0831026 ]
Duality 3642.858633202395
[0.58055673 0.1042843  0.08741012 0.15276335 0.07498549]


  gamma_max=-d_m[nu]/D[nu]


ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values