### SimpleMKL Training

- In scikit-learn we utilized a single kernel (radial basis function) with cross validated bandwidth (via grid search)

- We now want to extend to the case of multiple kernels (linear combination of a basis set)

- Inspiration in our original implementation from here 

     - https://github.com/qintian0321/SimpleMKL_python

In [2]:
import numpy as np
import pandas as pd
from sklearn import svm

# SimpleMKL optimization objective

### Primal problem 


### Dual Problem

$$\max_\alpha \ \frac{-1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j \sum_m d_mK_m(x_i,x_j) +\sum_i \alpha_i$$

s.t. $$\sum_i \alpha_i y_i=0$$ and $$C \geq \alpha_i \geq 0$$



- Coefficient vector $ \alpha$ is importance of observed features on classification problem




### Abstract Kernel Class 

In [60]:
class Kernel():
    def __init__(self,kernel_type,order=None):
        self.kernel_type=kernel_type
        self.order=order
    
    def compute_kernel(self,X):
        
        if self.kernel_type=='linear':
            return np.dot(X,X.T)
        
        if self.kernel_type=='gaussian':
            return None
        
        if self.kernel_type=='polynomial':
            return np.dot(X**self.order,X.T**self.order)
        

### Functions for Multiple Kernel Optimization

In [207]:
def compute_dual(X,y,kernel_list,d_m,constant,compute_gap=True):
    """ Compute dual objective value
    """
    kernel=compose_kernels(X,kernel_list,d_m)
    single_kernel=svm.SVC(C=constant,kernel='precomputed')

    single_kernel.fit(kernel,y)
    
    alpha=np.empty(len(y))
    alpha[single_kernel.support_]=np.abs(single_kernel.dual_coef_[0]) 
    alpha[alpha==None]=0
    
    
    J=0.5*np.dot(np.dot(alpha,kernel*y),alpha.T)+np.sum(alpha)
    
    if compute_gap:
        kernel_eval=[np.dot(np.dot(np.dot(alpha,alpha.T),np.dot(y,y.T)),k_i.compute_kernel(X)) for k_i in kernel_list]
        duality_gap=J-np.sum(alpha)+0.5*np.max(kernel_eval)
        
        return J,duality_gap
    
    return J

def compose_kernels(X,kernel_list,weights):
    """ Compute positive linear combination of kernels 
    """
    return np.sum(np.array([weights[ct]*k_i.compute_kernel(X) for ct,k_i in enumerate(kernel_list)]),axis=0)
    
def compute_gradient(kernel,X,y,alpha):
    """ Compute gradient of MKL objective closed form 
    """
    kernel_mat=kernel.compute_kernel(X)
    gradient_obj=-0.5*np.dot(np.dot(np.dot(alpha,alpha.T),np.dot(y,y.T)),kernel_mat)
    
    return gradient_obj


def descent_direction(d_m,mu,grad_dm,grad_mu):
    """ Compute direction of gradient descent 
    
    THINK THERE IS AN ISSUE HERE 
    """
    if d_m==0 and grad_dm-grad_mu>0:
        return 0
    
    elif d_m>0 and d_m!=mu:
        return -grad_dm+grad_mu
    
    elif d_m==mu:
        pass
    
def line_search(X,y,kernel_list,D,d_m,gamma_max,disc):
    """ Selects step size to minimize obj value
    """
    
    # grid of step size begins bigger than 0
    grid=np.arange(0+gamma_max/disc,gamma_max,gamma_max/disc)
    
    min_gamma,min_obj_val=None,10e8
    for gamma_i in grid:
        d_i=d_m+gamma_i*D
        dual_obj_val=compute_dual(X,y,kernel_list,d_m,constant=100,compute_gap=False)
        
        
        
        if abs(dual_obj_val)<abs(min_obj_val):
            min_obj_val=dual_obj_val
            min_gamma=gamma_i
        
    
    return min_gamma

def primal_dual_opt(X,y,m,kernel_type,order,gap=10e-4,maxiter=10):
    """ X feature set, y are class outcomes
        d_m is weight vector  on kernels, alpha is coefficient vector
    """
    
    duality_gap=1
    C=0.1# penalization param
    line_search_steps=5
    n=len(y)
    counter=0
    
    # optimziation init 
    d_m=np.ones(m)/m
    D=np.ones(m)
    
    kernel_list=[Kernel(kernel_type,i) for i in range(1,order+1)]
    alpha=np.zeros(n)
    
    # stopping criteria
    while duality_gap>gap:
        
        if counter>maxiter:
            return d_m
        counter+=1
        kernel=compose_kernels(X,kernel_list,d_m)
     
    
        # compute svm objective
        J_d,duality_gap=compute_dual(X,y,kernel_list,d_m,C) 
        print(duality_gap)
        
        
        # gradient wrt each kernel
        gradient_j=[compute_gradient(i,X,y,alpha) for i in kernel_list] 
        mu=np.argmax(d_m)
        
        J_hat=0
        d_hat=d_m
        D_hat=D
        
        
        # descent direction
        while J_hat<J_d: 
            
            d_m=d_hat
            D=D_hat
            ratio=-d_m/D
          
            vu=np.argmin(ratio)
            gamma_max=d_m[vu]/D[vu]
            
            
            kernel=compose_kernels(X,kernel_list,d_m)
            J_hat=compute_dual(X,y,kernel_list,d_m,constant=1,compute_gap=False)
        
            
        # line search in descent direction  
        gamma_step=line_search(X,y,kernel_list,D,d_m,gamma_max,disc=line_search_steps)
        
        d_m=(d_m+gamma_step*D)/np.sum(d_m+gamma_step*D)
        
        
    return d_m
    

In [208]:
def test_mkl():
    
    n=50 # data set size
    
    m=3 # num kernels
    
    y=np.random.rand(n).round() # class labels
    x=np.random.rand(n,5) # 
    kernel_type='polynomial'
    order=3
  
    primal_dual_opt(x,y,m,kernel_type,order)
    
    
    return

test_mkl()

298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
298.390694660662
[0.33333333 0.33333333 0.33333333]
