### Multiclass Support Vector Machine

&nbsp;

This notebook focuses on multiclass SVM with different methods including OVR (One-Versus-Rest), OVO (One-Versus-One), DAG (Directed Acyclic Graph). 

Reference to Binary SVM

https://github.com/je-suis-tm/machine-learning/blob/master/binary%20support%20vector%20machine.ipynb

In [1]:
import networkx as nx
import cvxopt.solvers
import pandas as pd
import numpy as np
import copy
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import os
os.chdir('d:/python/data')

In [2]:
#plz refer to binary svm for this function
def binary_svm(x_train,y_train,kernel='linear',poly_constant=0.0,poly_power=1,gamma=5):

    y_product=np.outer(y_train,y_train)
    
    if kernel=='linear':
        x_product=np.outer(x_train,x_train)
    elif kernel=='polynomial':
        arr=np.outer(x_train,x_train)
        x_product=np.apply_along_axis(lambda x:(x+poly_constant)**poly_power,0,arr.ravel()).reshape(arr.shape)
    else:
        arr=np.mat([i-j for j in x_train for i in x_train]).reshape(len(x_train),len(x_train))
        x_product=np.apply_along_axis(lambda x:np.exp(-1*gamma*(np.linalg.norm(x))**2),0,arr.ravel()).reshape(arr.shape)
    
    P=cvxopt.matrix(x_product*y_product)
    q=cvxopt.matrix(-1*np.ones(len(x_train)))
    G=cvxopt.matrix(np.diag(-1 * np.ones(len(x_train))))
    h=cvxopt.matrix(np.zeros(len(x_train)))
    A=cvxopt.matrix(y_train,(1,len(x_train)))
    b=cvxopt.matrix(0.0)

    solution=cvxopt.solvers.qp(P, q, G, h, A, b)
    alpha=pd.Series(solution['x'])
    w=np.sum(alpha*y_train*x_train)

    b=-(min(x_train[y_train==1.0]*w)+max(x_train[y_train==-1.0]*w))/2

    return w,b    

In [3]:
#first, one vs one multiclass svm
#given n classes, we do n*(n-1)/2 times binary classification as one vs one
#we would obtain w and b for each binary classification
#when we make a prediction, we use each w and b to get the classification
#now that we have a classification list of n*(n-1)/2
#we just select the value with the most frequency in the list
#that would be our prediction, voila!
def get_accuracy_ovo(train,test,**kwargs):
    
    #calculate w and b for each binary classification
    multiclass=train['y'].drop_duplicates().tolist()
    multiclass_params={}
    for i in range(len(multiclass)):
        for j in range(i+1,len(multiclass)):
            data=copy.deepcopy(train)
            arr=np.select([data['y']==multiclass[i],data['y']==multiclass[j]], \
                            [-1.0,1.0],default=0.0)
            data['y']=arr
            data=data[data['y']!=0.0]
            multiclass_params['{},{}'.format(multiclass[i], \
                                             multiclass[j])]=binary_svm(data['x'], \
                                                                        data['y'], \
                                                                        **kwargs)
            
    result=[]
    
    #store all the predictions in one list
    #and select the value with the most frequency in this list
    predict=[]
    for i in train['x']:
        arr=[]
        for j in multiclass_params:
            w=multiclass_params[j][0]
            b=multiclass_params[j][1]
            value=np.sign(np.multiply(w,i)+b)
            arr.append(j.split(',')[0] if value==-1.0 else j.split(',')[1])
        
        predict.append(max(set(arr), key=arr.count))
        
    predict=pd.Series(predict).apply(int)
    result.append('train accuracy: %.2f'%(
        len(predict[predict==train['y']])/len(predict)*100)+'%')
    
    
    #kinda the same as training sample prediction
    predict=[]
    for i in test['x']:
        arr=[]
        for j in multiclass_params:
            w=multiclass_params[j][0]
            b=multiclass_params[j][1]
            value=np.sign(np.multiply(w,i)+b)
            arr.append(j.split(',')[0] if value==-1 else j.split(',')[1])
            
        predict.append(max(set(arr), key=arr.count))

    predict=pd.Series(predict).apply(int)
    
    result.append('test accuracy: %.2f'%(
        len(predict[predict==test['y']])/len(predict)*100)+'%')
    
    return result

In [4]:
#alternatively, one vs rest multiclass svm
#given n classes, we do n times binary classification as one vs rest
#we would obtain w and b for each binary classification
#when we make a prediction, we use each w and b to get the decision function value
#we select the classifier with the maximum decision function value
#that classifier would return +1.0 and we would take it as the result
def get_accuracy_ovr(train,test,**kwargs):
    
    multiclass=train['y'].drop_duplicates()
    multiclass_params={}
    
    #calculate w and b for each binary classification
    for i in multiclass:
        data=copy.deepcopy(train)
        data['y']=np.where(data['y']==i,1.0,-1.0)
        multiclass_params[i]=binary_svm(data['x'],data['y'],**kwargs)

    result=[]
        
    #store all the decision function values in one list
    #and select the classifier which gives the largest value
    predict=[]
    for i in train['x']:
        max_value=float('-inf')
        idx=0
        for j in multiclass_params:
            w=multiclass_params[j][0]
            b=multiclass_params[j][1]
            value=np.multiply(w,i)+b
            if value>max_value:
                max_value=value
                idx=j
    
        predict.append(idx)
    
    predict=pd.Series(predict).apply(int)
    result.append('train accuracy: %.2f'%(
        len(predict[predict==train['y']])/len(predict)*100)+'%')
    
    #kinda the same as training sample prediction
    predict=[]
    for i in test['x']:
        max_value=float('-inf')
        idx=0
        for j in multiclass_params:
            w=multiclass_params[j][0]
            b=multiclass_params[j][1]
            value=np.multiply(w,i)+b
            if value>max_value:
                max_value=value
                idx=j
    
        predict.append(idx)

    predict=pd.Series(predict).apply(int)
    result.append('test accuracy: %.2f'%(
        len(predict[predict==test['y']])/len(predict)*100)+'%')

    return result

In [5]:
#dagsvm is not supported in sklearn
#it is an optimization for multiclass ovo
#it uses graph theory to avoid n*(n-1)/2 binary classification
#it only takes n-1 binary classification
def get_accuracy_dag(train,test,**kwargs):
    
    #the same as ovo
    #except one more line to build a graph structure
    #we denote the class as the node
    #the edge as binary svm between two classes
    #the weight as parameters w and b
    multiclass=train['y'].drop_duplicates().tolist()
    graph=nx.DiGraph()
    for i in range(len(multiclass)):
        for j in range(i+1,len(multiclass)):
                
            data=copy.deepcopy(train)
            arr=np.select([data['y']==multiclass[i],data['y']==multiclass[j]], \
                            [-1.0,1.0],default=0.0)
            data['y']=arr
            data=data[data['y']!=0.0]
            graph.add_edge(multiclass[i],multiclass[j],weight=binary_svm(data['x'], \
                                                                        data['y'], \
                                                                        **kwargs))
   
    result=[]
    
    #use directed acyclic graph to boost the speed of ovo
    #for ovo, the time complexity is n*(n-1)/2
    #where n is the dimension of classes
    #for dag, the time complexity is only n-1
    #in dag, once we have checked two classes and got the result
    #we would remove the negative result from the graph structure
    #and move on to the comparison with the next class 
    #until we only have one class left in dag
    #which would become the final result
    #as a tradeoff for time complexity
    #the result isnt as accurate as ovo
    predict=[]
    for i in train['x']:
        g=copy.deepcopy(graph)
        
        while len(g.nodes)>1:
            
            #beware, graph.nodes aint a list type
            node0=list(g.nodes)[0]
            node1=list(g.nodes)[1]
            
            #since the graph structure is directed
            #the opposite direction between nodes do not exist
            #we would not get the parameters w and b
            #thats why we need to put a try to avoid keyerror
            #if we make this graph structure undirected
            #it would be impossible to identify which class is -1.0
            try:
                w=g[node0][node1]['weight'][0]
                b=g[node0][node1]['weight'][1]
                value=np.sign(np.multiply(w,i)+b)
                g.remove_node(node1 if value==-1 else node0)
            except KeyError:
                w=g[node1][node0]['weight'][0]
                b=g[node1][node0]['weight'][1]
                value=np.sign(np.multiply(w,i)+b)
                g.remove_node(node0 if value==-1 else node1)
        
        predict+=list(g.nodes)
        
    predict=pd.Series(predict).apply(int)
    result.append('train accuracy: %.2f'%(
        len(predict[predict==train['y']])/len(predict)*100)+'%')
    
    
    #the same as training samples
    predict=[]
    for i in test['x']:
        g=copy.deepcopy(graph)
        
        while len(g.nodes)>1:
            
            node0=list(g.nodes)[0]
            node1=list(g.nodes)[1]
            
            try:
                w=g[node0][node1]['weight'][0]
                b=g[node0][node1]['weight'][1]
                value=np.sign(np.multiply(w,i)+b)
                g.remove_node(node1 if value==-1 else node0)
            except KeyError:
                w=g[node1][node0]['weight'][0]
                b=g[node1][node0]['weight'][1]
                value=np.sign(np.multiply(w,i)+b)
                g.remove_node(node0 if value==-1 else node1)
        
        predict+=list(g.nodes)
        
    predict=pd.Series(predict).apply(int)
    result.append('test accuracy: %.2f'%(
        len(predict[predict==test['y']])/len(predict)*100)+'%')
    
    return result

In [6]:
#using official sklearn package with the same parameters
def skl_multiclass_svm(x_train,x_test,y_train,y_test,**kwargs):
    
    m=SVC(**kwargs).fit(np.array(x_train).reshape(-1, 1), \
                        np.array(y_train).ravel())
    
    train=m.score(np.array(x_train).reshape(-1, 1), \
                  np.array(y_train).ravel())*100
    test=m.score(np.array(x_test).reshape(-1, 1), \
                 np.array(y_test).ravel())*100
    
    print('\ntrain accuracy: %s'%(train)+'%')
    print('\ntest accuracy: %s'%(test)+'%')

### ETL

In [7]:
df=pd.read_csv('iris.csv')

In [8]:
df['y']=np.select([df['type']=='Iris-setosa', \
                   df['type']=='Iris-versicolor', \
                   df['type']=='Iris-virginica'],[1,2,3])

In [9]:
#for simplicity, let us reduce the dimension of x to 1
#reference to pca
# https://github.com/je-suis-tm/machine-learning/blob/master/principal%20component%20analysis.ipynb
high_dims=pd.concat([df[i] for i in df.columns if 'length' in i or 'width' in i],axis=1)
x=PCA(n_components=1).fit_transform(high_dims)

In [10]:
x=pd.Series([x[i].item() for i in range(len(x))])
y=df['y']

In [11]:
#train test split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

In [12]:
#crucial!!!!
#or we would get errors in the next step
x_test.reset_index(inplace=True,drop=True)
y_test.reset_index(inplace=True,drop=True)
x_train.reset_index(inplace=True,drop=True)
y_train.reset_index(inplace=True,drop=True)

In [13]:
train=pd.DataFrame({'x':x_train,'y':y_train})
test=pd.DataFrame({'x':x_test,'y':y_test})

### Run

In [14]:
ovr=get_accuracy_ovr(train,test)

     pcost       dcost       gap    pres   dres
 0: -9.6089e+01 -2.2204e+02  1e+02  2e-15  2e+00
 1: -2.0969e+02 -2.1293e+02  3e+00  2e-14  1e+00
 2: -1.3831e+04 -1.3834e+04  2e+00  2e-13  1e+00
 3: -8.3446e+07 -8.3446e+07  1e+02  7e-09  1e+00
 4: -5.0973e+10 -5.0973e+10  8e+04  2e-06  1e+00
Terminated (singular KKT matrix).
     pcost       dcost       gap    pres   dres
 0: -1.7104e+01 -3.4514e+01  4e+02  2e+01  2e+00
 1: -2.0842e+01 -1.6714e+01  2e+02  8e+00  7e-01
 2: -2.0117e+01 -1.0351e+01  1e+02  4e+00  4e-01
 3: -2.2540e+00 -1.9307e+00  8e+00  3e-01  3e-02
 4: -6.3726e-01 -1.3288e+00  7e-01  9e-16  2e-15
 5: -1.1219e+00 -1.2159e+00  9e-02  3e-16  2e-15
 6: -1.1852e+00 -1.1959e+00  1e-02  5e-16  2e-15
 7: -1.1955e+00 -1.1956e+00  2e-04  2e-16  2e-15
 8: -1.1956e+00 -1.1956e+00  2e-06  1e-15  2e-15
 9: -1.1956e+00 -1.1956e+00  2e-08  8e-16  2e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -4.5089e+01 -1.0384e+02  3e+02  1e+01  2e+00
 1: -9.9358e+0

In [15]:
ovo=get_accuracy_ovo(train,test)

     pcost       dcost       gap    pres   dres
 0: -6.1869e+00 -1.2269e+01  2e+02  1e+01  2e+00
 1: -7.9032e+00 -5.8295e+00  6e+01  4e+00  5e-01
 2: -5.5158e-01 -2.0978e+00  1e+01  4e-01  6e-02
 3: -4.5189e-01 -1.3564e+00  9e-01  4e-16  8e-16
 4: -1.0793e+00 -1.2008e+00  1e-01  2e-16  1e-15
 5: -1.1741e+00 -1.1977e+00  2e-02  7e-16  1e-15
 6: -1.1954e+00 -1.1957e+00  2e-04  4e-16  1e-15
 7: -1.1956e+00 -1.1956e+00  2e-06  5e-16  1e-15
 8: -1.1956e+00 -1.1956e+00  2e-08  1e-15  1e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -2.7304e+01 -7.4656e+01  3e+02  1e+01  3e+00
 1: -8.4162e+01 -1.7339e+02  3e+02  1e+01  2e+00
 2: -5.3939e+02 -6.7259e+02  1e+02  7e+00  1e+00
 3: -1.3672e+03 -1.6072e+03  2e+02  6e+00  1e+00
 4: -2.2173e+03 -2.5712e+03  4e+02  6e+00  1e+00
 5: -4.1833e+03 -4.7694e+03  6e+02  6e+00  1e+00
 6: -4.3857e+03 -4.9965e+03  6e+02  6e+00  1e+00
 7: -2.3723e+04 -2.5237e+04  2e+03  6e+00  1e+00
 8: -2.6862e+05 -2.7513e+05  7e+03  6e+00  1e+0

In [16]:
dag=get_accuracy_dag(train,test)

     pcost       dcost       gap    pres   dres
 0: -6.1869e+00 -1.2269e+01  2e+02  1e+01  2e+00
 1: -7.9032e+00 -5.8295e+00  6e+01  4e+00  5e-01
 2: -5.5158e-01 -2.0978e+00  1e+01  4e-01  6e-02
 3: -4.5189e-01 -1.3564e+00  9e-01  4e-16  8e-16
 4: -1.0793e+00 -1.2008e+00  1e-01  2e-16  1e-15
 5: -1.1741e+00 -1.1977e+00  2e-02  7e-16  1e-15
 6: -1.1954e+00 -1.1957e+00  2e-04  4e-16  1e-15
 7: -1.1956e+00 -1.1956e+00  2e-06  5e-16  1e-15
 8: -1.1956e+00 -1.1956e+00  2e-08  1e-15  1e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -2.7304e+01 -7.4656e+01  3e+02  1e+01  3e+00
 1: -8.4162e+01 -1.7339e+02  3e+02  1e+01  2e+00
 2: -5.3939e+02 -6.7259e+02  1e+02  7e+00  1e+00
 3: -1.3672e+03 -1.6072e+03  2e+02  6e+00  1e+00
 4: -2.2173e+03 -2.5712e+03  4e+02  6e+00  1e+00
 5: -4.1833e+03 -4.7694e+03  6e+02  6e+00  1e+00
 6: -4.3857e+03 -4.9965e+03  6e+02  6e+00  1e+00
 7: -2.3723e+04 -2.5237e+04  2e+03  6e+00  1e+00
 8: -2.6862e+05 -2.7513e+05  7e+03  6e+00  1e+0

In [17]:
print('one vs rest self implementation')
for i in ovr:
    print('\n',i)

one vs rest self implementation

 train accuracy: 89.52%

 test accuracy: 97.78%


In [18]:
#normally ovo should work better than ovr 
#as time complexity of ovo is higher
#n*(n-1)/2>n
print('one vs one self implementation')
for i in ovo:
    print('\n',i)

one vs one self implementation

 train accuracy: 89.52%

 test accuracy: 97.78%


In [19]:
print('dag self implementation')
for i in dag:
    print('\n',i)

dag self implementation

 train accuracy: 89.52%

 test accuracy: 97.78%


In [20]:
print('one vs rest sklearn')
skl_multiclass_svm(x_train,x_test,y_train,y_test,kernel='linear',decision_function_shape='ovr')

one vs rest sklearn

train accuracy: 92.38095238095238%

test accuracy: 88.88888888888889%


In [21]:
print('one vs one sklearn')
skl_multiclass_svm(x_train,x_test,y_train,y_test,kernel='linear',decision_function_shape='ovo')

one vs one sklearn

train accuracy: 92.38095238095238%

test accuracy: 88.88888888888889%
