In [1]:
import cvxopt.solvers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
# switch to the folder that holds iris.csv
# the folder can be overridden with the IRIS_DATA_DIR environment variable
# when it is not set, the original location is used unchanged
os.chdir(os.environ.get('IRIS_DATA_DIR','d:/python/data'))

In [2]:
# read the iris table
# the relative file name is resolved against the current working directory
df = pd.read_csv(filepath_or_buffer='iris.csv')

In [3]:
#cvxopt wants floating point labels rather than integers
#and a binary classifier needs its two classes coded as -1.0 and 1.0
#setosa becomes -1.0, virginica becomes 1.0
#versicolor (and anything unmatched, via the np.select default) becomes 0.0
species = df['type']
is_species = [species.eq(name) for name in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')]
df['y'] = np.select(is_species, [-1.0, 0.0, 1.0])

In [4]:
#keep the rows coded -1.0 or 1.0 and discard every row whose label is 0.0
df = df.loc[df['y'].ne(0.0)]

In [5]:
#pick every column whose name mentions length or width
#and project them onto a single principal component
#note the projection is fitted on all remaining rows, before the train/test split
feature_names = [name for name in df.columns if 'length' in name or 'width' in name]
x = PCA(n_components=1).fit_transform(df[feature_names])

In [6]:
#each row of x holds a single value
#flatten it into a one dimensional series with a fresh 0..n-1 index
x = pd.Series(np.ravel(x))

In [7]:
#the -1.0/1.0 labels of the remaining rows
#this series still carries the row labels of the filtered dataframe
y = df.loc[:, 'y']

In [8]:
#hold out 30% of the samples for testing
#random_state pins the shuffle
#so the split, the solver log and the accuracies are the same on every run
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [9]:
#this step is crucial
#after the split the four series carry shuffled index labels
#and x (labelled 0..n-1) and y (labelled like the filtered dataframe) do not even share labels
#pandas arithmetic and comparisons align on labels, not on position
#so every piece is renumbered from 0 and the old labels are dropped
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (x_train, x_test, y_train, y_test)
)

In [10]:
#outer product matrices of the training data
#entry (i,j) of y_product is y_train[i]*y_train[j]
#entry (i,j) of x_product is x_train[i]*x_train[j]
#the same table could be built with a double loop over both indices
#reshaped to len(y_train) by len(y_train)
#or as a column vector multiplied by a row vector
y_product = np.multiply.outer(np.asarray(y_train), np.asarray(y_train))
x_product = np.multiply.outer(np.asarray(x_train), np.asarray(x_train))

In [11]:
#quadratic term handed to the qp solver
#elementwise product, so P[i,j]=y_i*y_j*x_i*x_j
P = cvxopt.matrix(np.multiply(x_product, y_product))

In [12]:
#linear term handed to the qp solver
#a vector of -1.0 with one entry per training sample
q = cvxopt.matrix(np.full(len(x_train), -1.0))

In [13]:
#inequality matrix handed to the qp solver
#a square matrix with -1.0 on the diagonal and zero elsewhere
G = cvxopt.matrix(np.diag(np.full(len(x_train), -1.0)))

In [14]:
#right hand side of the inequality constraint
#a vector of zeros with one entry per training sample
h = cvxopt.matrix(np.zeros(x_train.shape[0]))

In [15]:
#equality matrix handed to the qp solver
#the training labels laid out as a single row
#y_train and x_train come from the same split, so they have the same length
A = cvxopt.matrix(y_train, (1, len(y_train)))

In [16]:
#right hand side of the equality constraint
#a one by one matrix holding 0.0
#careful: the name b is rebound to the intercept of the classifier further down
b = cvxopt.matrix(0.0, (1, 1))

In [17]:
#solve the quadratic program
#per the cvxopt documentation qp minimizes (1/2)x'Px+q'x subject to Gx<=h and Ax=b
#b must still be the one by one cvxopt matrix here
#it is overwritten by the intercept in a later cell
#so rerun the cell that defines it before rerunning this one
solution = cvxopt.solvers.qp(P, q, G=G, h=h, A=A, b=b)

     pcost       dcost       gap    pres   dres
 0: -2.7074e+00 -4.0020e+00  2e+02  1e+01  1e+00
 1: -2.1585e-01 -5.7775e-01  1e+01  1e+00  1e-01
 2:  3.4062e-02 -4.9433e-01  5e-01  1e-16  1e-15
 3: -1.1555e-01 -1.7878e-01  6e-02  2e-17  4e-16
 4: -1.3169e-01 -1.9041e-01  6e-02  5e-17  4e-16
 5: -1.7161e-01 -1.7761e-01  6e-03  7e-17  3e-16
 6: -1.7575e-01 -1.7584e-01  1e-04  3e-17  3e-16
 7: -1.7582e-01 -1.7582e-01  1e-06  3e-17  3e-16
 8: -1.7582e-01 -1.7582e-01  1e-08  6e-17  4e-16
Optimal solution found.


In [18]:
#the optimal x returned by the solver, one value per training sample
#stored as a series with a 0..n-1 index so it lines up with x_train and y_train
alpha = pd.Series(list(solution['x']))

In [19]:
#weight of the single feature
#the sum over all training samples of alpha_i*y_i*x_i
w = (alpha * y_train * x_train).sum()

In [20]:
#intercept computed the way prof andrew ng does it
#take the smallest w*x among the positive class and the largest w*x among the negative class
#the intercept is minus the midpoint of the two
#a plain average, np.mean(y_train-w*x_train), would be an alternative
#note this rebinds b, which held the equality constraint value of the qp before
lowest_positive = min(w * x_train[y_train == 1.0])
highest_negative = max(w * x_train[y_train == -1.0])
b = -(lowest_positive + highest_negative) / 2

In [21]:
#predict with the sign of w*x+b and report the share of training labels hit, in percent
train_hits = y_train[np.sign(w * x_train + b) == y_train]
train_accuracy = len(train_hits) / len(y_train) * 100
print('train accuracy: %s' % train_accuracy + '%')

train accuracy: 100.0%


In [22]:
#predict with the sign of w*x+b and report the share of held out labels hit, in percent
test_hits = y_test[np.sign(w * x_test + b) == y_test]
test_accuracy = len(test_hits) / len(y_test) * 100
print('test accuracy: %s' % test_accuracy + '%')

test accuracy: 100.0%
