Use the `sklearn.svm` toolkit.

In [26]:
import pandas as pd
from sklearn import svm
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
# Read the training and test spreadsheets (train first, then test).
train, test = (pd.read_excel(path) for path in ('train.xlsx', 'test.xlsx'))
# Both frames get the same three column names; the code below reads 'dig'
# as the class label and the remaining two columns as the input features.
train.columns = test.columns = ['dig', 'inten', 'symm']


def one_vs_all(tar, train, C, Q):
    """Fit a one-versus-all polynomial-kernel SVM for digit ``tar``.

    Rows whose ``dig`` equals ``tar`` are labelled 1 and every other row 0.
    The kernel ``(1 + x . x') ** Q`` is evaluated up front on columns 1-2 of
    ``train`` and passed to ``svm.SVC(kernel="precomputed")``.

    Args:
        tar: digit treated as the positive class.
        train: DataFrame with a ``dig`` column followed by two feature columns.
        C: soft-margin penalty passed to ``svm.SVC``.
        Q: degree of the polynomial kernel.

    Returns:
        ``[clf, gram, Y]``: the fitted classifier, the training Gram matrix
        and the 0/1 label vector.
    """
    Y = np.zeros(train.shape[0])
    Y[train.dig == tar] = 1
    X = train.iloc[:, 1:3]
    gram = (1 + np.dot(X, X.T)) ** Q
    # Honour the caller's C; it used to be pinned to 0.01 whatever was passed.
    clf = svm.SVC(C=C, kernel="precomputed")
    clf.fit(gram, Y)
    return [clf, gram, Y]
# In-sample error of the "digit versus all" classifier for each even digit
# (C = 0.01, Q = 2).
err = []
for digit in (0, 2, 4, 6, 8):
    clf, gram, Y = one_vs_all(digit, train, 0.01, 2)
    train_pred = clf.predict(gram)
    # Error = 1 - (share of training points whose prediction matches Y).
    n_match = np.sum(np.sign(train_pred) == np.sign(Y))
    err.append(1 - n_match / train_pred.shape[0])
print(err)


[0.10588396653408316, 0.10026059525442321, 0.089425318886298122, 0.091071183651076693, 0.074338225209162001]


Q2. As shown in the output, 0 has the highest $E_{in}$

In [18]:
# Same experiment for the odd digits 1, 3, 5, 7, 9 (C = 0.01, Q = 2).
err = []
for digit in np.arange(1, 11, 2):
    clf, gram, Y = one_vs_all(digit, train, 0.01, 2)
    train_pred = clf.predict(gram)
    hits = np.sign(train_pred) == np.sign(Y)
    # In-sample error is the complement of the hit rate.
    err.append(1 - np.sum(hits) / train_pred.shape[0])
print(err)

[0.014401316691811772, 0.090248251268687407, 0.076258400768070222, 0.088465231106844011, 0.088328075709779186]


Q3. As shown in the output, 1 has the lowest $E_{in}$

In [19]:
# Accumulators filled by the C sweep below: in-sample error, number of
# support vectors and out-of-sample error, one entry per value of C.
err, sv, err_out = [], [], []
def one_vs_one(tar1, tar2, train, Co, Q):
    """Fit a polynomial-kernel SVM separating digit ``tar1`` from ``tar2``.

    Only rows of ``train`` whose ``dig`` is ``tar1`` or ``tar2`` are used;
    ``tar1`` rows are labelled 1 and ``tar2`` rows 0. The kernel
    ``(1 + x . x') ** Q`` is precomputed on columns 1-2 and ``Co`` is the
    penalty passed to ``svm.SVC``. Returns the fitted classifier.
    """
    pair_rows = train[train['dig'].isin([tar1, tar2])]
    labels = np.where(pair_rows.dig == tar1, 1.0, 0.0)
    features = pair_rows.iloc[:, 1:3]
    kernel = (1 + np.dot(features, features.T)) ** Q
    model = svm.SVC(C=Co, kernel="precomputed")
    # fit() returns the estimator itself.
    return model.fit(kernel, labels)
def error_compute(clf, train, test, Q, tar1=1, tar2=5):
    """Return the misclassification rate of ``clf`` on ``test``.

    Args:
        clf: classifier fitted on a precomputed polynomial kernel; only its
            ``predict`` method is used.
        train: DataFrame the classifier was fitted on (``dig`` column followed
            by two feature columns).
        test: DataFrame to evaluate on, same layout as ``train``.
        Q: degree of the polynomial kernel ``(1 + x . x') ** Q``.
        tar1: digit labelled 1. Defaults to 1.
        tar2: digit labelled 0. Defaults to 5.

    Returns:
        ``1 - (matching predictions) / (evaluated rows)`` over the rows of
        ``test`` whose ``dig`` is ``tar1`` or ``tar2``.
    """
    pair = [tar1, tar2]
    eval_rows = test[test['dig'].isin(pair)]
    Yout = np.zeros(eval_rows.shape[0])
    Yout[eval_rows.dig == tar1] = 1
    Xout = eval_rows.iloc[:, 1:3]
    # A precomputed kernel needs one column per fitted sample, so select the
    # same subset of ``train`` that the classifier was fitted on.
    X = train[train['dig'].isin(pair)].iloc[:, 1:3]
    gram_out = (1 + np.dot(Xout, X.T)) ** Q
    test_pred = clf.predict(gram_out)
    n_match = np.sum(np.sign(test_pred) == np.sign(Yout))
    return 1 - n_match / test_pred.shape[0]
# Sweep C for the 1-vs-5 classifier with the degree-2 polynomial kernel and
# record in-sample error, support-vector count and out-of-sample error.
for c in (0.001, 0.01, 0.1, 1):
    clf = one_vs_one(1, 5, train, c, 2)
    # Passing train as the evaluation set gives the in-sample error.
    err.append(error_compute(clf, train, train, 2))
    # n_support_ holds one count per class; sum them for the total.
    sv.append(np.sum(clf.n_support_))
    err_out.append(error_compute(clf, train, test, 2))
print('in sample error', err)
print('number of supporter vectors', sv)
print('out of sample error', err_out)

in sample error [0.004484304932735439, 0.004484304932735439, 0.004484304932735439, 0.0032030749519538215]
number of supporter vectors [76, 34, 24, 24]
out of sample error [0.01650943396226412, 0.018867924528301883, 0.018867924528301883, 0.018867924528301883]


Q4. As shown above, $E_{in}$, $E_{out}$ and the number of support vectors do not go strictly up or down as C increases.
The only correct option is that the maximum C achieves the lowest $E_{in}$.

In [21]:
# Compare Q = 2 and Q = 5 for the 1-vs-5 classifier over four values of C.
# Row i of each table corresponds to the i-th value of C.
error = np.empty([4, 4])
sv = np.empty([4, 2])
for i, c in enumerate([0.0001, 0.001, 0.01, 1]):
    clf2 = one_vs_one(1, 5, train, c, 2)
    clf5 = one_vs_one(1, 5, train, c, 5)
    sv[i] = [np.sum(clf2.n_support_), np.sum(clf5.n_support_)]
    # Column order: Q=2 Ein, Q=2 Eout, Q=5 Ein, Q=5 Eout.
    error[i] = [
        error_compute(clf2, train, train, 2),
        error_compute(clf2, train, test, 2),
        error_compute(clf5, train, train, 5),
        error_compute(clf5, train, test, 5),
    ]
error = pd.DataFrame(error, columns=['Q=2 Ein', 'Q=2 Eout', 'Q=5 Ein', 'Q=5 Eout'])
sv = pd.DataFrame(sv, columns=['Q=2 n_sv', 'Q=5 n_sv'])
print(error)
print(sv)

    Q=2 Ein  Q=2 Eout   Q=5 Ein  Q=5 Eout
0  0.008969  0.016509  0.004484  0.018868
1  0.004484  0.016509  0.004484  0.021226
2  0.004484  0.018868  0.003844  0.021226
3  0.003203  0.018868  0.003203  0.021226
   Q=2 n_sv  Q=5 n_sv
0     236.0      26.0
1      76.0      25.0
2      34.0      23.0
3      24.0      21.0


Q6, as shown above. C=0.0001, $E_{in}$ is lower at Q=5  
C = 0.001, n_support_vector is lower at Q=5  
C = 0.01, $E_{in}$ is lower at Q=5  
C = 1, $E_{out}$ is higher at Q=5  

In [23]:
def cross_val(train, tar1, tar2, k, c, Q):
    """Return the k-fold cross-validation error of the tar1-vs-tar2 SVM.

    The rows of ``train`` whose ``dig`` is ``tar1`` or ``tar2`` are split into
    ``k`` shuffled folds. For each fold a classifier is fitted on the other
    folds with ``one_vs_one`` and scored on the held-out fold with
    ``error_compute``.

    Args:
        train: DataFrame with a ``dig`` column followed by two feature columns.
        tar1: first digit of the pair.
        tar2: second digit of the pair.
        k: number of folds.
        c: penalty passed on to ``one_vs_one``.
        Q: degree of the polynomial kernel.

    Returns:
        The mean of the ``k`` per-fold validation errors.

    NOTE(review): ``error_compute`` is called without any digit arguments, so
    whatever digit pair it uses internally must agree with ``(tar1, tar2)``
    for the result to be meaningful.
    """
    # No random_state is set, so every call draws a fresh random split.
    kf = KFold(n_splits=k, shuffle=True)
    X_use = train[train['dig'].isin([tar1, tar2])]
    fold_errors = []
    for train_index, test_index in kf.split(X_use):
        Xtrain = X_use.iloc[train_index, :]
        Xtest = X_use.iloc[test_index, :]
        clf = one_vs_one(tar1, tar2, Xtrain, c, Q)
        fold_errors.append(error_compute(clf, Xtrain, Xtest, Q))
    # Average over all k folds; previously only the last fold's error was
    # returned because the mean was taken over the scalar loop variable.
    return np.mean(fold_errors)

# Repeat 10-fold cross validation 100 times. Each run records the position
# in Cs of the C with the smallest validation error; the C that wins most
# often is printed.
s = []
Cs = [0.0001, 0.001, 0.01, 0.1, 1]
for _ in range(100):
    error = np.array([cross_val(train, 1, 5, 10, c, 2) for c in Cs])
    s.append(np.argmin(error))
counts = np.bincount(np.array(s))
print(Cs[np.argmax(counts)])


0.001


Q7. As shown here, I use the KFold function to randomly split the training set into a training part and a validation part (10 folds). The most frequent winner is C = 0.001.

In [24]:
# Average the 10-fold cross-validation error for C = 0.001, Q = 2 over
# 100 independent random splits.
error = np.array([cross_val(train, 1, 5, 10, 0.001, 2) for _ in range(100)])
print(np.mean(error))

0.00480769230769


Q8. As shown here, the cross-validation error is closest to 0.005.

In [27]:
# 1-vs-5 problem with an RBF kernel: digit 1 is labelled 1, digit 5 is 0.
Xin_use = train[train['dig'].isin([1, 5])]
Xout_use = test[test['dig'].isin([1, 5])]
Yin = np.where(Xin_use.dig == 1, 1.0, 0.0)
Yout = np.where(Xout_use.dig == 1, 1.0, 0.0)
Xin = Xin_use.iloc[:, 1:3]
Xout = Xout_use.iloc[:, 1:3]
Cs = [0.01, 1, 100, 10**4, 10**6]
# NOTE(review): gamma is left at the library default, which may differ
# between scikit-learn versions -- set it explicitly if a specific kernel
# width is intended.
# Ein collects the in-sample *accuracy* per C; the error is printed as
# 1 - accuracy.
Ein = []
for c in Cs:
    clf = svm.SVC(C=c, kernel='rbf').fit(Xin, Yin)
    Y_pred = np.sign(clf.predict(Xin))
    Ein.append(accuracy_score(np.sign(Yin), Y_pred))
print(1 - np.array(Ein))

[ 0.00384369  0.0044843   0.00320307  0.00256246  0.00128123]


Q9. As shown above, the largest C gives the lowest $E_{in}$. This makes sense: the RBF kernel corresponds to an infinite-dimensional feature space, so the stricter the error tolerance (the larger C), the higher the effective dimension the model reaches, and therefore the lower the in-sample error.

In [28]:
# Out-of-sample counterpart of the RBF sweep: refit on the training pair
# for every C and score on the held-out 1-vs-5 test rows. Eout collects
# accuracies; the error is printed as 1 - accuracy.
Eout = []
for c in Cs:
    clf = svm.SVC(C=c, kernel='rbf').fit(Xin, Yin)
    Y_pred = np.sign(clf.predict(Xout))
    Eout.append(accuracy_score(np.sign(Yout), Y_pred))
print(1 - np.array(Eout))

[ 0.02122642  0.02122642  0.01886792  0.01886792  0.02122642]


Q10. As shown above, C=100 gives the lowest $E_{out}$ (tied with C=$10^4$). As reasoned before, the larger C is, the higher the effective dimension the model reaches, which increases the risk of overfitting. Since SVM controls overfitting well on its own, the differences are small.