##### Kernel Notes
* A kernel gives you the result of the inner product in the transformed space without having to carry out the entire transformation
    * K(X, X') = (1+X*X')^p, p and n variables
* Sometimes, a linearly separable form does not exist for your data
    * RBF kernel exp(-gamma * ||x-x'||^2) -> should always find a dimension in which the data is separable, but is this helpful?
    * Risk overfitting
        * If you fit past data perfectly, since future data is not perfect, your model will mess up
    * Want less than 10% of Support Vectors compared to Total Samples (#SV/#Samples)
    * Soft vs Hard Margin (Degree of Error vs. Perfect Separating Line)
    * Slack: yi(xi*w + b) >= 1 - SLACK, where SLACK >= 0
        * Want to minimize this SLACK
        * Minimize 0.5||w||^2 + cSUM(SLACK)
            * Variable c will punish slack, higher c -> more punishment

Code For Using CVXOPT to solve the optimization problem
Uses three different kernel types

In [1]:
import numpy as np
from numpy import linalg
import cvxopt
import cvxopt.solvers

def linear_kernel(x1, x2):
    """Linear kernel: the plain dot product of ``x1`` and ``x2``."""
    inner_product = np.dot(x1, x2)
    return inner_product

def polynomial_kernel(x, y, p=3):
    """Polynomial kernel: ``1 + dot(x, y)`` raised to the power ``p`` (default 3)."""
    shifted_product = 1 + np.dot(x, y)
    return shifted_product ** p

def gaussian_kernel(x, y, sigma=5.0):
    """Gaussian (RBF) kernel: ``exp(-||x - y||^2 / (2 * sigma^2))``.

    ``sigma`` is the kernel width and defaults to 5.0.
    """
    squared_distance = linalg.norm(x - y) ** 2
    denominator = 2 * (sigma ** 2)
    return np.exp(-squared_distance / denominator)

class SVM(object):
    """Binary support vector machine trained by solving the dual QP with CVXOPT.

    ``kernel`` is a callable taking two sample vectors and returning a scalar.
    ``C`` is the soft-margin penalty; ``C=None`` selects the hard-margin
    problem (multipliers are only bounded below by zero).

    After ``fit`` the instance holds:
        a     -- Lagrange multipliers of the support vectors
        sv    -- the support vectors (rows of the training matrix)
        sv_y  -- labels of the support vectors
        b     -- intercept
        w     -- weight vector (linear kernel only, otherwise ``None``)
    """

    def __init__(self, kernel=linear_kernel, C=None):
        """Store the kernel callable and the (optional) soft-margin penalty."""
        self.kernel = kernel
        # Keep None as the hard-margin marker; any other value is coerced to float.
        self.C = None if C is None else float(C)

    def fit(self, X, y):
        """Train on samples ``X`` (2-D, one row per sample) with labels ``y``.

        Labels are expected to be -1/+1 (the dual formulation below and the
        sign-based ``predict`` assume this). Prints the number of support
        vectors and the number of training samples.
        """
        X = np.asarray(X)
        # The QP solver only accepts double-precision matrices, so integer
        # labels must be converted before they are handed to cvxopt.
        y = np.asarray(y, dtype=float)
        n_samples, n_features = X.shape

        # Gram matrix: K[i, j] = kernel(X[i], X[j]).
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])

        # Dual problem in cvxopt form:
        #   minimize 0.5 * a^T P a + q^T a   subject to   G a <= h,  A a = b
        P = cvxopt.matrix(np.outer(y, y) * K)
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y, (1, n_samples))
        b = cvxopt.matrix(0.0)

        if self.C is None:
            # Hard margin: -a <= 0.
            G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            # Soft margin: stack -a <= 0 on top of a <= C.
            lower = np.diag(np.ones(n_samples) * -1)
            upper = np.identity(n_samples)
            # np.vstack takes ONE sequence of arrays, not separate arguments.
            G = cvxopt.matrix(np.vstack((lower, upper)))
            lower = np.zeros(n_samples)
            upper = np.ones(n_samples) * self.C
            h = cvxopt.matrix(np.hstack((lower, upper)))

        solution = cvxopt.solvers.qp(P, q, G, h, A, b)

        # Lagrange multipliers, flattened to a 1-D array.
        a = np.ravel(solution['x'])

        # Samples whose multiplier exceeds the 1e-5 threshold are kept as
        # support vectors.
        sv = a > 1e-5
        ind = np.arange(len(a))[sv]
        self.a = a[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        # Pass the counts as separate print arguments; str + int raises TypeError.
        print("Support Vectors", len(self.a))
        print("Total Samples", n_samples)

        # Intercept: average over support vectors of (label - kernel expansion).
        self.b = 0
        for n in range(len(self.a)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.a * self.sv_y * K[ind[n], sv])
        self.b /= len(self.a)

        # An explicit weight vector is only built for the linear kernel.
        if self.kernel == linear_kernel:
            self.w = np.zeros(n_features)
            for n in range(len(self.a)):
                self.w += self.a[n] * self.sv_y[n] * self.sv[n]
        else:
            self.w = None

    def project(self, X):
        """Return the decision value for each row of ``X``.

        Uses ``dot(X, w) + b`` when a weight vector is available, otherwise
        the kernel expansion over the stored support vectors plus ``b``.
        """
        if self.w is not None:
            return np.dot(X, self.w) + self.b
        else:
            y_predict = np.zeros(len(X))
            for i in range(len(X)):
                s = 0
                for a, sv_y, sv in zip(self.a, self.sv_y, self.sv):
                    s += a * sv_y * self.kernel(X[i], sv)
                y_predict[i] = s
            return y_predict + self.b

    def predict(self, X):
        """Return the sign of the decision value for each row of ``X``."""
        return np.sign(self.project(X))
            

* OVR: One vs Rest Separation
    * Create a separating hyperplane between each class and all of the other classes
    * What if the data is very evenly separated? Then it is hard to tell the difference
* OVO: One vs One Seperation
    * One vs One between every pair
    * More complicated
        * Sometimes the result can change, and you then have to find which class it's classified into

In [17]:
import numpy as np
from sklearn import preprocessing, model_selection, svm
import pandas as pd

# Load the data, swap the '?' placeholders for a large sentinel value and
# drop the 'id' column so it is not used as a feature.
df = (
    pd.read_csv('breast-cancer-wisconsin.txt')
    .replace('?', -99999)
    .drop(columns=['id'])
)

# Features are every remaining column except the label column 'class'.
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2
)

# Linear-kernel SVC with C=10 (svm.SVC() would use the library defaults).
clf = svm.SVC(C=10, kernel="linear")
clf.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
acc = clf.score(x_test, y_test)
print(acc)

AttributeError: 'SVC' object has no attribute 'n_support'