In [100]:
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pandas as pd
import random as rd
from cvxpy import *
from sklearn.utils import shuffle


style.use('ggplot')


class SVM():
    """Linear SVM whose hyperplane is chosen through k-fold cross-validation.

    Intended call order: ``cross_validate(data)`` trains one model per fold,
    then ``fit()`` keeps the parameters of the best-scoring fold.

    Attributes written by the methods:
        w, b          -- hyperplane parameters read by ``predict``
        split_data    -- folds returned by the module-level ``split_data``
        results       -- list of ``(w, b, success_ratio)``, one entry per fold
        success_ratio -- best fold score, written by ``fit``
    """

    def fit(self):
        """Keep the ``(w, b)`` of the fold with the highest success ratio.

        Reads ``self.results`` (filled by ``cross_validate``). Ties keep the
        earliest fold. When no fold scores above 0, ``w`` and ``b`` are left
        as they are and ``success_ratio`` becomes 0.
        """
        best_success_ratio = 0
        for w, b, success_ratio in self.results:
            if success_ratio > best_success_ratio:
                best_success_ratio = success_ratio
                self.w = w
                self.b = b
        self.success_ratio = best_success_ratio

    @staticmethod
    def _features_and_labels(frame):
        """Split a fold into a float feature matrix and a float label vector."""
        # Built-in ``float`` replaces the ``np.float`` alias (removed in NumPy 1.24);
        # ``axis=1`` is passed by keyword because pandas 2.0 rejects it positionally.
        features = np.array(frame.drop(['class'], axis=1)).astype(float)
        labels = np.array(frame['class']).astype(float)
        return features, labels

    def cross_validate(self, data):
        """Train one model per fold and record each fold's held-out score.

        :param data: DataFrame holding the feature columns plus a 'class' column
        Stores the folds in ``self.split_data`` and the per-fold
        ``(w, b, success_ratio)`` tuples in ``self.results``. ``self.w`` and
        ``self.b`` are overwritten on every fold, so after this call they hold
        the last fold's model until ``fit()`` is called.
        """
        self.split_data = split_data(data)
        results = []
        for i, test in enumerate(self.split_data):
            # Every fold except the i-th one forms the training set.
            train = pd.concat(
                [fold for num, fold in enumerate(self.split_data) if num != i]
            )
            test_X, test_y = self._features_and_labels(test)
            train_X, train_y = self._features_and_labels(train)
            self.w, self.b = optimize(train_X, train_y)
            success_ratio = self.test(test_X, test_y)
            results.append((self.w, self.b, success_ratio))
        self.results = results

    def predict(self, X):
        """Return the sign of ``X . w + b`` using the current ``w`` and ``b``."""
        return np.sign(np.dot(X, self.w) + self.b)

    def test(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        :param X: sequence of feature vectors
        :param y: sequence of expected labels, indexable by position
        :return: correct / total as a float; 0.0 when ``X`` is empty
        """
        all_instances = len(X)
        if all_instances == 0:
            # An empty fold (more folds than rows) used to raise ZeroDivisionError.
            return 0.0
        correct_instances = sum(
            1 for index, instance in enumerate(X)
            if self.predict(instance) == y[index]
        )
        return correct_instances / all_instances
        
    
def normalize_01(X):
    """Rescale every column of ``X`` linearly to the range [0, 1].

    :param X: 2-D array-like, rows are samples and columns are features
    :return: float ndarray of the same shape; each column has its minimum
             mapped to 0 and its maximum to 1. A column whose values are all
             equal is mapped to 0 instead of NaN.
    """
    X = np.asarray(X, dtype=float)
    col_min = X.min(axis=0)
    # np.ptp (function form) is used because the ndarray.ptp method was
    # removed in NumPy 2.0.
    col_range = np.ptp(X, axis=0)
    # Avoid 0/0 for constant columns: the numerator is already 0 there.
    safe_range = np.where(col_range == 0, 1.0, col_range)
    return (X - col_min) / safe_range
    
    
    
def optimize(X, y, C=5):
    """Find the hyperplane (w, b) of a linear soft-margin SVM with CVXPY.

    Solves
        minimise    ||w|| + C * sum(e_i)
        subject to  y_i * (w^T x_i + b) >= 1 - e_i   and   e_i >= 0
    for every sample i, where e_i are the slack variables.

    :param X: 2-D array of samples (rows) by features (columns); must be non-empty
    :param y: one label per row of X (the formulation assumes +1 / -1 labels)
    :param C: weight of the slack penalty
    :return: tuple ``(w, b)`` -- ``w`` as a squeezed ndarray, ``b`` as returned
             by the solver variable's ``.value``
    :raises RuntimeError: if the solver finishes without producing a solution
    """
    n = len(X)
    m = len(X[0])

    w = Variable(m)
    b = Variable(1)
    e = Variable(n)

    obj = Minimize(norm(w) + C * sum_entries(e))

    # Slack non-negativity first, then one margin constraint per sample.
    constraints = [e >= 0]
    constraints += [1 - e[i] - y[i] * (w.T * X[i] + b) <= 0 for i in range(n)]

    prob = Problem(obj, constraints)
    prob.solve()

    # On solver failure the variable values stay None; previously that None was
    # wrapped into an object array and returned, failing later inside predict().
    if w.value is None or b.value is None:
        raise RuntimeError(
            'SVM optimisation did not produce a solution (status: %s)' % prob.status
        )

    weights = np.squeeze(np.asarray(w.T.value))
    bias = b.value
    return (weights, bias)
        
        
        

def split_data(data, k=10, random_state=None):
    '''
    Shuffle the rows of ``data`` and cut the result into ``k`` consecutive folds.

    :param data: DataFrame to split (it is not modified; a shuffled copy is used)
    :param k: number of folds
    :param random_state: optional seed forwarded to ``sklearn.utils.shuffle`` so
                         the folds can be reproduced; ``None`` keeps the
                         unseeded behaviour
    :return: list of ``k`` slices of the shuffled, re-indexed data
    '''
    shuffled = shuffle(data, random_state=random_state)
    # Renumber rows 0..n-1 so the folds carry a clean positional index.
    shuffled = shuffled.reset_index(drop=True)
    return partition(shuffled, k)
        


def partition(lst, n):
    '''
    Cut a sliceable sequence into ``n`` consecutive, near-equal pieces.

    :param lst: sequence supporting ``len`` and slicing
    :param n: number of pieces
    :return: list of ``n`` slices that together cover ``lst`` in order
    '''
    step = len(lst) / n
    # n + 1 rounded cut points; piece k spans cut point k to cut point k + 1.
    bounds = [round(step * k) for k in range(n + 1)]
    return [lst[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]




# Load the data set, replace '?' entries (presumably missing values) with the
# sentinel -99 so every column can be cast to float, and drop the 'id' column.
# ``axis=1`` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df = (
    pd.read_csv('breast-cancer-wisconsin.data.txt')
    .replace('?', -99)
    .drop(['id'], axis=1)
)

# Remap the class labels to the +1 / -1 encoding: 2 -> 1 and 4 -> -1.
df.loc[df['class'] == 2, 'class'] = 1
df.loc[df['class'] == 4, 'class'] = -1


# Feature matrix as floats (built-in ``float``; the ``np.float`` alias was
# removed in NumPy 1.24) and the label column.
y = df['class']
X = np.array(df.drop(['class'], axis=1)).astype(float)


# NOTE(review): the cells further down use ``clf`` (clf.fit(), clf.predict, ...),
# so these two lines have to be executed first on a fresh kernel.
#clf = SVM()

#clf.cross_validate(df)
normalize_01(X[:10])


array([[ 0.57142857,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.25      ,  0.        ,  0.        ],
       [ 0.57142857,  0.33333333,  0.33333333,  0.57142857,  1.        ,
         1.        ,  0.25      ,  0.16666667,  0.        ],
       [ 0.28571429,  0.        ,  0.        ,  0.        ,  0.        ,
         0.11111111,  0.25      ,  0.        ,  0.        ],
       [ 0.71428571,  0.77777778,  0.77777778,  0.        ,  0.2       ,
         0.33333333,  0.25      ,  1.        ,  0.        ],
       [ 0.42857143,  0.        ,  0.        ,  0.28571429,  0.        ,
         0.        ,  0.25      ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.25      ,  0.        ,  0.        ],
       [ 0.14285714,  0.        ,  0.11111111,  0.        ,  0

In [73]:
# Keep the w and b of the cross-validation fold with the highest success ratio.
clf.fit()

In [75]:
# Weight vector currently stored on the classifier.
clf.w

array([-0.309228  , -0.05133639, -0.30427716, -0.0908156 , -0.06622205,
       -0.0250459 , -0.31781802, -0.09022429, -0.09415279])

In [76]:
# Bias term currently stored on the classifier.
clf.b

4.9657522942709402

In [77]:
# Preview the first ten rows of the prepared frame.
df[:10]

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,1
1,5,4,4,5,7,10,3,2,1,1
2,3,1,1,1,2,2,3,1,1,1
3,6,8,8,1,3,4,3,7,1,1
4,4,1,1,3,2,1,3,1,1,1
5,8,10,10,8,7,10,9,7,1,-1
6,1,1,1,1,2,10,3,1,1,1
7,2,1,2,1,2,1,3,1,1,1
8,2,1,1,1,2,1,1,1,5,1
9,4,2,1,1,2,1,2,1,1,1


In [80]:
# Feature vector of the first sample.
a = X[0]
a

array([ 5.,  1.,  1.,  1.,  2.,  1.,  3.,  1.,  1.])

In [93]:
# Manual accuracy check over every row of X:
# ``uspeli`` counts correct predictions, ``vsi`` counts all samples.
uspeli = 0 
vsi = 0
for i in range(len(X)):
    vsi += 1
    # y is indexed by label here, which matches the row position in X only
    # when y has the default 0..n-1 index.
    if clf.predict(X[i]) == y[i]:
        uspeli += 1
# Fraction of samples classified correctly.
uspeli / vsi

0.9628040057224606

In [95]:
# Success ratio stored by fit(): the highest single-fold score among the
# cross-validation results.
clf.success_ratio

0.9857142857142858