In [None]:
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pandas as pd
from cvxpy import *
from sklearn.utils import shuffle


# Apply matplotlib's 'ggplot' style sheet to every figure drawn below.
style.use('ggplot')


class SVM():
    """Linear support vector machine solved as a convex program with cvxpy.

    The training frame must contain the label column ``'razred'``; predictions
    are produced with ``np.sign`` and compared against the labels, so labels
    are expected to be +1 / -1.  Every other column is used as a numeric
    feature and is min-max scaled before training.

    Attributes set by the methods:
        data: private copy of the raw training frame.
        normalized_data: copy of ``data`` rescaled by ``normalize_df``.
        minimum, maximum: per-feature min / max of the raw data, in the
            frame's own column order.
        w, b: hyperplane parameters (set by ``fit``).
        split_data, results, accuracy_test, accuracy: set by ``cross_validate``.
    """

    def __init__(self, data):
        """Store a copy of ``data`` and pre-compute its normalized form.

        :param data: DataFrame with numeric feature columns and a 'razred'
            label column.  It is not modified.
        """
        self.data = data.copy()
        # Normalize a separate copy so that neither the caller's frame nor
        # self.data is rescaled, whether or not normalize_df works in place.
        self.normalized_data = normalize_df(self.data.copy())
        # Keep the frame's own column order (Index.difference would sort the
        # names) so the bounds line up with the weight vector used in predict().
        features = self.data.drop(['razred'], axis=1)
        self.maximum = np.array(features.max())
        self.minimum = np.array(features.min())

    @staticmethod
    def _to_arrays(frame):
        """Split a frame into a float feature matrix and a float label vector."""
        X = np.array(frame.drop(['razred'], axis=1)).astype(float)
        y = np.array(frame['razred']).astype(float)
        return X, y

    @staticmethod
    def _min_max_scale(X, minimum, maximum):
        """Rescale X column-wise so that [minimum, maximum] maps onto [0, 1]."""
        minimum = np.asarray(minimum, dtype=float)
        span = np.asarray(maximum, dtype=float) - minimum
        # A constant feature has zero span; divide by 1 so it maps to 0
        # instead of producing NaN / inf.
        span = np.where(span == 0, 1.0, span)
        return (np.asarray(X, dtype=float) - minimum) / span

    def fit(self, C=5):
        """Train on the whole normalized data set and store ``self.w`` / ``self.b``.

        :param C: slack penalty; ``C == 0`` selects the hard-margin problem.
        """
        X, y = self._to_arrays(self.normalized_data)
        self.w, self.b = self._fit(X, y, C)

    def cross_validate(self, k=10, C=5):
        """Estimate accuracy with k-fold cross-validation on the raw data.

        Each fold is scaled with the min / max of its *training* part, the
        same way ``predict`` scales new samples with the training bounds.

        :param k: number of folds (at least 2).
        :param C: slack penalty passed to the solver for every fold.
        :raises ValueError: if ``k`` is smaller than 2.

        Sets ``self.split_data`` (the folds), ``self.results`` (list of
        ``(w, b, accuracy)`` per fold), ``self.accuracy_test`` (array of fold
        accuracies) and ``self.accuracy`` (their mean).
        """
        if k < 2:
            raise ValueError('cross_validate needs at least 2 folds, got %r' % (k,))
        self.split_data = split_data(self.data, k)
        results = []
        tests = []
        for i, test in enumerate(self.split_data):
            train = pd.concat(
                [fold for num, fold in enumerate(self.split_data) if num != i]
            )
            train_X, train_y = self._to_arrays(train)
            test_X, test_y = self._to_arrays(test)
            # Bounds come from the training part only.
            lowest = train_X.min(axis=0)
            highest = train_X.max(axis=0)
            train_X = self._min_max_scale(train_X, lowest, highest)
            test_X = self._min_max_scale(test_X, lowest, highest)
            w, b = self._fit(train_X, train_y, C)
            accuracy = self.test(test_X, test_y, w, b)
            results.append((w, b, accuracy))
            tests.append(accuracy)
        self.results = results
        self.accuracy_test = np.array(tests)
        self.accuracy = np.mean(self.accuracy_test)

    def predict(self, X):
        """Classify raw (unscaled) sample(s) with the model stored by ``fit``.

        :param X: one sample or a 2-D array of samples, features in the same
            order as the training frame's feature columns.
        :return: ``np.sign`` of the decision value(s).
        """
        X = self._min_max_scale(X, self.minimum, self.maximum)
        return np.sign(np.dot(X, self.w) + self.b)

    def _predict(self, X, w, b):
        """Classify already-scaled sample(s) with an explicit hyperplane."""
        return np.sign(np.dot(X, w) + b)

    def test(self, X, y, w, b):
        """Return the fraction of rows of ``X`` that ``(w, b)`` labels as ``y``.

        :param X: 2-D array of already-scaled samples.
        :param y: expected labels, one per row of ``X``.
        :raises ZeroDivisionError: if ``X`` has no rows.
        """
        predictions = np.ravel(self._predict(np.asarray(X), w, b))
        correct_instances = int(np.sum(predictions == np.ravel(y)))
        return correct_instances / len(predictions)

    def visualize(self):
        """Scatter-plot the normalized data with the fitted line and margins.

        Only available for exactly two feature columns; otherwise a message
        string is returned and nothing is drawn.  ``fit`` must have been
        called so that ``self.w`` and ``self.b`` exist.
        """
        if len(self.data.columns) != 3:
            return 'Number of attribute dimensions is not 2'
        self.fig = plt.figure()
        self.ax = self.fig.add_subplot(1, 1, 1)
        frame = self.normalized_data
        # Feature names in frame order, wherever the label column sits.
        fst_att, snd_att = [col for col in frame.columns if col != 'razred']
        first_class = frame.loc[frame['razred'] == 1]
        second_class = frame.loc[frame['razred'] == -1]
        self.ax.scatter(first_class[fst_att], first_class[snd_att], color='red')
        self.ax.scatter(second_class[fst_att], second_class[snd_att], color='blue')

        # Solve w0*x + w1*y + b = c for y, with c = 0 (boundary) and
        # c = +1 / -1 (margins).
        t = np.arange(-0.2, 1.0, 0.01)
        w0, w1 = self.w[0], self.w[1]
        s = - w0 / w1 * t - self.b / w1
        s_plus = - w0 / w1 * t - (self.b - 1) / w1
        s_minus = - w0 / w1 * t - (self.b + 1) / w1

        self.ax.plot(t, s, 'black')
        self.ax.plot(t, s_plus, 'r--')
        self.ax.plot(t, s_minus, 'b--')

        plt.show()

    def _fit(self, X, y, C=5):
        """Solve the SVM optimization problem and return ``(w, b)``.

        Minimizes ``||w|| + C * sum(e_i)`` subject to
        ``y_i (w^T x_i + b) >= 1 - e_i`` and ``e_i >= 0``.  With ``C == 0``
        the slack variables are dropped and the hard-margin problem
        (minimize ``||w||`` subject to ``y_i (w^T x_i + b) >= 1``) is solved.

        :param X: 2-D array of scaled samples, one row per sample.
        :param y: label per row of ``X``.
        :param C: slack penalty.
        :return: ``(w, b)`` with ``w`` a squeezed numpy array and ``b`` a float.
        :raises ValueError: if the solver returns no solution (for example a
            hard-margin problem on data that is not linearly separable).
        """
        # NOTE(review): the calls below (sum_entries, `*` as matrix product)
        # look like the cvxpy 0.4-era API -- confirm the installed version.
        X = np.asarray(X, dtype=float)
        n, m = X.shape

        w = Variable(m)
        b = Variable(1)

        if C == 0:
            obj = Minimize(norm(w))
            constraints = []
            for i in range(n):
                constraints.append(1 - y[i] * (w.T * X[i] + b) <= 0)
        else:
            e = Variable(n)
            obj = Minimize(norm(w) + C * sum_entries(e))
            constraints = [e >= 0]
            for i in range(n):
                constraints.append(1 - e[i] - y[i] * (w.T * X[i] + b) <= 0)

        prob = Problem(obj, constraints)
        prob.solve()

        if w.value is None or b.value is None:
            raise ValueError(
                'SVM optimization produced no solution (status: %s)' % prob.status
            )

        weights = np.squeeze(np.asarray(w.T.value))
        # b.value may be a scalar or a one-element array; return a plain float.
        bias = float(np.squeeze(np.asarray(b.value)))
        return (weights, bias)
        
    

def normalize_df(df):
    """Return a copy of ``df`` with every column except 'razred' min-max scaled.

    Each feature column is mapped to ``(x - min) / (max - min)``.  A constant
    column (max == min) is mapped to 0 instead of NaN.  The input frame is
    left unmodified.

    :param df: DataFrame with numeric feature columns (a 'razred' column, if
        present, is passed through unchanged).
    :return: new DataFrame with the same columns and index.
    """
    scaled = df.copy()
    feature_cols = scaled.columns.difference(['razred'])
    features = scaled[feature_cols]
    lowest = features.min()
    span = features.max() - lowest
    # Avoid 0/0 for constant columns.
    span = span.where(span != 0, 1)
    scaled[feature_cols] = (features - lowest) / span
    return scaled
    
def normalize_01(X):
    """Min-max scale the columns of an array to the range [0, 1].

    :param X: array-like of numbers; statistics are taken along axis 0.
    :return: float array ``(X - min) / (max - min)`` computed per column.
        A constant column (max == min) is mapped to 0 instead of NaN.
    """
    X = np.asarray(X, dtype=float)
    # np.ptp is used as a function because the ndarray.ptp method no longer
    # exists in NumPy 2.0.
    span = np.ptp(X, axis=0)
    span = np.where(span == 0, 1.0, span)
    X_normed = (X - X.min(axis=0)) / span
    return X_normed
    
def split_data(data, k=10, random_state=None):
    '''
    Shuffle the rows of a frame and cut it into k consecutive parts.

    :param data: DataFrame to split; it is not modified (a shuffled copy is used)
    :param k: number of parts
    :param random_state: optional seed forwarded to sklearn's shuffle so the
        same folds can be reproduced; None keeps the shuffle random
    :return: list of k DataFrames produced by partition()
    '''
    shuffled = shuffle(data, random_state=random_state)
    # Renumber the rows 0..n-1 after shuffling.
    shuffled = shuffled.reset_index(drop=True)
    return partition(shuffled, k)
        
def partition(lst, n):
    '''
    Cut a sliceable sequence into n consecutive, nearly equal pieces.

    :param lst: sequence supporting len() and slicing
    :param n: number of pieces
    :return: list of n slices of lst, in order
    '''
    step = len(lst) / n
    # Cut points 0 .. len(lst); neighbouring pieces share a boundary.
    cuts = [round(step * idx) for idx in range(n + 1)]
    return [lst[start:stop] for start, stop in zip(cuts, cuts[1:])]



# df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# df.replace('?', -99, inplace=True)
# df.drop(['id'], 1, inplace=True)
# df[['bare_nuclei']] = df[['bare_nuclei']].astype(float)
#df.rename(columns={'class': 'razred'}, inplace=True)

# df.loc[df['razred'] == 2, 'razred'] = 1
# df.loc[df['razred'] == 4, 'razred'] = -1


# X = df.drop(['razred'], 1)
# y = df['razred']

# X = np.array(X).astype(np.float)


# clf = SVM(df)

# clf.cross_validate()

#################################

# df = pd.read_csv('bezdekIris.txt', sep=",", header=None)
# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'razred' ]

# df = df[df.razred != 'Iris-versicolor']


# df.replace({'Iris-setosa': 1, 'Iris-virginica': -1,}, inplace=True)
# df.reset_index()


# df.drop(['petal_length', 'petal_width'], 1, inplace=True)

# # X = df.drop(['razred'], 1)
# # y = df['razred']

# # X = np.array(X).astype(np.float)
# # y = np.array(y).astype(np.float)


# clf = SVM(df)

# clf.cross_validate()


###################################

# df = pd.read_csv('bezdekIris.txt', sep=",", header=None)
# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'razred' ]

# df = df[df.razred != 'Iris-setosa']


# df.replace({'Iris-versicolor': 1, 'Iris-virginica': -1,}, inplace=True)
# df.reset_index()


# df.drop(['petal_length', 'petal_width'], 1, inplace=True)

# X = df.drop(['razred'], 1)
# y = df['razred']

# X = np.array(X).astype(np.float)
# y = np.array(y).astype(np.float)


# clf = SVM(df)

# clf.cross_validate()

####################################

# df = pd.read_csv('bezdekIris.txt', sep=",", header=None)
# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'razred' ]

# df = df[df.razred != 'Iris-virginica']


# df.replace({'Iris-setosa': 1, 'Iris-versicolor': -1,}, inplace=True)
# df.reset_index()


# df.drop(['petal_length', 'petal_width'], 1, inplace=True)


# clf = SVM(df)

# clf.cross_validate()


In [None]:
# Load the Iris file (it has no header row) and name its five columns;
# 'razred' is the label column used by the SVM class.
df = pd.read_csv('bezdekIris.txt', sep=",", header=None)
df.columns = ['dolzina_casnega_lista', 'sirina_casnega_lista', 'dolzina_vencnega_lista', 'sirina_vencnega_lista', 'razred' ]
# Shuffle the rows and renumber them 0..n-1 (one reset is enough).
df = shuffle(df).reset_index(drop=True)
df[:10]

In [None]:
# Keep two classes, encode them as +1 / -1 and keep only the first two
# feature columns. Built as one chain so no in-place operation runs on a
# filtered slice; the axis is passed by keyword (pandas 2.0 removed the
# positional form) and the index reset is assigned so it takes effect.
df = (
    df[df.razred != 'Iris-virginica']
    .replace({'Iris-setosa': 1, 'Iris-versicolor': -1})
    .drop(['dolzina_vencnega_lista', 'sirina_vencnega_lista'], axis=1)
    .reset_index(drop=True)
)
df[:10]

In [None]:
# Build the classifier from the prepared two-class frame.
clf = SVM(df)

In [None]:
# Train on the whole normalized data set with the default C=5; stores clf.w and clf.b.
clf.fit()

In [None]:
# Plot both classes with the fitted decision line and its two margin lines.
clf.visualize()

In [None]:
# Cross-validate with the default k=10 folds; fills clf.results, clf.accuracy_test and clf.accuracy.
clf.cross_validate()

In [None]:
# Mean accuracy over the cross-validation folds.
clf.accuracy

In [None]:
# Classify one raw (unscaled) two-feature sample; predict() rescales it with the stored min/max.
clf.predict([6.0,3.0])

In [None]:
# Load the breast-cancer data, replace '?' entries with the sentinel -99
# (presumably missing-value markers -- verify against the data set notes),
# drop the 'id' column and rename the label column to 'razred'.
# The axis is passed by keyword because pandas 2.0 removed the positional form.
df = (
    pd.read_csv('breast-cancer-wisconsin.data.txt')
    .replace('?', -99)
    .drop(['id'], axis=1)
    .rename(columns={'class': 'razred'})
)
df['bare_nuclei'] = df['bare_nuclei'].astype(float)
# Re-encode the labels 2 / 4 as +1 / -1, the values the SVM class compares against.
df.loc[df['razred'] == 2, 'razred'] = 1
df.loc[df['razred'] == 4, 'razred'] = -1
df[:10]

In [None]:
# Build a new classifier from the breast-cancer frame (replaces the Iris one).
clf = SVM(df)

In [None]:
# Train on the whole normalized data set with the default C=5.
clf.fit()

In [None]:
# Cross-validate with the default k=10 folds.
clf.cross_validate()

In [None]:
# Mean accuracy over the cross-validation folds.
clf.accuracy

In [None]:
# Classify one raw sample given as nine feature values.
clf.predict([5, 4, 4, 5, 7, 10, 3, 2, 1])

In [None]:
# Second Iris experiment: versicolor (+1) against virginica (-1) on the two
# sepal features.
df = pd.read_csv('bezdekIris.txt', sep=",", header=None)
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'razred' ]

# One chain instead of in-place operations on a filtered slice; the axis is
# passed by keyword (pandas 2.0 removed the positional form) and the index
# reset is assigned so it takes effect.
df = (
    df[df.razred != 'Iris-setosa']
    .replace({'Iris-versicolor': 1, 'Iris-virginica': -1})
    .drop(['petal_length', 'petal_width'], axis=1)
    .reset_index(drop=True)
)


clf = SVM(df)
clf.fit()
clf.cross_validate()

In [None]:
# Plot both classes with the fitted decision line and its two margin lines.
clf.visualize()

In [None]:
# Mean accuracy over the cross-validation folds.
clf.accuracy