# Линейный SVM "своими руками"

## Генерируем обучающую и тестовую выборку для экспериментов

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Synthetic binary classification problem: 10000 samples with 20 features,
# all of them informative (no redundant ones), reproducible via the seed.
X, y = datasets.make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=20,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Hold out 20% of the samples for testing; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check of the dataset and training-split sizes.
print(len(X), len(y))
print(len(X_train))

In [None]:
# Remap class label 0 to -1 in place on both splits, so the labels take the
# values -1/+1 that the hinge loss below multiplies by f(x).
# Safe to re-run: after the first pass there are no zeros left to replace.
y_train[y_train == 0] = -1
y_test[y_test == 0] = -1

## Пишем свой класс для SVM

In [None]:
import numpy as np
from random import randint
import random


# Seed both the NumPy global RNG and the stdlib `random` module so that the
# random weight initialisation and mini-batch sampling are reproducible.
np.random.seed(42)
random.seed(42)


class MySVM:
    """Linear soft-margin SVM trained with mini-batch subgradient descent.

    The decision function is ``f(x) = <w, x> + w0`` and the predicted class
    is ``+1`` when ``f(x) > 0`` and ``-1`` otherwise.  Training minimises the
    mean hinge loss over a mini-batch plus the L2 penalty ``||w||^2 / (2C)``.

    Args:
        C: Regularization constant; the L2 penalty is divided by it, so a
            larger ``C`` means weaker regularization.
        batch_size: Number of samples drawn (with replacement) per step.
        max_iter: Hard cap on the number of gradient steps.
        tol: Relative change of the batch loss below which a step counts
            as "stable".
        patience: Number of consecutive stable steps required to stop early.

    Attributes set by ``fit``:
        w: Weight vector of shape ``(n_features,)``.
        w0: Scalar bias.
    """

    def __init__(self, C=10000, batch_size=100, max_iter=10000, tol=1e-3,
                 patience=100):
        self.C = C  # regularization constant
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.tol = tol
        self.patience = patience

    def f(self, x):
        """Return ``<w, x> + w0`` for one sample ``(m,)`` or a batch ``(n, m)``."""
        return np.dot(x, self.w) + self.w0

    def a(self, x):
        """Return the predicted label (``1`` or ``-1``) for a single sample."""
        return 1 if self.f(x) > 0 else -1

    def predict(self, X_test):
        """Return an array of ``+1``/``-1`` predictions, one per row of ``X_test``."""
        samples = np.asarray(X_test)
        if samples.size == 0:
            return np.empty(0, dtype=int)
        return np.where(self.f(samples) > 0, 1, -1)

    def reg(self):
        """Return the L2 regularization term ``||w||^2 / (2C)``."""
        return np.dot(self.w, self.w) / (2.0 * self.C)

    def der_reg(self):
        """Return the gradient of the L2 regularization term w.r.t. ``w``."""
        return self.w / self.C

    def loss(self, x, answer):
        """Return the hinge loss ``max(0, 1 - y * f(x))`` for each sample.

        ``x`` is one sample ``(m,)`` or a batch ``(n, m)``; ``answer`` holds
        the matching label(s) in ``{-1, +1}``.
        """
        margins = np.asarray(answer) * self.f(np.asarray(x))
        return np.maximum(0.0, 1.0 - margins)

    def _dl(self, x_v, answer_v):
        """Return the hinge-loss derivative w.r.t. ``f(x)`` for one sample."""
        return -answer_v if 1 - answer_v * self.f(x_v) > 0 else 0.0

    def der_loss(self, x, answer):
        """Return the hinge-loss derivative w.r.t. ``f(x)`` for each sample.

        The value is ``-y`` where the margin is violated (``1 - y*f(x) > 0``)
        and ``0`` elsewhere.
        """
        answer = np.asarray(answer)
        margins = answer * self.f(np.asarray(x))
        return np.where(1.0 - margins > 0, -answer, 0.0)

    def der_loss_wrt_w(self, x, answer):
        """Return the batch-mean hinge-loss gradient w.r.t. ``w``.

        Expects a 2-D batch ``x`` of shape ``(n, m)``; returns shape ``(m,)``.
        """
        x = np.asarray(x)
        return np.mean(np.multiply(x.T, self.der_loss(x, answer)), axis=1)

    def der_loss_wrt_w0(self, x, answer):
        """Return the batch-mean hinge-loss gradient w.r.t. the bias ``w0``."""
        return np.mean(self.der_loss(x, answer))

    @staticmethod
    def trans_to_01(y):
        """Map labels ``{-1, 1}`` to ``{0, 1}`` in place and return ``y``."""
        y[y == -1] = 0
        return y

    @staticmethod
    def trans_to_11(y):
        """Map labels ``{0, 1}`` to ``{-1, 1}`` in place and return ``y``."""
        y[y == 0] = -1
        return y

    def fit(self, X_train, y_train):
        """Fit ``w`` and ``w0`` with mini-batch subgradient descent.

        Args:
            X_train: Array-like of shape ``(n_samples, n_features)``.
            y_train: Array-like of ``n_samples`` labels in ``{-1, 1}`` or
                ``{0, 1}``.  The caller's array is not modified.

        Raises:
            ValueError: If ``X_train`` is not a non-empty 2-D array or the
                number of labels does not match the number of samples.
        """
        X_train = np.asarray(X_train)
        if X_train.ndim != 2 or X_train.shape[0] == 0:
            raise ValueError("X_train must be a non-empty 2-D array")
        # trans_to_11 works in place, so relabel a private copy.
        y_train = self.trans_to_11(np.array(y_train))
        if y_train.shape[0] != X_train.shape[0]:
            raise ValueError("X_train and y_train must have the same length")

        n_samples, dim = X_train.shape
        self.w = np.random.rand(dim)  # initial value for w
        self.w0 = np.random.randn()  # initial value for w_0

        # Iterate until the batch loss has been stable for `patience`
        # consecutive steps, or until the step budget is exhausted.
        stable_steps = 0
        step_idx = 0
        while stable_steps < self.patience and step_idx < self.max_iter:
            # Random mini-batch, sampled with replacement.
            batch = np.random.randint(low=0, high=n_samples,
                                      size=self.batch_size)
            x = X_train[batch]
            y = y_train[batch]

            loss_before = self.loss(x, y).sum()

            # Simple heuristic for the step size: harmonic decay.
            step = 1.0 / (step_idx + 1)

            # Evaluate both gradients at the same point before touching
            # either parameter.
            grad_w = self.der_loss_wrt_w(x, y) + self.der_reg()
            grad_w0 = self.der_loss_wrt_w0(x, y)
            self.w -= step * grad_w
            self.w0 -= step * grad_w0

            loss_after = self.loss(x, y).sum()
            delta = abs(loss_after - loss_before)
            # Same test as delta / loss_after > tol, but written without a
            # division so a zero batch loss cannot produce nan/inf.
            if delta > self.tol * loss_after:
                stable_steps = 0
            else:
                stable_steps += 1
            step_idx += 1

## Пробуем обучить наш классификатор и посмотреть на качество на тесте

In [None]:
# Train the hand-written SVM on the training split with C=100 and mini-batches
# of 200 samples, then print the learned weight vector and bias.
model = MySVM(C=100, batch_size=200)
model.fit(X_train, y_train)
print(model.w, model.w0)

In [None]:
from checkers import svm_checker

In [None]:
# Create a checker object from the project-local `checkers.svm_checker` module.
# NOTE(review): the name `pip` hides IPython's `pip` automagic in this session.
pip = svm_checker.Checker()

In [None]:
# Run the checker on the SVM implementation script at the given path.
# NOTE(review): what check() validates or returns is not visible here — confirm
# against the checkers package.
pip.check(script_path='./checkers/svm_impl_shtanko.py')

In [1]:
from checkers import text_classification_params_checker

In [2]:
# Create a checker object from the project-local
# `checkers.text_classification_params_checker` module.
pip2 = text_classification_params_checker.Checker()

In [3]:
# Run the checker on the JSON parameters file at the given path.
# NOTE(review): presumably the numeric value shown below is this call's
# result — confirm against the checkers package.
pip2.check(params_path='./checkers/txt_params_cls.json')

0.89395474501857475