# Non-Linear SVM

Classification of linearly inseparable data. We'll use the Iris dataset to build and train a non-linear SVM classifier that detects whether a datapoint belongs to one particular Iris variety (class 0 of the dataset) or to any of the others.

In [8]:
import numpy as np
from sklearn import datasets

## Tensorflow initialization

Let's import tensorflow and clear the default computational graph

In [9]:
import tensorflow as tf
from tensorflow.python.framework import ops
# Clear the default computational graph before any ops are defined below.
ops.reset_default_graph()

## Session declaration

In [10]:
# Open a TensorFlow session and keep the handle in `session`.
session = tf.Session()

## Load dataset

In [18]:
# Load scikit-learn's Iris dataset; `dataset.data` holds the feature rows
# and `dataset.target` the class labels used further down.
dataset = datasets.load_iris()

In [20]:
# Peek at the first ten feature rows
dataset.data[:10]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1]])

In [22]:
# A single sample: one row of four feature values
dataset.data[0]

array([ 5.1,  3.5,  1.4,  0.2])

In [16]:
# Keep only feature columns 0 and 3 of every sample (sepal length and petal
# width in scikit-learn's Iris column order), giving an (n_samples, 2) array.
X = np.column_stack((dataset.data[:, 0], dataset.data[:, 3]))

In [25]:
# First five rows of the two-column feature matrix
X[:5]

array([[ 5.1,  0.2],
       [ 4.9,  0.2],
       [ 4.7,  0.2],
       [ 4.6,  0.2],
       [ 5. ,  0.2]])

In [26]:
# One-vs-rest labels: +1 for class 0 (setosa in scikit-learn's encoding),
# -1 for every other class.
# The negative class must be -1, not 0: the SVM objective multiplies labels
# pairwise (y_i * y_j), so a 0 label would erase those samples from it, and the
# prediction step uses tf.sign, whose outputs are -1/+1 and are compared
# directly against these labels when computing accuracy.
y = np.where(dataset.target == 0, 1, -1)

In [27]:
# First five labels
y[:5]

array([1, 1, 1, 1, 1])

## Set up model parameters, placeholder grids

In [31]:
# Number of samples per batch; it also fixes the width of the `b` variable.
# NOTE(review): scikit-learn's Iris set has 150 rows, fewer than 200 - confirm
# that the training loop draws its batches with replacement.
batch_size = 200

# Placeholders fed at run time: two feature columns per sample and one label
# column per sample; the leading (row) dimension is left open.
X_grid = tf.placeholder(tf.float32, shape=[None, 2])
y_grid = tf.placeholder(tf.float32, shape=[None, 1])

In [32]:
# Placeholder for the points to classify (two feature columns each)
prediction_grid = tf.placeholder(tf.float32, shape=[None, 2])

In [33]:
# Trainable SVM coefficient vector `b`: one randomly initialised entry per
# sample in a batch (shape [1, batch_size])
b = tf.Variable(tf.random_normal([1, batch_size]))

## Construct the RBF kernel

https://en.wikipedia.org/wiki/Radial_basis_function_kernel

### Gamma

Gamma is a constant in the RBF kernel that effectively determines the range of influence of a single subsample (i.e. its radius).

* Smaller values of gamma increase that relative influence, producing a wider kernel
* Larger values decrease the influence of a subsample, producing "tighter" looking decision boundaries

In [37]:
# TensorFlow representation of the RBF kernel
#     K(x_i, x_j) = exp(-g * ||x_i - x_j||^2)
# `gamma` below already carries the minus sign, so it is multiplied straight
# into the exponent.
gamma = tf.constant(-45.0)

# Pairwise squared Euclidean distances between all rows of X_grid, expanded as
#     ||x_i||^2 - 2 * <x_i, x_j> + ||x_j||^2
# (the same expansion the prediction kernel uses). The dot-product term alone
# is not a distance: for non-negative features it grows with the magnitude of
# the points and drives the exponential to zero.
sq_norms = tf.reshape(tf.reduce_sum(tf.square(X_grid), 1), [-1, 1])
sq_vec = tf.add(
    tf.subtract(sq_norms, tf.multiply(2., tf.matmul(X_grid, tf.transpose(X_grid)))),
    tf.transpose(sq_norms))

# abs() guards against slightly negative distances caused by float round-off
rbf_kernel = tf.exp(tf.multiply(gamma, tf.abs(sq_vec)))

## Computational step

Training the non-linear SVM means maximizing its objective function; in practice we do this by minimizing the negative of that objective:

In [38]:
# First term of the objective: the plain sum of the coefficients in `b`
first = tf.reduce_sum(b)

# Outer products b^T b and y y^T hold every pairwise product b_i*b_j, y_i*y_j
b_cross = tf.matmul(b, b, transpose_a=True)
y_grid_cross = tf.matmul(y_grid, y_grid, transpose_b=True)

# Second term: kernel-weighted sum over all pairs of K_ij * b_i*b_j * y_i*y_j
# NOTE(review): the textbook SVM dual puts a factor 1/2 on this term; it is
# not applied here.
second = tf.reduce_sum(rbf_kernel * (b_cross * y_grid_cross))

In [39]:
# The quantity (first - second) is meant to be maximized; the loss is its
# negation, so minimizing the loss maximizes that quantity.
loss = tf.negative(first - second)

## Build and apply a prediction kernel

In [42]:
# Squared row norms as column vectors: rA for the points fed through X_grid,
# rB for the points fed through prediction_grid
rA = tf.reshape(tf.reduce_sum(tf.square(X_grid), axis=1), shape=[-1, 1])
rB = tf.reshape(tf.reduce_sum(tf.square(prediction_grid), axis=1), shape=[-1, 1])

In [44]:
# Squared distance between every X_grid row i and prediction_grid row j:
#     ||a_i||^2 - 2 * <a_i, p_j> + ||p_j||^2
pred_sq_dist = rA - 2. * tf.matmul(X_grid, prediction_grid, transpose_b=True) + tf.transpose(rB)

# Same RBF form as the training kernel; `gamma` already holds the minus sign
pred_kernel = tf.exp(gamma * tf.abs(pred_sq_dist))

In [45]:
# Raw decision values: label-weighted coefficients (y_i * b_i) multiplied with
# the prediction kernel, one value per prediction point
pred_output = tf.matmul(tf.transpose(y_grid) * b, pred_kernel)

# Centre the decision values on their mean and keep only the sign
prediction = tf.sign(tf.subtract(pred_output, tf.reduce_mean(pred_output)))

# Fraction of predictions equal to the fed labels.
# NOTE: tf.sign yields -1/0/+1, so this is only meaningful when y_grid is fed
# -1/+1 labels; it also presumes prediction_grid is fed the same points, in the
# same order, as X_grid - confirm against the training loop.
accuracy = tf.reduce_mean(
    tf.cast(
        tf.equal(tf.squeeze(prediction), tf.squeeze(y_grid)),
        tf.float32))