# Comparing Classifiers
## Firstly
This part I'm just defining classes described and defined [here](https://github.com/jonchar/ml-python/blob/master/SVM.ipynb)

So probably just scroll down a bit

In [1]:
import numpy as np
from numpy import array,zeros,vstack,repeat,ones,eye,ndarray
from cvxopt import *
import pylab as pl
from sklearn import preprocessing as pp
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from timeit import default_timer as timer

In [2]:
class Linear():
    def __call__(self, a, b):
        x = np.array(a)
        y = np.array(b)
        y = np.transpose(y)
        return np.dot(x, y)

class Polynomial():
    def __call__(self, a, b, p=2):
        x = np.array(a)
        y = np.array(b)
        y = np.transpose(y)
        return (1 + np.dot(x, y)) ** p

class Gaussian():
    def __call__(self, a, b, sigma=5.0):
        x = np.array(a)
        y = np.array(b)
        y = np.transpose(y)
        return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma ** 2)))

In [3]:
class svm_problem():
    def __init__(self, C=0.1, gamma=0.001, delta=100.0, kernel=Gaussian()):
        self.C = C
        self.gamma = gamma
        self.delta = delta
        self.kernel = kernel

    def set_variables(self, X, Xstar, Y):
        if(isinstance(X, ndarray)):
            self.X = X
        else:
            self.X = array(X)
        if(isinstance(Xstar, ndarray)):
            self.Xstar = Xstar
        else:
            self.Xstar = array(Xstar)
        if(isinstance(Y, ndarray)):
            self.Y = Y
        else:
            self.Y = array(Y)
        self.num = len(self.X)
        self.dimensions = len(self.X[0])
        self.xi_xj = self.gram_matrix(self.X, self.X)
        self.xstari_xstarj = self.gram_matrix(self.Xstar, self.Xstar)
        self.yi_yj = self.gram_matrix(self.Y, self.Y)

    def gram_matrix(self, X1, X2):
        K = zeros((len(X1), len(X1)))
        for i in range(len(X1)):
            for j in range(len(X1)):
                K[i,j] = self.kernel(X1[i], X2[j])
        return K

In [4]:
class classifier():

    def __init__(self):
        self.w = 0
        self.b = 0
        self.alphas = []
        self.support_vectors = []

    def predict(self, x):
        return np.sign(np.dot(self.w,x)+self.b)
    
    def f_star(self, x, y): # This won't make sense now, but we come back to it later
        return y*(np.dot(self.w,x)+self.b)

In [5]:
class SVM():
    def get_name(self):
        return "SVM"
    def train(self, x, prob : svm_problem):
        x = x
        y = prob.Y
        C = prob.C

        NUM = x.shape[0]
        DIM = x.shape[1]

        K = y[:, None] * x # Yeah, this is a bit different so that it can work on x and x*
        K = np.dot(K, K.T)
        P = matrix(K, tc='d')
        q = matrix(-np.ones((NUM, 1)), tc='d')
        G1 = -np.eye(NUM)
        G2 = np.eye(NUM)
        G = np.vstack((G1, G2))
        G = matrix(G, tc='d')
        h1 = np.zeros(NUM).reshape(-1,1)
        h2 = np.repeat(C, NUM).reshape(-1,1)
        h = np.vstack((h1, h2))
        h = matrix(h, tc='d')
        A = matrix(y.reshape(1, -1), tc='d')
        b = matrix(np.zeros(1), tc='d')
        solvers.options['show_progress'] = False
        sol = solvers.qp(P, q, G, h, A, b)
        alphas = np.array(sol['x'])
        w = np.sum(alphas * y[:, None] * x, axis = 0)
        bacond1 = (alphas > 1e-8)
        bacond2 = (alphas < (C))
        bcond = np.array([a and b for a, b in zip(bacond1, bacond2)]).flatten()
        yS = y[bcond]
        xS = x[bcond]
        aS = alphas[bcond]
        sumTotal = 0
        for s in range(len(yS)):
            innerTotal = 0
            for m in range(len(yS)):
                am = aS[m]
                ym = yS[m]
                xm_xs = prob.kernel(xS[m], xS[s])
                innerTotal += am*ym*xm_xs
            sumTotal += yS[s] - innerTotal
        bias = sumTotal/len(yS)
        clf = classifier()
        clf.w = w
        clf.b = bias[0]
        clf.alphas = alphas
        clf.support_vectors = x[bacond1.flatten()]
        return clf

In [6]:
class SVMdp_simp():
    def get_name(self):
        return "SVM+ - simplified approach"
    def train(self, prob : svm_problem):
        x = prob.X
        xStar = prob.Xstar
        y = prob.Y
        C = prob.C

        NUM = x.shape[0]
        DIM = x.shape[1]
        
        svm = SVM()
        xStar_clf = svm.train(xStar, prob)
        
        xi_star_amended = np.zeros(prob.num)
        for i in range(prob.num):
            output = (1- prob.Y[i]*(np.dot(xStar_clf.w,prob.Xstar[i])+xStar_clf.b))
            xi_star_amended[i] = max(0, output)

        Ky = prob.yi_yj
        Kx = prob.xi_xj
        K = Ky*Kx
        P = matrix(K, tc='d')
        q = matrix(-np.ones((NUM, 1)), tc='d')
        G1 = -np.eye(NUM)
        G2 = np.eye(NUM)
        G3 = xi_star_amended.reshape(1,-1)
        G = np.vstack((G1, G2))
        G = np.vstack((G, G3))
        G = matrix(G, tc='d')
        h1 = np.zeros(NUM).reshape(-1,1)
        h2 = np.repeat(C, NUM).reshape(-1,1)
        h3 = sum(xi_star_amended)*C
        h = np.vstack((h1, h2))
        h = np.vstack((h, h3))
        h = matrix(h, tc='d')
        A = matrix(y.reshape(1, -1), tc='d')
        b = matrix(np.zeros(1), tc='d')
        solvers.options['show_progress'] = False
        sol = solvers.qp(P, q, G, h, A, b)
        alphas = np.array(sol['x'])
        w = np.sum(alphas * y[:, None] * x, axis = 0)

        bacond1 = (alphas > 1e-8)
        bacond2 = (alphas < C)
        bcond = np.array([a and b for a, b in zip(bacond1, bacond2)]).flatten()

        yS = y[bcond]
        xS = x[bcond]
        aS = alphas[bcond]

        sumTotal = 0
        for s in range(len(yS)):
            innerTotal = 0
            for m in range(len(yS)):
                am = aS[m]
                ym = yS[m]
                xm_xs = prob.kernel(xS[m], xS[s])
                innerTotal += am*ym*xm_xs
            sumTotal += yS[s] - innerTotal

        bias = sumTotal/len(yS)

        clf = classifier()
        clf.w = w
        clf.b = bias
        clf.alphas = alphas
        clf.support_vectors = prob.X[bacond1.flatten()]
        return clf

In [7]:
class SVMdp():
    def get_name(self):
        return "SVM+"
    def train(self, prob : svm_problem):
        kernel = prob.kernel
        C = prob.C

        L = prob.num
        M = prob.dimensions

        x = prob.X
        y = prob.Y

        gamma = prob.gamma
        delta = prob.delta

        H11 = (prob.xi_xj * prob.yi_yj) + gamma*(prob.xstari_xstarj * prob.yi_yj)
        H12 = -gamma*(prob.xstari_xstarj * prob.yi_yj)
        H22 = gamma*(prob.xstari_xstarj * prob.yi_yj)
        H1 = np.hstack((H11, H12))
        H2 = np.hstack((H12, H22))
        H = np.vstack((H1, H2))

        f = np.hstack((np.repeat(-1, L),np.zeros(L)))

        positiveEye = np.eye(L, dtype='d')
        negativeEye = -np.eye(L, dtype='d')
        zeros = np.zeros((L, L))
        g1 = np.hstack((zeros, negativeEye))
        g2 = np.hstack((zeros, positiveEye))
        g3 = np.hstack((negativeEye, zeros))
        g4 = np.hstack((positiveEye, negativeEye))

        G = np.vstack((g1,g2))
        G = np.vstack((G,g3))
        G = np.vstack((G,g4))

        h1 = np.zeros(((L),1))
        h2 = np.repeat(C, (L)).reshape(-1,1)
        h2 = np.vstack((h1, h2))
        h3 = np.vstack((h2, h1))
        h4 = np.repeat((delta*C), L).reshape(-1,1)
        h = np.vstack((h3, h4))

        Aeq1 = np.hstack((prob.Y, np.zeros(L)))
        Aeq2 = np.hstack((np.zeros(L), prob.Y))
        Aeq = np.vstack((Aeq1, Aeq2))

        beq = np.zeros(2)
        beq = beq.reshape(-1,1)

        P = matrix(H, tc='d')
        q = matrix(f, tc='d')
        G = matrix(G, tc='d')
        h = matrix(h, tc='d')
        A = matrix(Aeq, tc='d')
        b = matrix(beq, tc='d')

        solvers.options['show_progress'] = False
        sol = solvers.qp(P, q, G, h, A, b)
        alphasAndDeltas = np.array(sol['x'])
        alphas = alphasAndDeltas[:L]
        deltas = alphasAndDeltas[L:]
        w = np.sum(alphas * y[:, None] * x, axis = 0)
        bacond = (alphas > 1e-8)
        bdcond = (deltas < C)
        bxcond = (x != 0)

        bxcond2 = list(range(0, L))
        index = 0
        for dataPoint in bxcond:
            if np.any(dataPoint):
                bxcond2[index] = True
            else:
                bxcond2[index] = False
            index += 1

        bcond = np.array([a and b for a, b in zip(bacond, bdcond)]).flatten()
        bcond = np.array([a and b for a, b in zip(bcond, bxcond2)]).flatten()
        yK = y[bcond]
        xK = x[bcond]
        
        b = []
        for k in range(len(xK)):
            b.append(1-yK[k]*np.dot(w, xK[k]))
        bias = (1- (sum(b) / len(b)))
        
        clf = classifier()
        clf.w = w
        clf.b = bias
        clf.alphas = alphas
        clf.support_vectors = x[bacond.flatten()]
        return clf

## The other side!
Phew, right, now we've done that. Let's see what we want to do. The plan is a follows.
### The plan
1) import some data, probably a small set so that we can look at it
    
    1.1) [This dataset](https://archive.ics.uci.edu/ml/datasets/Balloons) looks like it should do the job
    
2) Create 10 permutations of the data
    
3) Split each permutation into training and test data

4) Split the data into $X$, $X^*$ and $Y$.

5) Transform it so that we're dealing with numbers instead of text.

6) See how well each classifier performs.

## Importing Data
So this is a tiny dataset. The plan is try small and work our way bigger. First we'll oad in the dataset. It's all strings, so let's just get it in, then we can start working on it.

In [8]:
def get_array_from_file(file):
    ''' Return the data from a file ignoring the column headers '''
    array = np.loadtxt(file, delimiter=',', dtype=bytes).astype(str)
    return array[1:,:]

In [9]:
#data = get_array_from_file("adult+stretch.csv")

In [10]:
#data = np.array(data.tolist())

Let's look at what we have

In [11]:
#data

This data follows a simple rule. If the balloon is owened by an adult and is "stretched", then it will be inflated (T), else it will not be inflated (F). 

Let's get some permuattions of this data.

In [12]:
def rotate (X):
    np.random.shuffle(X)
    np.random.shuffle(X)
    np.random.shuffle(X)
    np.random.shuffle(X)
    np.random.shuffle(X)
    return X

In [13]:
def getNcopies(A, X):
    nCopies = []
    for i in range(A):
        nCopies.append(np.copy(X))
    return nCopies

In [14]:
def shuffleCopies(copyList):
    for copy in copyList:
        rotate(copy)
    return copyList

In [15]:
#copies = getNcopies(5, data)
#shuffledCopies = shuffleCopies(copies)

In [16]:
def split_train_test(X):
    testSize = len(X)//1.001
    trainSize = len(X) - testSize
    return(X[:trainSize,:], X[trainSize:,:])

In [17]:
def split_features_target(X, targetCol):
    target = X[:,[targetCol]]
    features = np.delete(X, targetCol, 1)
    return (features, target)

In [18]:
#train, test = split_train_test(data)
#train_features, train_target = split_features_target(train, 4)

In [19]:
def target_labels_to_classes(target):
    lb = pp.LabelBinarizer(neg_label=-1)
    y = lb.fit_transform(target).flatten()
    return y

In [20]:
#t = target_labels_to_classes(train_target)
#t

So let's start off by splitting this into features and a target, the converting the tearget to -1 and 1 for our binary classification.

In [21]:
#lb = pp.LabelBinarizer(neg_label=-1)

In [22]:
#features = data[:,:-1]
#target = data[:,-1:]

In [23]:
#target

In [24]:
#lb.fit(target)

In [25]:
#y = lb.transform(target).flatten()

In [26]:
#features

Next up, we should decide what's privileged information ($X^*$). I'm going to suggest the stretch / dip attribute as this is important for classification, without it you can't have a reliable classifier.

In [27]:
#xStar = features[:,[2]]

In [28]:
#x = np.delete(features, 2, 1)

Then we're going to encode the data so that it's of a one-of-K form.

In [29]:
#enc = pp.LabelEncoder()
#ohe = OneHotEncoder(sparse=False)
#colour = enc.fit_transform(x[:,[0]].flatten()).reshape(-1,1)
#colour = ohe.fit_transform(colour)
#size = enc.fit_transform(x[:,[1]].flatten()).reshape(-1,1)
#size = ohe.fit_transform(size)
#age = enc.fit_transform(x[:,[2]].flatten()).reshape(-1,1)
#age = ohe.fit_transform(age)
#x = np.hstack((colour, size))
#x = np.hstack((x, age))

In [30]:
#xStar = enc.fit_transform(xStar.flatten()).reshape(-1,1)
#xStar = ohe.fit_transform(xStar)

In [31]:
#prob = svm_problem()
#prob.set_variables(x, xStar, y)

Ok, let's just double check that the classifiers work and that there aren't any bugs...

In [32]:
##svm = SVM()
#svm_clf = svm.train(x, prob)
#svmdp_simp = SVMdp_simp()
#svmdpsimp_clf = svmdp_simp.train(prob)
#svmdp = SVMdp()
#svmdp_clf = svmdp.train(prob)
#svm_clf.predict([0,0,0,0,0,0]), svmdpsimp_clf.predict([0,0,0,0,0,0]), svmdp_clf.predict([0,0,0,0,0,0])

That's cool. Now we know what we're going to do, we need to go about splitting the data into training and test data for various permutations. So we'll start from the begining set and create the folds, then preprocess, then test.

In [33]:
def constructProblem(data, xIndices, xStarIndices, yIndex):
    x = data[:,[xIndices]].astype(np.float)
    x = x[:,0]
    xStar = data[:,[xStarIndices]].astype(np.float)
    xStar = xStar[:,0]
    y = data[:,[yIndex]].astype(np.float).flatten()
    prob = svm_problem()
    prob.set_variables(x, xStar, y)
    return prob

In [34]:
def all_copies_relabel_target(original_data, targetCol, copies):
    lb = pp.LabelBinarizer(neg_label=-1)
    features, target = split_features_target(original_data, targetCol)
    lb.fit(target)
    amended_copies = []
    for copy in copies:
        features, target = split_features_target(copy, targetCol)
        target = lb.transform(target)
        amended_copies.append(np.hstack((features, target)))
    return amended_copies

In [35]:
def all_copies_relabel_feature(original_data, featCol, copys):
    copies = np.copy(copys)
    enc = pp.LabelEncoder()
    feature_to_learn = original_data[:,[featCol]].flatten()
    enc.fit(feature_to_learn)
    amended_copies = []
    for copy in copies:
        feature = copy[:,[featCol]].flatten()
        feature = enc.transform(feature).reshape(-1,1)
        copy[:,[featCol]] = feature
        amended_copies.append(copy)
    return amended_copies

In [36]:
#h = all_copies_relabel_target(data, 4, shuffledCopies)
#h = all_copies_relabel_feature(data, 0, h)
#h = all_copies_relabel_feature(data, 1, h)
#h = all_copies_relabel_feature(data, 2, h)
#h = all_copies_relabel_feature(data, 3, h)

In [37]:
def get_x_xs_y(data, xIndices, xStarIndices, yIndex):
    x = data[:,[xIndices]].astype(np.float)
    x = x[:,0]
    xStar = data[:,[xStarIndices]].astype(np.float)
    xStar = xStar[:,0]
    y = data[:,[yIndex]].astype(np.float).flatten()
    return x, xStar, y

In [38]:
def train_and_test(data, xIndices, xStarIndices, yIndex, model):
    train, test = split_train_test(data)
    train_prob = constructProblem(train, xIndices, xStarIndices, yIndex)
    svm = model
    
    if isinstance(svm, SVM):
        clf = svm.train(train_prob.X, train_prob)
    else:
        clf = svm.train(train_prob)
    test_x, test_xs, test_y = get_x_xs_y(test, xIndices, xStarIndices, yIndex)
    print("weight: ", clf.w)
    print("bias: ", clf.b)
    predictions = []
    for test_point in test_x:
        predictions.append(clf.predict(test_point))
        
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    for i in range(len(test_y)):
        if test_y[i] == 1 and predictions[i] == 1:
            tp += 1
        if test_y[i] == -1 and predictions[i] == 1:
            fp += 1
        if test_y[i] == 1 and predictions[i] == -1:
            fn += 1
        if test_y[i] == -1 and predictions[i] == -1:
            tn += 1
    return (tp, fp, fn, tn)

In [39]:
def get_accuracy(tp, fp, fn, tn):
    return (tp+tn)/(tp+fp+fn+tn+0.000001)

In [40]:
def get_error(tp, fp, fn, tn):
    return (fp+fn)/(tp+fp+fn+tn+0.000001)

In [41]:
def get_recall(tp, fp, fn, tn):
    return (tp)/(tp+fn+0.000001)

In [42]:
def get_specificity(tp, fp, fn, tn):
    return (tn)/(fp+tn+0.000001)

In [43]:
def get_precision(tp, fp, fn, tn):
    return (tp)/(tp+fp+0.000001)

In [44]:
def get_prevalence(tp, fp, fn, tn):
    return (tp+fn)/(tp+fp+fn+tn+0.000001)

In [45]:
def compare_classifiers(shuffled):
    clfs = [SVM(), SVMdp_simp(), SVMdp()]
    x_in = [1,3,4,5,6,7,11,12]
    xs_in = [0,8,9]
    y_in = [14]

    
    for clf in clfs:
        tp = 0
        fp = 0
        fn = 0
        tn = 0
        average_time = 0
        for permutation in shuffled:
            start = timer()
            a, b, c, d = train_and_test(permutation, x_in, xs_in, y_in, clf)
            average_time += timer() - start
            tp += a
            fp += b
            fn += c
            tn += d
        average_time = average_time/len(shuffled)
        accuracy = get_accuracy(tp, fp, fn, tn)
        error = get_error(tp, fp, fn, tn)
        recall = get_recall(tp, fp, fn, tn)
        specificity = get_specificity(tp, fp, fn, tn)
        precision = get_precision(tp, fp, fn, tn)
        prevalence = get_prevalence(tp, fp, fn, tn)
        print(clf.get_name())
        print("=====================================")
        print("|          |  Pred: YES |  Pred: NO |")
        print("+----------+------------+-----------+")
        print("| Act: YES |  ", '{:7d}'.format(tp), " | ", '{:7d}'.format(fn), " |")
        print("+----------+------------+-----------+")
        print("| Act: NO  |  ", '{:7d}'.format(fp), " | ", '{:7d}'.format(tn), " |")
        print("+----------+------------+-----------+")
        
        print("accuracy = ", accuracy)
        print("error = ", error)
        print("recall = ", recall)
        print("specificity = ", specificity)
        print("precision = ", precision)
        print("prevalence = ", prevalence)
        print("average time to train classifier = ", average_time, "\n")

In [46]:
#compare_classifiers(h)

# Gawd I need to tidy the above up
### But making progress, so just going to power on and come back to this later

In [47]:
data = get_array_from_file("adult.csv")
copies = getNcopies(5, data)
shuffledCopies = shuffleCopies(copies)
#shuffledCopies = []
#shuffledCopies.append(data)

In [48]:
#
# This doesn't work - come back later
#
def all_copies_ohe_feature(copys, featCol):
    
    copies = np.copy(copys)
    ohe = OneHotEncoder(sparse=False)
    orig = copies[0]
    print(orig[0])
    feature_to_learn = orig[:,[featCol]]
    print(feature_to_learn)
    ohe.fit(feature_to_learn)
    
    amended_copies = []
    for copy in copies:
        feature = ohe.transform(copy[:,[featCol]])
        amended_copies.append(feature)
    return amended_copies

In [49]:
h = all_copies_relabel_target(data, 14, shuffledCopies)
h = all_copies_relabel_feature(data, 1, h)
h = all_copies_relabel_feature(data, 2, h)
h = all_copies_relabel_feature(data, 3, h)
h = all_copies_relabel_feature(data, 4, h)
h = all_copies_relabel_feature(data, 5, h)
h = all_copies_relabel_feature(data, 6, h)
h = all_copies_relabel_feature(data, 7, h)
h = all_copies_relabel_feature(data, 8, h)
h = all_copies_relabel_feature(data, 9, h)
h = all_copies_relabel_feature(data, 10, h)
h = all_copies_relabel_feature(data, 11, h)
h = all_copies_relabel_feature(data, 12, h)
h = all_copies_relabel_feature(data, 13, h)

In [50]:
compare_classifiers(h)



weight:  [  6.76459820e-02   9.42379482e-03  -5.01820701e-03  -4.51083398e-02
   1.04839348e-02  -3.73629560e-02  -4.23699444e-07  -1.13586778e-03]
bias:  -0.522030412355
weight:  [-0.29701669 -0.0017198  -0.13251156 -0.07410763  0.08329104 -0.51116178
 -0.0519024  -0.02449697]
bias:  -0.591666675459
weight:  [ 0.23521469  0.02694045 -0.05022306  0.08455754  0.00975473 -0.16044242
  0.03862242 -0.00975472]
bias:  -0.14290308183
weight:  [ 0.00231356  0.0517854   0.00297095 -0.01930693  0.04945561 -0.11506992
  0.0419286  -0.00389398]
bias:  -0.602777068958
weight:  [ 0.08226012 -0.02948615 -0.0458581   0.02367448  0.01601924 -0.34065645
  0.05865348  0.16189332]
bias:  0.123155359185
SVM
|          |  Pred: YES |  Pred: NO |
+----------+------------+-----------+
| Act: YES |     19607  |    19558  |
+----------+------------+-----------+
| Act: NO  |     52428  |    71042  |
+----------+------------+-----------+
accuracy =  0.5573769483779175
error =  0.4426230516159337
recall =  0.5006