# Classification Models

In [None]:
## Table of contents:
* [Maximum Likelihood logistic regression](#maxlike)
* [Bayesian logistic regression](#bayesian)
* [Non-linear logistic regression](#non-linear)
* [Dual logistic regression](#dual)
* [Relevance vector classification](#relevant-vector)
* [Incremental fitting and boosting](#boosting)
* [Classification trees](#trees)
* [Random trees, forests, and ferns](#forest)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as spst
%matplotlib inline

# The usual imports: NumPy, Matplotlib, and SciPy's stats module.

In [2]:
class DataSet:
    """Generator of a synthetic two-class, two-dimensional data set."""

    def __init__(self):
        pass

    def _draw_band(self, count, far_shift, border_shift):
        """
        Draw one class as an un-rotated (2, count) array.

        99% of the points (rounded down) get an x coordinate uniform over a
        width-8 interval centred on ``far_shift``; the remainder get a
        Student-t (5 degrees of freedom) x coordinate centred on
        ``border_shift``. Every point gets a y coordinate uniform in
        [-5, 5).

        The draws happen in a fixed order (uniform x, Student-t x, uniform y)
        so that results are reproducible under a fixed NumPy seed.
        """
        n_far = int(0.99 * count)
        n_border = count - n_far

        x_far = 8 * (np.random.rand(n_far) - .5) + far_shift
        x_border = np.random.standard_t(5, size=n_border) + border_shift
        y_all = 10 * (np.random.rand(count) - .5)

        return np.vstack((np.hstack((x_far, x_border)), y_all))

    def generate_data_set_dim2_nonlinear(self, ns):
        """
        Generate, standardise and scatter-plot a 2-D two-class data set.

        Two bands of points are sampled on either side of x = 0 (with a few
        heavy-tailed points near the boundary), rotated together by a random
        angle in [0, pi), labelled 0 / 1, shuffled, and finally standardised
        with `scale_coordinates`.

        Side effect: draws both classes on the current matplotlib axes
        (red = group 0, blue = group 1) with limits [-3, 3] on both axes.

        Arguments:
            ns: total number of samples; int(0.5 * ns) of them are class 0.

        Output:
            ds    : dict with keys '1' and '2' holding the scaled,
                    un-shuffled coordinates of class 0 and class 1,
                    each of shape (2, n_class).
            datas : scaled, shuffled coordinates of all samples. (2, ns)
            label : 0/1 labels in the same shuffled order. (1, ns)
        """
        ns_neg = int(0.5 * ns)
        ns_pos = ns - ns_neg

        # Negative class first, then positive: keeps the random stream order.
        neg = self._draw_band(ns_neg, -5, -0.5)
        pos = self._draw_band(ns_pos, 5, 0.5)

        theta = np.pi * np.random.rand()
        cos_t, sin_t = np.cos(theta), np.sin(theta)
        rotation = np.array([[cos_t, sin_t], [-sin_t, cos_t]])
        neg = rotation.dot(neg)
        pos = rotation.dot(pos)

        # Append the label as a third row, then shuffle the samples (columns)
        # by shuffling the transposed view in place.
        stacked = np.hstack((
            np.vstack((neg, np.zeros(ns_neg))),
            np.vstack((pos, np.ones(ns_pos))),
        ))
        np.random.shuffle(stacked.T)

        label = stacked[2:3]
        datas, mu, sg = self.scale_coordinates(stacked[:2])

        # Per-class copies use the same centre/scale as the shuffled data.
        neg_scaled = (neg - mu) / sg
        pos_scaled = (pos - mu) / sg

        red = plt.scatter(neg_scaled[0], neg_scaled[1], c=(1., 0., 0.), s=3)
        blu = plt.scatter(pos_scaled[0], pos_scaled[1], c=(0., 0., 1.), s=3)
        plt.xlim((-3, 3))
        plt.ylim((-3, 3))
        plt.legend([red, blu], ["group 0", "group 1"])

        return {'1': neg_scaled, '2': pos_scaled}, datas, label

    def scale_coordinates(self, x):
        """
        Standardise an array with one global mean and one global std.

        The statistics are taken over all elements of ``x`` (not per row),
        so both coordinates are shifted and scaled by the same amount.

        Output:
            xs : (x - mu) / sg, same shape as x.
            mu : global mean, kept as an array with x's number of dimensions.
            sg : global standard deviation (scalar).
        """
        center = x.mean(keepdims=True)
        spread = x.std()
        return (x - center) / spread, center, spread


In [None]:
class KernelDualLogisticRegression:
    """
    Kernelised (dual) logistic regression with a Laplace-style posterior.

    The activation of sample i is a_i = sum_j psi_j * Kxx[j, i], where Kxx is
    the kernel matrix of the training data and psi holds one dual parameter
    per training sample.

    Attributes set in __init__:
        x    : (D, N) training inputs.
        w    : (1, N) 0/1 labels.
        psi  : (N, 1) dual parameters.
        sig  : (N, 1) sigmoid outputs for the current psi.
        cost : current value of the logistic cost.
        mu   : (N, 1) posterior mean (set by `optimization`).
        sgm  : (N, N) posterior covariance (set by `optimization`).
        lmd  : value passed to `optimization`.
        sgps : prior variance of psi used by `newton_method_update`.

    Attributes set later:
        Kxx   : (N, N) training kernel matrix, set by `optimization`.
        costs : cost history of the last `optimization` run.
        hess  : prior-regularised Hessian of the last Newton step.

    NOTE(review): `kernel_rbf`, `momentum`, `ddL_dtheta_dtheta` and
    `laplace_approximated_inference` are called by this class but are not
    defined in this cell; they must exist before `optimization`,
    `newton_method_update` or `plot_decision_boundary_dim2` can run.
    """

    def __init__(self, x, w):
        """
        Store the training data and initialise the model state.

        Arguments:
            x : (D, N) inputs, one sample per column.
            w : (1, N) labels.
        """
        # Check the sample counts before touching any state.
        assert x.shape[1] == w.shape[1]
        self.D = x.shape[0]
        self.N = x.shape[1]
        self.x = x
        self.w = w
        self.psi = np.zeros((self.N, 1))
        self.sig = np.zeros((self.N, 1))
        self.cost = 0
        self.mu = np.zeros((self.N, 1))
        self.sgm = np.zeros((self.N, self.N))
        self.lmd = 0
        self.sgps = 100

    def cost_function(self):
        """
        Compute the logistic (cross-entropy) cost for the current `self.sig`.

        Uses:
            self.w   : (1, N)
            self.sig : (N, 1)

        Output:
            L : Python float. cost function.
        """
        # A sigmoid that has saturated to exactly 0 or 1 would give log(0)
        # and then 0 * -inf = nan; nudge such values to the nearest
        # representable number strictly inside (0, 1).
        sig = np.clip(self.sig, np.finfo(float).tiny, np.nextafter(1.0, 0.0))
        L = -np.dot(self.w, np.log(sig))
        L += -np.dot(1 - self.w, np.log(1 - sig))
        # The dot products yield a (1, 1) array; hand back a true scalar so
        # it can be stored in a 1-D cost history.
        return L.item()

    def sigmoid(self):
        """
        Evaluate the sigmoid of the activations psi^T Kxx.

        Uses:
            self.psi : (N, 1)
            self.Kxx : (N, N)

        Intermediates:
            arg : (1, N)

        Output:
            sig : (N, 1)
        """
        arg = np.dot(self.psi.T, self.Kxx)
        sig = 1 / (1 + np.exp(-arg))
        return sig.T

    def set_psi_sig_cost(self, psi):
        """
        Set psi and refresh the cached sigmoid and cost.

        Arguments:
            psi : (N, 1)
        """
        assert psi.shape[0] == self.N
        assert psi.shape[1] == 1
        self.psi = psi
        self.sig = self.sigmoid()
        self.cost = self.cost_function()

    def dL_dtheta(self):
        """
        Gradient of the log-likelihood (i.e. minus the gradient of
        `cost_function`) with respect to psi.

        Uses:
            self.sig : (N, 1)
            self.w   : (1, N)
            self.Kxx : (N, N)

        Output:
            dLdt : (N, 1), equal to -Kxx (sig - w^T)
        """
        residual = self.sig - self.w.T
        return -np.dot(self.Kxx, residual)

    def newton_method_update(self, psi, alpha):
        """
        Take one damped Newton step on the log-posterior of psi.

        The zero-mean Gaussian prior with variance `self.sgps` adds
        -psi / sgps to the gradient and -I / sgps to the Hessian.

        Arguments:
            psi   : (N, 1) current parameters (not modified).
            alpha : scalar step size.

        Intermediate:
            dLdt : (N, 1)
            hess : (N, N)

        Output:
            psi : (N, 1) updated parameters, as a new array.
        """
        dLdt = self.dL_dtheta() - psi / self.sgps
        # The prior only affects the diagonal; subtracting the bare scalar
        # would shift every entry of the Hessian.
        hess = self.ddL_dtheta_dtheta() - np.eye(self.N) / self.sgps
        self.hess = hess
        # Solve the linear system instead of forming the explicit inverse.
        update = alpha * np.linalg.solve(hess, dLdt)
        return psi - update

    def optimization(self, psi, lmd, sgps, beta1=0.9, alpha=0.01, nstep=1000):
        """
        Fit psi with `nstep - 1` momentum updates and build the posterior.

        Arguments:
            psi   : (N, 1) initial parameters.
            lmd   : stored on self.lmd. NOTE(review): presumably the RBF
                    kernel parameter read by kernel_rbf - confirm.
            sgps  : prior variance of psi, stored on self.sgps.
            beta1 : momentum coefficient passed to `momentum`.
            alpha : learning rate passed to `momentum`.
            nstep : length of the cost history; the initial state counts
                    as step 0.

        Side effects:
            sets self.Kxx, self.costs, self.mu and self.sgm, and prints the
            number of steps.

        Output:
            psi : (N, 1) fitted parameters (self.psi).
        """
        self.lmd = lmd
        self.sgps = sgps
        self.Kxx = self.kernel_rbf(self.x, self.x)

        costs = np.zeros(nstep)
        self.set_psi_sig_cost(psi)
        costs[0] = self.cost
        vsum = np.zeros(psi.shape)

        # Momentum is the active update rule; `newton_method_update` is a
        # drop-in alternative taking (self.psi, alpha).
        for step in range(1, nstep):
            psi, vsum = self.momentum(self.psi, alpha, beta1, vsum)
            self.set_psi_sig_cost(psi)
            costs[step] = self.cost

        # Keep the history so convergence can be inspected afterwards.
        self.costs = costs
        strng = '{:d} steps done:'.format(nstep)
        print(strng)

        # Gaussian posterior approximation around the fitted psi.
        # NOTE(review): no prior term is added to the Hessian here, unlike
        # in newton_method_update - confirm whether ddL_dtheta_dtheta
        # already includes it.
        self.mu = self.psi
        self.sgm = -np.linalg.inv(self.ddL_dtheta_dtheta())
        return self.psi

    def plot_decision_boundary_dim2(self):
        """
        Draw the predicted class probability over the square [-3, 3]^2.

        A 201 x 201 grid is pushed through the kernel against the training
        data; the activation mean and variance at each grid point are passed
        to `laplace_approximated_inference`, and the result is shown with
        `plt.imshow`.

        Uses:
            self.x   : (D, N) with D = 2
            self.mu  : (N, 1)
            self.sgm : (N, N)

        Intermediate:
            xys  : (2, n*n) grid coordinates
            Kxxs : kernel between training data and grid
            mua  : (1, n*n) activation mean
            sgma : (1, n*n) activation variance
        """
        n = 201
        nn = n**2
        x = np.linspace(-3, 3, n)
        y = np.linspace(-3, 3, n)
        xg, yg = np.meshgrid(x, y)
        xss = xg.reshape(1, nn)
        yss = yg.reshape(1, nn)
        xys = np.vstack((xss, yss))
        Kxxs = self.kernel_rbf(self.x, xys)
        mua = np.dot(self.mu.T, Kxxs)
        # Column-wise quadratic form k^T sgm k for every grid point.
        # NOTE(review): the covariance is scaled by 1e-15 here, presumably
        # to suppress the variance contribution - confirm intent.
        sgma = (np.dot(self.sgm * 1e-15, Kxxs) * Kxxs).sum(axis=0, keepdims=True)
        inf = self.laplace_approximated_inference(mua.squeeze(), sgma.squeeze())
        inf = inf.reshape(n, n)
        # imshow puts row 0 at the top; flip so y increases upwards.
        inf = np.flipud(inf)
        plt.imshow(inf, extent=[-3, 3, -3, 3])

In [12]:
# Build a small n x n integer grid on [0, 4] and flatten each coordinate
# array into a single row of n*n values.
n = 5
nn = n * n
x, y, z = (np.linspace(0, 4, n) for _ in range(3))
xg, yg = np.meshgrid(x, y)
xs, ys = (grid.reshape(1, nn) for grid in (xg, yg))

In [13]:
xs

array([[ 0.,  1.,  2.,  3.,  4.,  0.,  1.,  2.,  3.,  4.,  0.,  1.,  2.,
         3.,  4.,  0.,  1.,  2.,  3.,  4.,  0.,  1.,  2.,  3.,  4.]])

In [11]:
# Rebuild the mesh from the existing x / y axes and flatten each coordinate
# array into one row of nn values.
xg, yg = np.meshgrid(x, y)
xss, yss = xg.reshape(1, nn), yg.reshape(1, nn)