In [157]:
"""Input and output helpers to load in data.
"""
import numpy as np

def read_dataset(path_to_dataset_folder,index_filename):
    """ Read dataset into numpy arrays with preprocessing included

    The index file holds one sample per line in the form ``<label> <path>``,
    where ``<path>`` is relative to the dataset folder. Each sample file
    holds the feature values of that sample separated by whitespace.
    Fields may be separated by any run of whitespace, and blank lines in the
    index file are skipped.

    Args:
        path_to_dataset_folder(str): path to the folder containing samples and indexing.txt
        index_filename(str): indexing.txt
    Returns:
        A(numpy.ndarray): sample feature matrix A = [[1, x1],
                                                     [1, x2],
                                                     [1, x3],
                                                     .......]
                                where xi is the feature vector of each sample
                                (16-dimensional for the course dataset)

        T(numpy.ndarray): class label vector T = [y1, y2, y3, ...]
                             where yi is +1/-1, the label of each sample
    Raises:
        ValueError: if a label or feature value cannot be parsed, or a
            sample file contains no values.
        IndexError: if a non-blank index line has no sample path.
    """
    labels = []
    sample_paths = []
    with open(path_to_dataset_folder+'/'+index_filename, 'r') as index_file:
        for line in index_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            labels.append(int(fields[0]))
            sample_paths.append(fields[1])

    rows = []
    for sample_path in sample_paths:
        with open(path_to_dataset_folder+'/'+sample_path, 'r') as sample_file:
            values = sample_file.read().split()
        if not values:
            raise ValueError('no feature values found in ' + sample_path)
        # The leading 1 lets the first weight of a linear model act as the bias.
        rows.append([1.] + [float(value) for value in values])

    A = np.array(rows)
    T = np.array(labels)
    return A, T

In [158]:
# Load the training set: X is the feature matrix (leading column of ones),
# Y_true the label vector.
# NOTE(review): the absolute, machine-specific path makes this cell
# non-portable — consider a configurable data directory.
X, Y_true = read_dataset('C:/Users/PIxel/CS446/mp3/data/trainset','indexing.txt')

In [159]:
# Sanity check: number of samples, entries per row (bias column + features),
# and number of labels.
print(len(X), len(X[0]), (len(Y_true)))

1290 17 1290


In [160]:
"""logistic model class for binary classification."""

import numpy as np

class LogisticModel(object):
    """Binary logistic-regression model trained with batch gradient descent.

    Labels are expected to be +1/-1, and every sample row is expected to
    start with a constant 1 so that ``self.W[0]`` acts as the bias.

    Attributes:
        ndims(int): number of feature dimensions (bias excluded).
        W_init(str): name of the requested initialization scheme.
        W(numpy.ndarray or None): weight vector of length ndims+1, or None
            when an unknown ``W_init`` was requested.
        X(numpy.ndarray or None): input of the most recent ``forward`` call.
    """

    def __init__(self, ndims, W_init='zeros'):
        """Initialize a logistic model.

        This function prepares an initialized logistic model.
        It will initialize the weight vector, self.W, based on the method
        specified in W_init.

        We assume that the FIRST index of W is the bias term,
            self.W = [Bias, W1, W2, W3, ...]
            where Wi corresponds to each feature dimension

        W_init supports:
          'zeros': initialize self.W with all zeros.
          'ones': initialize self.W with all ones.
          'uniform': initialize self.W with uniform random numbers in [0,1)
          'gaussian': initialize self.W from a gaussian distribution with
                      mean 0 and standard deviation 0.1

        Any other value prints a message and leaves self.W as None.

        Args:
            ndims(int): feature dimension
            W_init(str): types of initialization.
        """
        self.ndims = ndims
        self.W_init = W_init
        self.W = None
        if W_init == 'zeros':
            self.W = np.zeros(self.ndims+1)
        elif W_init == 'ones':
            self.W = np.ones(self.ndims+1)
        elif W_init == 'uniform':
            self.W = np.random.uniform(0,1,self.ndims+1)
        elif W_init == 'gaussian':
            self.W = np.random.normal(0,0.1, self.ndims+1)
        else:
            print ('Unknown W_init ', W_init)
        self.X = None

    @staticmethod
    def _sigmoid(z):
        """Numerically stable logistic function 1 / (1 + exp(-z)).

        Only exp of non-positive numbers is evaluated, so large |z| cannot
        overflow to inf (which would otherwise produce NaN downstream).
        """
        z = np.asarray(z, dtype=float)
        e = np.exp(-np.abs(z))  # always in (0, 1]
        return np.where(z >= 0, 1. / (1. + e), e / (1. + e))

    def save_model(self, weight_file):
        """ Save well-trained weights into a binary file (stored as float32).
        Args:
            weight_file(str): binary file to save into.
        """
        self.W.astype('float32').tofile(weight_file)
        print ('model saved to', weight_file)

    def load_model(self, weight_file):
        """ Load pretrained weights from a binary file of float32 values.
        Args:
            weight_file(str): binary file to load from.
        """
        self.W = np.fromfile(weight_file, dtype=np.float32)
        print ('model loaded from', weight_file)

    def forward(self, X):
        """ Forward operation for logistic models.
            Performs the forward operation, and return probability score (sigmoid).
            The input is also remembered in self.X.
        Args:
            X(numpy.ndarray): input dataset with a dimension of (# of samples, ndims+1)
        Returns:
            (numpy.ndarray): probability score of (label == +1) for each sample
                             with a dimension of (# of samples,)
        """
        self.X = X
        return self._sigmoid(np.matmul(self.X, self.W))

    def backward(self, Y_true, X):
        """ Backward operation for logistic models.
            Computes the gradient of the loss sum_i log(1 + exp(-y_i * w.x_i))
            with respect to self.W, summed over all samples.
        Args:
            Y_true(numpy.ndarray): dataset labels with a dimension of (# of samples,)
            X(numpy.ndarray): input dataset with a dimension of (# of samples, ndims+1)
        Returns:
            (numpy.ndarray): gradients of self.W, with a dimension of (ndims+1,)
        """
        X = np.asarray(X)
        Y_true = np.asarray(Y_true)
        # Use the X passed in (not the one cached by forward) so the gradient
        # always matches the given data and works without a prior forward call.
        margins = Y_true * np.matmul(X, self.W)
        # exp(-m) / (1 + exp(-m)) == sigmoid(-m); the stable form avoids the
        # inf/inf = NaN that the naive ratio yields for very negative margins.
        weights = self._sigmoid(-margins)
        return -np.matmul(X.T, Y_true * weights)

    def classify(self, X):
        """ Performs binary classification on input dataset.
        Args:
            X(numpy.ndarray): input dataset with a dimension of (# of samples, ndims+1)
        Returns:
            (numpy.ndarray): predicted label = +1/-1 for each sample
                             with a dimension of (# of samples,);
                             +1 when the probability score is >= 0.5
        """
        return np.where(self.forward(X) >= 0.5, 1, -1)

    def fit(self, Y_true, X, learn_rate, max_iters):
        """ Train model with input dataset using gradient descent.
            The training accuracy is printed on every 100th iteration
            (iterations 0, 100, 200, ...). self.W is updated in place.
        Args:
            Y_true(numpy.ndarray): dataset labels with a dimension of (# of samples,)
            X(numpy.ndarray): input dataset with a dimension of (# of samples, ndims+1)
            learn_rate: learning rate for gradient descent
            max_iters: maximal number of iterations
        Returns:
            (numpy.ndarray): the trained weight vector self.W
        """
        for i in range(max_iters):
            self.W -= learn_rate * self.backward(Y_true, X)

            if i % 100 == 0:
                # Fraction of samples whose predicted label matches the truth.
                print(np.mean(self.classify(X) == Y_true))

        return self.W
            
            
        
        

    

In [161]:
# Model for 16 features (17 weights including the bias), all initialized to one.
model = LogisticModel(16, 'ones')

In [162]:
# Smoke test on the training set: probability scores and hard +1/-1 predictions.
f = model.forward(X)
print(f[1:10], len(f))  # scores of samples 1..9 and the total number of scores
c = model.classify(X)
# print(c[220:400], len(c))

[ 0.99999373  0.99997404  0.99997296  0.99998708  0.99992228  0.99999874
  0.9999981   0.99998111  0.99995259] 1290


In [163]:
# Evaluate the loss gradient at the current weights on the training set.
total_grad1 = model.backward(Y_true, X)

In [164]:
# Show the predictions and check that the gradient has one entry per weight.
print(c)
print(len(total_grad1))#, len(total_grad1[0]))

[1 1 1 ..., 1 1 1]
17


In [165]:
# Run gradient descent for 20 iterations with learning rate 0.001.
# fit prints the training accuracy only on iterations divisible by 100
# (here: iteration 0) and returns the weight vector, which this cell displays.
model.fit(Y_true, X, 0.001,20)
#print(np.sum([1 if Y_true[i]==-1 else 0 for i in range(len(Y_true))]))

0.582170542636


array([ 0.41631785,  1.0104817 ,  1.61936464, -3.62188704, -1.52157379,
        0.79156122, -0.66503333,  1.96390844,  2.20266528,  2.18294404,
        0.03468568, -0.20858415,  0.73128432,  0.30091493,  0.96691144,
        0.63954187,  0.91082492])

In [153]:
# Spot check: predictions vs. true labels for samples 10..14.
# NOTE(review): this cell's execution count is lower than that of the fit cell
# above, so the recorded output may stem from an earlier kernel state —
# re-run the notebook top to bottom to confirm.
c = model.classify(X)
print(c[10:15])
print(Y_true[10:15])


[-1 -1 -1 -1 -1]
[-1 -1 -1 -1 -1]


In [154]:
# Persist the current weights to a binary file in the working directory.
model.save_model('trained_weights.np')

model saved to trained_weights.np


In [155]:
# Replace the model weights with the ones stored in the binary file.
model.load_model('trained_weights.np')

model loaded from trained_weights.np


In [150]:
# Same spot check as before: predictions vs. true labels for samples 10..14.
# NOTE(review): execution counts are out of order here, so it is unclear
# whether this ran before or after the weights were reloaded — TODO confirm.
c = model.classify(X)
print(c[10:15])
print(Y_true[10:15])


[-1 -1 -1 -1 -1]
[-1 -1 -1 -1 -1]


In [156]:
# Display the current weight vector.
# NOTE(review): the recorded output is float32 and differs from the weights
# returned by the fit cell, so it presumably shows weights read back by
# load_model from an earlier training run — verify by re-running in order.
model.W

array([ 1.46911895, -0.0405732 ,  0.44669679, -2.33442211, -0.77778155,
       -0.52750027, -1.32548511,  2.19791985,  2.64580703,  1.19006515,
       -1.24726415, -3.28505969, -0.43239495, -0.62762898,  2.59026861,
       -2.24624372, -0.2439252 ], dtype=float32)