In [57]:
"""Input and output helpers to load in data.
"""
import numpy as np

def read_dataset_tf(path_to_dataset_folder,index_filename):
    """Read a dataset into numpy arrays, with preprocessing included.

    Every non-blank line of the index file is expected to hold
    ``<label> <sample_path>`` separated by whitespace, where sample_path is
    relative to the dataset folder. Each sample file holds the feature
    values of one sample, separated by any amount of whitespace.

    Args:
        path_to_dataset_folder(str): path to the folder containing the samples
                                     and the index file
        index_filename(str): name of the index file, e.g. indexing.txt
    Returns:
        A(numpy.ndarray): sample feature matrix A = [[1, x1],
                                                     [1, x2],
                                                     [1, x3],
                                                     .......]
                                where xi is the feature vector read from each
                                sample file; the leading 1 is the bias column

        T(numpy.ndarray): class label vector T = [y1, y2, y3, ...]
                             where negative labels are clamped to 0, so a
                             +1/-1 labelling in the index file becomes 1/0
    """
    with open(path_to_dataset_folder+'/'+index_filename, 'r') as f:
        # str.split() with no argument tolerates repeated spaces, tabs and the
        # trailing newline; blank lines (e.g. at the end of the file) are skipped.
        entries = [line.split() for line in f if line.strip()]

    # max(0, label) maps every negative label to 0 and keeps positive ones.
    T = np.array([max(0, float(entry[0])) for entry in entries])

    A = []
    for entry in entries:
        with open(path_to_dataset_folder+'/'+entry[1], 'r') as f:
            features = [float(value) for value in f.read().split()]
        # Prepend the constant 1 that multiplies the bias weight.
        A.append([1.] + features)
    A = np.array(A)

    return A, T

In [58]:
# Load the training set: X is the bias-augmented feature matrix and Y_true
# the label vector returned by read_dataset_tf.
# NOTE(review): this is a hardcoded absolute local path, so the cell only runs
# on the author's machine; consider a configurable data directory.
X, Y_true = read_dataset_tf('C:/Users/PIxel/CS446/mp3/data/trainset','indexing.txt')

In [59]:
# Sanity check: show the loaded label vector.
print(Y_true)

[ 0.  0.  0. ...,  1.  1.  1.]


In [64]:
"""logistic model class for binary classification."""
import tensorflow as tf
import numpy as np

class LogisticModel_TF(object):
    """Logistic model for binary classification, trained with TensorFlow.

    Intended call order: construct the model, call build_graph() once to
    create the training graph, then call fit() to train and obtain the
    sigmoid outputs.
    """

    def __init__(self, ndims, W_init='zeros'):
        """Initialize a logistic model.

        This function prepares an initialized logistic model.
        It will initialize the weight vector, self.W0, based on the method
        specified in W_init.

        We assume that the FIRST index of Weight is the bias term,
            Weight = [Bias, W1, W2, W3, ...]
            where Wi corresponds to each feature dimension

        W_init supports:
          'zeros': initialize the weights with all zeros.
          'ones': initialize the weights with all ones.
          'uniform': initialize the weights with uniform random numbers in [0,1)
          'gaussian': initialize the weights from a gaussian distribution
                      with mean 0 and standard deviation 0.1

        Args:
            ndims(int): feature dimension
            W_init(str): types of initialization.
        Raises:
            ValueError: if W_init is not one of the supported names.
        """
        self.ndims = ndims
        self.W_init = W_init
        self.W0 = None
        # One weight per feature plus the bias, stored as a column vector.
        shape = [self.ndims+1, 1]
        if W_init == 'zeros':
            self.W0 = tf.zeros(shape)
        elif W_init == 'ones':
            self.W0 = tf.ones(shape)
        elif W_init == 'uniform':
            self.W0 = tf.random_uniform(shape, maxval=1)
        elif W_init == 'gaussian':
            self.W0 = tf.random_normal(shape, mean=0.0, stddev=0.1)
        else:
            # Fail here rather than later in build_graph, where a missing
            # initial value would surface as a much less obvious error.
            raise ValueError(
                "Unknown W_init %r; expected 'zeros', 'ones', 'uniform' or 'gaussian'"
                % (W_init,))

    def build_graph(self, learn_rate):
        """Build the tensorflow training graph for the logistic model.

        Creates the weight variable, the input/label placeholders, the sigmoid
        prediction op, a mean-squared-error cost and a gradient descent
        training op, and stores them as attributes on the instance.

        Args:
            learn_rate: learn rate for gradient descent
        """
        self.W = tf.Variable(self.W0)
        self.lr = learn_rate
        self.X_TF = tf.placeholder(tf.float32, [None, self.ndims+1])
        self.y_TF = tf.placeholder(tf.int32, [None, 1])
        self.predictions = tf.sigmoid(tf.matmul(self.X_TF, self.W))
        # Labels are fed as integers; cast so they can be subtracted from the
        # float predictions.
        targets = tf.cast(self.y_TF, tf.float32)
        self.cost = tf.reduce_mean(tf.square(tf.subtract(targets, self.predictions)))
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.lr).minimize(self.cost)

    @staticmethod
    def _to_label_column(Y_true):
        """Return the labels as an integer column of shape (# of samples, 1)."""
        # reshape(-1, 1) accepts both a flat (N,) vector and an (N, 1) column.
        return np.asarray(Y_true).reshape(-1, 1).astype(int)

    @staticmethod
    def _threshold(pred):
        """Turn sigmoid outputs into a flat vector of 0/1 class decisions."""
        return (np.asarray(pred) >= 0.5).astype(int).ravel()

    @staticmethod
    def _accuracy(Y, decisions):
        """Return the fraction of samples whose decision equals its label."""
        return np.mean(np.ravel(Y) == np.ravel(decisions))

    def fit(self, Y_true, X, max_iters):
        """Train the model on the input dataset using gradient descent.

        build_graph() must have been called first. Every 100 iterations the
        current cost and training accuracy are printed. The latest 0/1
        decisions are kept in self.classify.

        Args:
            Y_true(numpy.ndarray): dataset labels, either a flat vector of
                                   length (# of samples) or a column with a
                                   dimension of (# of samples, 1)
            X(numpy.ndarray): input dataset with a dimension of (# of samples, ndims+1)
            max_iters: maximal number of training iterations
        Returns:
            (numpy.ndarray): sigmoid output of the model after the last
                             iteration, used for classification,
                             with a dimension of (# of samples, 1)
        """
        Y = self._to_label_column(Y_true)
        feed = {self.X_TF: X, self.y_TF: Y}
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            for epoch in range(max_iters):
                sess.run(self.optimizer, feed_dict=feed)
                if epoch % 100 == 0:
                    pred, cost = sess.run([self.predictions, self.cost], feed_dict=feed)
                    self.classify = self._threshold(pred)
                    acc_val = self._accuracy(Y, self.classify)
                    print(epoch, '..', cost,acc_val)
            # Evaluate after the loop so a result is returned even when
            # max_iters is 0 (no training step executed).
            pred = sess.run(self.predictions, feed_dict=feed)
        self.classify = self._threshold(pred)
        acc_val = self._accuracy(Y, self.classify)
        print("Final step accuracy:", acc_val)
        return pred
    
    

In [65]:
# Create a model for 16 feature dimensions with every weight initialised to one.
model =LogisticModel_TF(16,'ones')

In [66]:
# Build the training graph with a gradient descent learn rate of 0.01.
model.build_graph(0.01)

In [67]:
# Train for 10000 iterations on the loaded training set.
# NOTE: despite its name, acc_ve receives fit's return value, i.e. the sigmoid
# outputs for every sample, not an accuracy figure.
acc_ve = model.fit(Y_true, X,10000)

0 .. 0.571242 0.414728682171
100 .. 0.56734 0.416279069767
200 .. 0.562977 0.42015503876
300 .. 0.558415 0.42480620155
400 .. 0.553953 0.429457364341
500 .. 0.549821 0.434108527132
600 .. 0.546129 0.437209302326
700 .. 0.542892 0.443410852713
800 .. 0.540078 0.448062015504
900 .. 0.537632 0.448062015504
1000 .. 0.535493 0.450387596899
1100 .. 0.533601 0.45503875969
1200 .. 0.531901 0.45503875969
1300 .. 0.530346 0.458139534884
1400 .. 0.528893 0.459689922481
1500 .. 0.527506 0.462015503876
1600 .. 0.526153 0.463565891473
1700 .. 0.524801 0.464341085271
1800 .. 0.523422 0.465891472868
1900 .. 0.521989 0.466666666667
2000 .. 0.520475 0.466666666667
2100 .. 0.518858 0.46511627907
2200 .. 0.517121 0.46511627907
2300 .. 0.515251 0.462015503876
2400 .. 0.513245 0.467441860465
2500 .. 0.511101 0.468217054264
2600 .. 0.508823 0.468217054264
2700 .. 0.506409 0.46976744186
2800 .. 0.50385 0.468992248062
2900 .. 0.501119 0.471317829457
3000 .. 0.49817 0.477519379845
3100 .. 0.494924 0.48062015503

In [68]:
# Inspect the sigmoid outputs returned by fit for samples 10 through 49.
print(acc_ve[10:50])

[[  1.18065841e-01]
 [  3.92793864e-02]
 [  1.88404560e-01]
 [  2.57774610e-02]
 [  2.34458208e-01]
 [  5.01027480e-02]
 [  7.75374472e-02]
 [  1.52949896e-02]
 [  9.01941024e-03]
 [  3.52382064e-02]
 [  6.32262463e-03]
 [  1.17558241e-03]
 [  1.38561660e-02]
 [  5.98012693e-02]
 [  7.99618720e-04]
 [  4.76046326e-03]
 [  6.84783212e-04]
 [  3.76665555e-02]
 [  2.02227547e-03]
 [  1.10060321e-02]
 [  6.24356270e-02]
 [  3.66048096e-03]
 [  8.88417591e-04]
 [  6.36089360e-04]
 [  4.26505413e-03]
 [  1.14038102e-02]
 [  2.51440536e-02]
 [  2.60696840e-02]
 [  4.33882289e-02]
 [  7.62630478e-02]
 [  8.08231592e-01]
 [  3.56773697e-02]
 [  7.56500568e-03]
 [  1.47413025e-02]
 [  2.05336809e-01]
 [  5.66402934e-02]
 [  9.20932647e-03]
 [  5.34996875e-02]
 [  1.43743515e-01]
 [  7.44304806e-03]]


In [143]:
# NOTE(review): 1290 is presumably the number of training samples, so this
# prints the midpoint index — confirm against X.shape[0].
print(1290/2)
# Show labels 770 through 799 to see where they change value.
print(Y_true[770:800])

645.0
[ 0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
