# Assignment1: Logistic Regression, SGD, and Regularization

First, let's load the data.

In [1]:
sst_home = '../trees'

import re
import random

# Let's do 2-way positive/negative classification instead of 5-way
easy_label_map = {0:0, 1:0, 2:None, 3:1, 4:1}

def load_sst_data(path):
    data = []
    with open(path) as f:
        for i, line in enumerate(f): 
            example = {}
            example['label'] = easy_label_map[int(line[1])]
            if example['label'] is None:
                continue
            
            # Strip out the parse information and the phrase labels---we don't need those here
            text = re.sub(r'\s*(\(\d)|(\))\s*', '', line)
            example['text'] = text[1:]
            data.append(example)

    random.seed(1)
    random.shuffle(data)
    return data
     
training_set = load_sst_data(sst_home + '/train.txt')
dev_set = load_sst_data(sst_home + '/dev.txt')
test_set = load_sst_data(sst_home + '/test.txt')

#print(training_set[0])
#print(dev_set)
#print(test_set)

# Note: Unlike with feature based classifiers, evaluation here should be fast, 
# and we don't need to trim down the dev and test sets. 

And extract bag-of-words feature vectors.

In [2]:
import collections
import numpy as np

def feature_function(datasets):
    '''Annotates datasets with feature vectors.'''
    
    # Extract vocabulary
    def tokenize(string):
        return string.split()
    
    word_counter = collections.Counter()
    for example in datasets[0]:
        word_counter.update(tokenize(example['text']))
    
    vocabulary = set([word for word in word_counter])
                                
    feature_names = set()
    for i, dataset in enumerate(datasets):
        for example in dataset:
            example['features'] = collections.defaultdict(float)
            
            # Extract features (by name) for one example
            word_counter = collections.Counter(tokenize(example['text']))
            for x in word_counter.items():
                if x[0] in vocabulary:
                    example["features"]["word_count_for_" + x[0]] = x[1]
            
            feature_names.update(example['features'].keys())
                            
    # By now, we know what all the features will be, so we can
    # assign indices to them.
    feature_indices = dict(zip(feature_names, range(len(feature_names))))
    indices_to_features = {v: k for k, v in feature_indices.items()}
    dim = len(feature_indices)
                
    # Now we create actual vectors from those indices.
    for dataset in datasets:
        for example in dataset:
            example['vector'] = np.zeros((dim))
            for feature in example['features']:
                example['vector'][feature_indices[feature]] = example['features'][feature]
    return indices_to_features, dim
    
indices_to_features, dim = feature_function([training_set, dev_set, test_set])



And define an evalution function. This is a bit different, since it's designed to let us test an entire big batch of examples at once with the classifier, rather than passing them in one by one. (For larger models or larger training sets, this could run out of memory, but it should be fine for now.)

In [3]:
def evaluate_classifier(classifier, eval_set):
    correct = 0
    hypotheses = classifier(eval_set)
    for i, example in enumerate(eval_set):
        hypothesis = hypotheses[i]
        if hypothesis == example['label']:
            correct += 1        
    return correct / float(len(eval_set))

Now for the fun part! The below should be a working implementation of logistic regression in TensorFlow.

In [4]:
import tensorflow as tf

In [9]:
class logistic_regression_classifier:
    def __init__(self, dim, lr=1.0, epchs=50):
        # Define the hyperparameters
        self.learning_rate = lr  # Maybe? Let's tune this --> BEST learning rate: 0.9
        self.training_epochs = epchs  # How long to train for - chosen to fit within class time
        self.display_epoch_freq = 1  # How often to test and print out statistics
        self.dim = dim  # The number of features
        self.batch_size = 256  # Somewhat arbitrary - can be tuned, but often tune for speed, not accuracy
        
        # Define the inputs
        self.x = tf.placeholder(tf.float32, [None, dim])
        self.y = tf.placeholder(tf.int32, [None])
        
        # Define (most of) the model
        self.W = tf.Variable(tf.zeros([self.dim, 2]))
        self.b = tf.Variable(tf.zeros([2]))
        self.logits = tf.matmul(self.x, self.W) + self.b

        # Define the cost function (here, the exp and sum are built in)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y))
        # Loss function with L2 Regularization with beta=0.01
        regularizer = tf.nn.l2_loss(self.W)
        self.cost = self.cost+self.b*regularizer
        
        # This library call performs the main SGD update equation
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.cost)
        
        # Create an operation to fill zero values in for W and b
        self.init = tf.global_variables_initializer()
        
        # Create a placeholder for the session that will be shared between training and evaluation
        self.sess = None
        
    def train(self, training_data, dev_set):
        def get_minibatch(dataset, start_index, end_index):
            indices = range(start_index, end_index)
            vectors = np.vstack([dataset[i]['vector'] for i in indices])
            labels = [dataset[i]['label'] for i in indices]
            return vectors, labels
        
        self.sess = tf.Session()
        
        self.sess.run(self.init)
        print('Training.')

        # Training cycle
        for epoch in range(self.training_epochs):
            random.shuffle(training_set)
            avg_cost = 0.
            total_batch = int(len(training_set) / self.batch_size)
            
            # Loop over all batches in epoch
            for i in range(total_batch):
                # Assemble a minibatch of the next B examples
                minibatch_vectors, minibatch_labels = get_minibatch(training_set, 
                                                                    self.batch_size * i, 
                                                                    self.batch_size * (i + 1))

                # Run the optimizer to take a gradient step, and also fetch the value of the 
                # cost function for logging
                _, c = self.sess.run([self.optimizer, self.cost], 
                                     feed_dict={self.x: minibatch_vectors,
                                                self.y: minibatch_labels})
                                                                    
                # Compute average loss
                avg_cost += c / (total_batch * self.batch_size)
                
            # Display some statistics about the step
            if (epoch+1) % self.display_epoch_freq == 0:
                print("Epoch:", (epoch+1), "Cost:", avg_cost,
                      "Dev acc:", evaluate_classifier(self.classify, dev_set[0:500]), 
                      "Train acc:", evaluate_classifier(self.classify, training_set[0:500]))
    
    def classify(self, examples):
        # This classifies a list of examples
        vectors = np.vstack([example['vector'] for example in examples])
        logits = self.sess.run(self.logits, feed_dict={self.x: vectors})
        return np.argmax(logits, axis=1)

Now let's train it.

In [21]:
classifier = logistic_regression_classifier(dim, 0.9)
classifier.train(training_set, dev_set)

Training.
Epoch: 1 Cost: 0.00310194683779 Dev acc: 0.498 Train acc: 0.506
Epoch: 2 Cost: 0.00276696550901 Dev acc: 0.678 Train acc: 0.684
Epoch: 3 Cost: 0.00224092538917 Dev acc: 0.61 Train acc: 0.58
Epoch: 4 Cost: 0.00229212493394 Dev acc: 0.674 Train acc: 0.682
Epoch: 5 Cost: 0.00206224330597 Dev acc: 0.764 Train acc: 0.792
Epoch: 6 Cost: 0.00199582357029 Dev acc: 0.758 Train acc: 0.808
Epoch: 7 Cost: 0.00187951964068 Dev acc: 0.768 Train acc: 0.82
Epoch: 8 Cost: 0.00191835991831 Dev acc: 0.69 Train acc: 0.744
Epoch: 9 Cost: 0.00183469878889 Dev acc: 0.762 Train acc: 0.796
Epoch: 10 Cost: 0.00172145434017 Dev acc: 0.778 Train acc: 0.822
Epoch: 11 Cost: 0.00169320480415 Dev acc: 0.78 Train acc: 0.866
Epoch: 12 Cost: 0.00169078079139 Dev acc: 0.728 Train acc: 0.774
Epoch: 13 Cost: 0.00166268272463 Dev acc: 0.78 Train acc: 0.842
Epoch: 14 Cost: 0.00158987081334 Dev acc: 0.786 Train acc: 0.872
Epoch: 15 Cost: 0.00154061483961 Dev acc: 0.788 Train acc: 0.88
Epoch: 16 Cost: 0.0015114407115

And evaluate it.

In [22]:
evaluate_classifier(classifier.classify, dev_set)

0.768348623853211

## Assignments

### Our goals
  1. **Pick an effective learning rate**:
      - You could set up the learning rate value by passing it as argument (e.g. in `__init__ (self, dim, lr=1.0, ...)` )
      - Try small and larger values to see the behavior of the model.
  
  2. **Implement L2 regularization:**
      - Hint: Add regularization term to overal cost (`self.cost`)
      - Tensorflow already built in method for this. Check the API to find out. 
      - (Optionaly) Code it without using the built in tool for it

  3. **Pick an effective L2 weight:**
      - You could set up the learning rate value by passing it as argument (e.g. in `__init__ (self, dim, lw=1.0, ...)` )
      - Try small and larger values to see the behavior of the model.
  
  4. **Look at some learning curves:**
      - This code might be helpful: http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
