# Lab2: MLPs and Dropout

First, let's load the data as before.

In [2]:
# Directory holding the SST tree files (train.txt, dev.txt, test.txt) loaded below
sst_home = '../trees'

import re
import random

# Let's do 2-way positive/negative classification instead of 5-way.
# Maps each 5-way label to a binary one; None marks label 2, which is dropped.
easy_label_map = {0:0, 1:0, 2:None, 3:1, 4:1}

def load_sst_data(path):
    '''Loads one SST tree file as a shuffled list of binary-labeled examples.

    Every non-blank line is expected to be a parenthesized tree whose second
    character is the sentence-level label digit, e.g. "(3 (2 It) (4 ...))".

    Args:
        path: Path of the tree file to read.

    Returns:
        A list of dicts, each with keys 'label' (0 for original labels 0-1,
        1 for original labels 3-4) and 'text' (the sentence with parse
        brackets and phrase labels removed). Lines labeled 2 and blank lines
        are skipped.

    Side effects:
        Reseeds the global `random` module with 1 before shuffling, so the
        returned order is the same on every call for the same file.
    '''
    data = []
    with open(path) as f:
        for line in f:
            # A blank line (typically a trailing newline at the end of the
            # file) holds no example; indexing line[1] on it would raise
            # IndexError, so skip it.
            if not line.strip():
                continue

            label = easy_label_map[int(line[1])]
            if label is None:
                continue

            # Strip out the parse information and the phrase labels---we don't need those here
            text = re.sub(r'\s*(\(\d)|(\))\s*', '', line)
            # The substitution leaves one leading space before the first word; drop it.
            data.append({'label': label, 'text': text[1:]})

    random.seed(1)
    random.shuffle(data)
    return data
     
# Load the three splits. Each call reseeds the global `random` module with 1
# before shuffling, so the example order is the same on every run.
training_set = load_sst_data(sst_home + '/train.txt')
dev_set = load_sst_data(sst_home + '/dev.txt')
test_set = load_sst_data(sst_home + '/test.txt') 

# Sizes are counted after the label-2 examples have been dropped by the loader.
print('Training size: {}'.format(len(training_set)))
print('Dev size: {}'.format(len(dev_set)))
print('Test size: {}'.format(len(test_set)))

Training size: 6920
Dev size: 872
Test size: 1821


And extract bag-of-words feature vectors. For speed, we'll only use words that appear more than 10 times in the training set, leaving us with $|V|=1254$.

In [3]:
import collections
import numpy as np

def feature_function(datasets):
    '''Annotates datasets with bag-of-words feature vectors.

    The vocabulary is built from the first dataset only (the training set):
    a word is kept when it occurs more than 10 times there. Every example in
    every dataset is then modified in place, gaining:
      * 'features': a defaultdict mapping "word_count_for_<word>" to the
        number of times <word> occurs in the example's text, and
      * 'vector': a numpy array of length `dim` holding those counts.

    Feature indices are assigned in sorted feature-name order, so the column
    layout is identical across runs regardless of string hash randomization.

    Args:
        datasets: A list of datasets; each dataset is a list of example dicts
            with a 'text' key. datasets[0] supplies the vocabulary.

    Returns:
        A tuple (indices_to_features, dim): a dict from vector index to
        feature name, and the length of each feature vector.
    '''

    # Extract vocabulary
    def tokenize(string):
        return string.split()

    training_counts = collections.Counter()
    for example in datasets[0]:
        training_counts.update(tokenize(example['text']))

    vocabulary = {word for word, count in training_counts.items() if count > 10}

    feature_names = set()
    for dataset in datasets:
        for example in dataset:
            example['features'] = collections.defaultdict(float)

            # Extract features (by name) for one example
            example_counts = collections.Counter(tokenize(example['text']))
            for word, count in example_counts.items():
                if word in vocabulary:
                    example['features']['word_count_for_' + word] = count

            feature_names.update(example['features'].keys())

    # By now, we know what all the features will be, so we can
    # assign indices to them. Sorting first makes the assignment
    # deterministic; iterating the set directly would not be.
    feature_indices = {name: index for index, name in enumerate(sorted(feature_names))}
    indices_to_features = {index: name for name, index in feature_indices.items()}
    dim = len(feature_indices)

    # Now we create actual vectors from those indices.
    for dataset in datasets:
        for example in dataset:
            example['vector'] = np.zeros((dim))
            for feature, value in example['features'].items():
                example['vector'][feature_indices[feature]] = value
    return indices_to_features, dim
    
# Annotate every example in all three splits in place with 'features' and
# 'vector' entries; the vocabulary comes from the first list (training_set).
indices_to_features, dim = feature_function([training_set, dev_set, test_set])

# dim is the number of distinct features, i.e. the length of each vector
print('Vocabulary size: {}'.format(dim))

Vocabulary size: 1254


And define a batch evaluation function.

In [4]:
def evaluate_classifier(classifier, eval_set):
    '''Returns the fraction of examples in eval_set that classifier gets right.

    Args:
        classifier: A callable that takes the whole list of examples and
            returns an indexable sequence of predicted labels, one per example.
        eval_set: A list of example dicts, each with a 'label' key.

    Returns:
        The accuracy as a float between 0 and 1.
    '''
    predictions = classifier(eval_set)
    num_correct = sum(
        1
        for position, example in enumerate(eval_set)
        if predictions[position] == example['label']
    )
    return num_correct / float(len(eval_set))

## Assignments

Now for the fun part! The below should be a working implementation of logistic regression in TensorFlow.

### Part One:

Modify it to turn it into an MLP with two ReLU hidden layers of 50 dimensions.

Keep in mind that initializing weight matrices with zeros causes problems in deep neural networks trained by SGD. (Why?) You should use tf.random_normal instead, with stddev=0.1.

If your model works, it should be able to overfit, reaching about 90% accuracy *on the training set* in the first 100 epochs.

### Part Two:

After each hidden layer, add dropout with an 80% keep rate. You're welcome to use `tf.nn.dropout`.

Remember that dropout behaves differently at training time and at test time. This is not automatic. You can implement this in various ways, but one easy way is:

- Hint: Treat the keep rate as an input to the model, just like `x`. At training time, feed it a value of `0.8`, at test time, feed it a value of `1.0`. You can explore different dropout values.

If dropout works, your model should overfit less, but should still perform about as well (or, hopefully, better) on the dev set.

In [5]:
import tensorflow as tf

In [48]:
class logistic_regression_classifier:
    '''MLP classifier with two ReLU hidden layers and dropout (TensorFlow 1.x graph API).

    The class keeps the name of the logistic-regression starter code it grew
    out of. The graph is built once in __init__; train() opens a session and
    runs minibatch SGD; classify() reuses that session for prediction.

    Dropout is controlled by the `keep_rate` placeholder, which must be fed on
    every run: the training keep rate during training and 1.0 (no dropout)
    when classifying.
    '''

    def __init__(self, dim, classes=2):
        '''Builds the computation graph.

        Args:
            dim: Number of input features (length of each example's 'vector').
            classes: Number of output classes.
        '''
        # Define the hyperparameters
        self.learning_rate = 0.3  # Should be about right
        self.training_epochs = 100  # How long to train for - chosen to fit within class time
        self.display_epoch_freq = 1  # How often to test and print out statistics
        self.dim = dim  # The number of features
        self.outclasses = classes  # The number of output classes
        self.batch_size = 256  # Somewhat arbitrary - can be tuned, but often tune for speed, not accuracy

        # Widths of the two hidden layers
        self.hidden_layer_sizes = [50, 50]

        ### Start of model definition ###

        # Define the inputs
        self.x = tf.placeholder(tf.float32, [None, dim])
        self.y = tf.placeholder(tf.int32, [None])
        # Scalar keep probability for dropout; fed at run time so that training
        # and evaluation can use different values on the same graph.
        self.keep_rate = tf.placeholder(tf.float32, shape=())

        # Weights and biases are drawn from a normal distribution with
        # stddev=0.1 rather than initialized to zero.

        # Hidden layer 1: affine -> ReLU -> dropout
        self.W0 = tf.Variable(tf.random_normal([self.dim, self.hidden_layer_sizes[0]],stddev=0.1))
        self.b0 = tf.Variable(tf.random_normal([self.hidden_layer_sizes[0]],stddev=0.1))
        self.logits0 = tf.matmul(self.x, self.W0) + self.b0
        self.h0 = tf.nn.dropout(tf.nn.relu(self.logits0),self.keep_rate)

        # Hidden layer 2: affine -> ReLU (via tf.nn.relu_layer) -> dropout
        self.W1 = tf.Variable(tf.random_normal([self.hidden_layer_sizes[0], self.hidden_layer_sizes[1]],stddev=0.1))
        self.b1 = tf.Variable(tf.random_normal([self.hidden_layer_sizes[1]],stddev=0.1))
        self.h1 = tf.nn.dropout(tf.nn.relu_layer(self.h0,self.W1,self.b1),self.keep_rate)

        # Output layer: affine map to one unnormalized score per class
        self.W2 = tf.Variable(tf.random_normal([self.hidden_layer_sizes[1], self.outclasses],stddev=0.1))
        self.b2 = tf.Variable(tf.random_normal([self.outclasses],stddev=0.1))

        self.logitsh2 = tf.matmul(self.h1, self.W2) + self.b2

        ### End of model definition ###

        # Define the cost function (here, the exp and sum are built in)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logitsh2, labels=self.y))

        # Optionally you could add L2 regularization term

        # This library call performs the main SGD update equation
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.cost)

        # Create an operation that runs the initializers of all variables
        self.init = tf.global_variables_initializer()

        # Create a placeholder for the session that will be shared between training and evaluation
        self.sess = None

    def train(self, training_data, dev_set, keep_rate=1.0):
        '''Trains the model with minibatch SGD, printing statistics per epoch.

        Args:
            training_data: List of example dicts with 'vector' and 'label'
                keys. It is shuffled in place at the start of every epoch.
            dev_set: List of example dicts used only for the reported dev
                accuracy (first 500 examples).
            keep_rate: Dropout keep probability fed during training steps.
                Evaluation inside this method always uses classify()'s
                default of 1.0.

        Examples beyond the last full minibatch of an epoch are not used in
        that epoch.
        '''
        def get_minibatch(dataset, start_index, end_index):
            indices = range(start_index, end_index)
            vectors = np.vstack([dataset[i]['vector'] for i in indices])
            labels = [dataset[i]['label'] for i in indices]
            return vectors, labels

        # Release the session from any earlier train() call before replacing it
        if self.sess is not None:
            self.sess.close()
        self.sess = tf.Session()

        self.sess.run(self.init)
        print ('Training.')

        # Training cycle
        for epoch in range(self.training_epochs):
            random.shuffle(training_data)
            avg_cost = 0.
            total_batch = int(len(training_data) / self.batch_size)

            # Loop over all batches in epoch
            for i in range(total_batch):
                # Assemble a minibatch of the next B examples
                minibatch_vectors, minibatch_labels = get_minibatch(training_data,
                                                                    self.batch_size * i,
                                                                    self.batch_size * (i + 1))

                # Run the optimizer to take a gradient step, and also fetch the value of the
                # cost function for logging
                _, c = self.sess.run([self.optimizer, self.cost],
                                     feed_dict={self.x: minibatch_vectors,
                                                self.y: minibatch_labels,
                                                self.keep_rate:keep_rate})

                # Compute average loss
                avg_cost += c / total_batch

            # Display some statistics about the step
            if (epoch+1) % self.display_epoch_freq == 0:
                print ("Epoch:", (epoch+1), "Cost:", avg_cost, \
                    "Dev acc:", evaluate_classifier(self.classify, dev_set[0:500]), \
                    "Train acc:", evaluate_classifier(self.classify, training_data[0:500]))

    def classify(self, examples, keep_rate=1.0):
        '''Predicts a class index for each example.

        Args:
            examples: List of example dicts with a 'vector' key.
            keep_rate: Dropout keep probability to feed; the default of 1.0
                disables dropout, which is what evaluation should use.

        Returns:
            A numpy array with the argmax class index of each example.

        Requires train() to have been called first, since it uses the
        session that train() creates.
        '''
        # This classifies a list of examples
        vectors = np.vstack([example['vector'] for example in examples])
        logits = self.sess.run(self.logitsh2, feed_dict={self.x: vectors,
                                                            self.keep_rate:keep_rate})
        return np.argmax(logits, axis=1)

Now let's train it.

In [64]:
# Build the graph for `dim` input features, with the default of two output classes
classifier = logistic_regression_classifier(dim)
# The third argument is the dropout keep rate fed during training steps
classifier.train(training_set, dev_set, 0.8)

Training.
Epoch: 1 Cost: 0.696899431723 Dev acc: 0.528 Train acc: 0.532
Epoch: 2 Cost: 0.689820815016 Dev acc: 0.536 Train acc: 0.494
Epoch: 3 Cost: 0.685101416376 Dev acc: 0.598 Train acc: 0.57
Epoch: 4 Cost: 0.681895399535 Dev acc: 0.612 Train acc: 0.6
Epoch: 5 Cost: 0.67757311353 Dev acc: 0.626 Train acc: 0.548
Epoch: 6 Cost: 0.670903199249 Dev acc: 0.632 Train acc: 0.588
Epoch: 7 Cost: 0.665237320794 Dev acc: 0.636 Train acc: 0.632
Epoch: 8 Cost: 0.656296500453 Dev acc: 0.64 Train acc: 0.622
Epoch: 9 Cost: 0.650778183231 Dev acc: 0.638 Train acc: 0.652
Epoch: 10 Cost: 0.641750309202 Dev acc: 0.66 Train acc: 0.662
Epoch: 11 Cost: 0.629644853097 Dev acc: 0.662 Train acc: 0.706
Epoch: 12 Cost: 0.610863617173 Dev acc: 0.666 Train acc: 0.688
Epoch: 13 Cost: 0.602286031953 Dev acc: 0.654 Train acc: 0.684
Epoch: 14 Cost: 0.59497056184 Dev acc: 0.698 Train acc: 0.748
Epoch: 15 Cost: 0.573448808105 Dev acc: 0.696 Train acc: 0.7
Epoch: 16 Cost: 0.57342915844 Dev acc: 0.7 Train acc: 0.768
Epo

And evaluate it.

In [65]:
# Accuracy on the full dev set; classify() defaults keep_rate to 1.0, so no dropout is applied here
evaluate_classifier(classifier.classify, dev_set)

0.7385321100917431