# Lab2: MLPs and Dropout

First, let's load the data as before.

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Load the data
import re
import random

# Let's do 2-way positive/negative classification instead of 5-way    
def load_sst_data(path,
                  easy_label_map={0:0, 1:0, 2:None, 3:1, 4:1}):
    """Read a tree file and return its examples as a shuffled list of dicts.

    The character at index 1 of every line is parsed as an integer label and
    translated through ``easy_label_map``; lines whose mapped label is
    ``None`` are dropped.

    Args:
        path: Path of the text file to read, one example per line.
        easy_label_map: Mapping from the label found in the file to the label
            stored in the example. A value of ``None`` discards the line.

    Returns:
        A list of ``{'label': ..., 'text': ...}`` dicts, shuffled after
        seeding the global ``random`` generator with 1 (so the order is the
        same on every call, and the global generator state is reset).
    """
    # Matches the "(<digit>" openers and the ")" closers of the parse markup,
    # i.e. the parse information and phrase labels we do not need here.
    tree_markup = re.compile(r'\s*(\(\d)|(\))\s*')

    examples = []
    with open(path) as tree_file:
        for raw_line in tree_file:
            label = easy_label_map[int(raw_line[1])]
            if label is None:
                continue

            sentence = tree_markup.sub('', raw_line)
            # The first character left after stripping the markup is dropped.
            examples.append({'label': label, 'text': sentence[1:]})

    random.seed(1)
    random.shuffle(examples)
    return examples

# Directory holding the tree files; the three splits are loaded from it.
sst_home = 'drive/My Drive/Colab Notebooks/dl4nlp_labs/data/trees/'
training_set, dev_set, test_set = [
    load_sst_data(sst_home + split_file)
    for split_file in ('train.txt', 'dev.txt', 'test.txt')
]

# Report how many examples each split kept, one line per split.
print('Training size: {}'.format(len(training_set)),
      'Dev size: {}'.format(len(dev_set)),
      'Test size: {}'.format(len(test_set)),
      sep='\n')

Training size: 6920
Dev size: 872
Test size: 1821


And extract bag-of-words feature vectors. For speed, we'll only use words that appear more than 10 times in the training set, leaving us with $|V|=1254$.

In [3]:
import collections
import numpy as np

def feature_function(datasets):
    '''Annotates datasets with bag-of-words feature vectors.

    The vocabulary is built from the first dataset only: a word is kept when
    it occurs more than 10 times there. Every example in every dataset is
    then given two new keys:

    * ``'features'``: a ``defaultdict(float)`` mapping
      ``'word_count_for_<word>'`` to the word's count in that example
      (vocabulary words only).
    * ``'vector'``: a NumPy array of length ``dim`` holding those counts at
      the index assigned to each feature.

    Feature indices are assigned in sorted feature-name order, so the mapping
    is identical on every run instead of depending on set iteration order.

    Args:
        datasets: Sequence of datasets, each a list of dicts with a ``'text'``
            key. The examples are modified in place.

    Returns:
        ``(indices_to_features, dim)``: a dict from vector index to feature
        name, and the length of the feature vectors.
    '''

    def tokenize(string):
        return string.split()

    # Extract the vocabulary from the first dataset only.
    vocabulary_counts = collections.Counter()
    for example in datasets[0]:
        vocabulary_counts.update(tokenize(example['text']))

    vocabulary = {word for word, count in vocabulary_counts.items() if count > 10}

    feature_names = set()
    for dataset in datasets:
        for example in dataset:
            example['features'] = collections.defaultdict(float)

            # Extract features (by name) for one example.
            example_counts = collections.Counter(tokenize(example['text']))
            for word, count in example_counts.items():
                if word in vocabulary:
                    example['features']['word_count_for_' + word] = count

            feature_names.update(example['features'].keys())

    # By now, we know what all the features will be, so we can assign indices
    # to them. Sorting makes the assignment deterministic: iterating the raw
    # set would give an order that can change from one Python process to the
    # next because string hashing is randomized.
    feature_indices = {name: index for index, name in enumerate(sorted(feature_names))}
    indices_to_features = {index: name for name, index in feature_indices.items()}
    dim = len(feature_indices)

    # Now we create actual vectors from those indices.
    for dataset in datasets:
        for example in dataset:
            vector = np.zeros((dim))
            for feature, value in example['features'].items():
                vector[feature_indices[feature]] = value
            example['vector'] = vector
    return indices_to_features, dim
    
# Annotate all three splits in place with 'features' and 'vector' entries;
# the first list passed (the training set) is the one the vocabulary is built from.
indices_to_features, dim = feature_function([training_set, dev_set, test_set])

# dim is the length of every feature vector.
print('Vocabulary size: {}'.format(dim))



Vocabulary size: 1254


And define a batch evaluation function.

In [4]:
def evaluate_classifier(classifier, eval_set):
    """Return the fraction of examples in ``eval_set`` labelled correctly.

    Args:
        classifier: Callable that takes the whole list of examples and returns
            an indexable collection with one predicted label per example.
        eval_set: List of example dicts, each with a ``'label'`` key.

    Returns:
        Number of predictions equal to the example's label, divided by the
        number of examples, as a float.
    """
    # The classifier is called once on the full set (batch evaluation).
    predictions = classifier(eval_set)
    num_correct = sum(
        1
        for position, example in enumerate(eval_set)
        if predictions[position] == example['label']
    )
    return num_correct / float(len(eval_set))

## Assignments

Now for the fun part! The below should be a working implementation of logistic regression in TensorFlow.

### Part One:

Modify it to turn it into an MLP with two ReLU hidden layers of 50 dimensions.

Keep in mind that initializing weight matrices with zeros causes problems in deep neural networks trained by SGD. (Why?) You should use `tf.random.normal` instead, with `stddev=0.1`.

If your model works, it should be able to overfit, reaching about 90% accuracy *on the training set* in the first 100 epochs.

### Part Two:

After each hidden layer, add dropout with an 80% keep rate (20% drop rate). You're welcome to use `tf.nn.dropout`.

Remember that dropout behaves differently at training time and at test time. This is not automatic. You can implement this in various ways, but an easy way is the following:

- Hint: Treat the drop rate as an input to the model, just like `x`. At training time, feed it a value of `0.2`; at test time, feed it a value of `0.0`. (In TensorFlow 2, the second argument of `tf.nn.dropout` is the drop rate, not the keep rate.) You can explore different dropout values.

If dropout works, your model should overfit less, but should still perform about as well (or, hopefully, better) on the dev set.

In [5]:
# Colab line magic requesting the TensorFlow 2.x runtime; it must run before the import below.
%tensorflow_version 2.x
import tensorflow as tf
# Last expression of the cell: shows the TensorFlow version actually loaded.
tf.__version__

'2.7.0'

In [9]:
class logistic_regression_classifier:
    """MLP classifier: two ReLU hidden layers with dropout and a 2-way output.

    The class keeps the name of the logistic-regression model it was derived
    from so that cells constructing it continue to work.
    """

    def __init__(self, dim):
        """Set the hyperparameters and create the weight and bias variables.

        Args:
            dim: Number of input features, i.e. the length of each example's
                'vector'.
        """
        # Define the hyperparameters
        self.learning_rate = 0.3  # Should be about right
        self.training_epochs = 100  # How long to train for - chosen to fit within class time
        self.display_epoch_freq = 1  # How often to test and print out statistics
        self.dim = dim  # The number of features
        self.batch_size = 256  # Somewhat arbitrary - can be tuned, but often tune for speed, not accuracy

        self.hidden_layer_sizes = [50, 50]  # Width of each hidden layer
        self.rate = 0.2  # Dropout (drop) rate applied after each hidden layer while training

        ### Start of model definition ###
        # Every variable created below is appended here, in creation order,
        # so that train() can differentiate with respect to all of them.
        self.trainable_variables = []

        # Hidden ReLU layer 0
        self.W0, self.b0 = self._new_layer_variables(self.dim, self.hidden_layer_sizes[0])
        # Hidden ReLU layer 1
        self.W1, self.b1 = self._new_layer_variables(self.hidden_layer_sizes[0],
                                                     self.hidden_layer_sizes[1])
        # Output layer (2 classes)
        self.W2, self.b2 = self._new_layer_variables(self.hidden_layer_sizes[1], 2)

    def _new_layer_variables(self, n_inputs, n_outputs):
        """Create, register and return the (weights, bias) pair of one layer."""
        # Weights are drawn from a normal distribution (stddev 0.1) rather
        # than zeros; biases start at zero.
        weights = tf.Variable(tf.random.normal([n_inputs, n_outputs], stddev=0.1), dtype='float32')
        bias = tf.Variable(tf.zeros([n_outputs]), dtype='float32')
        self.trainable_variables.append(weights)
        self.trainable_variables.append(bias)
        return weights, bias

    def model(self, x, rate):
        """Run the forward pass and return the output-layer logits.

        Args:
            x: Batch of float32 feature vectors with ``self.dim`` columns.
            rate: Dropout rate passed to ``tf.nn.dropout`` after each hidden
                layer (the fraction of units dropped). ``train`` passes
                ``self.rate``; ``classify`` passes ``0.0``.

        Returns:
            Tensor of logits with one row per input row and 2 columns.
        """
        # Hidden ReLU layer 0 activation
        self.logits0 = tf.matmul(x, self.W0) + self.b0
        self.h0 = tf.nn.relu(self.logits0)
        self.h0 = tf.nn.dropout(self.h0, rate)

        # Hidden ReLU layer 1 activation
        self.logits1 = tf.matmul(self.h0, self.W1) + self.b1
        self.h1 = tf.nn.relu(self.logits1)
        self.h1 = tf.nn.dropout(self.h1, rate)

        # Output layer activation
        logits = tf.matmul(self.h1, self.W2) + self.b2
        ### End of model definition ###
        return logits

    def train(self, training_data, dev_set):
        """Train with minibatch SGD, printing statistics as training proceeds.

        Args:
            training_data: List of example dicts with 'vector' and 'label'
                keys. It is not modified; a shallow copy is shuffled instead.
            dev_set: List of example dicts used for the "Dev acc" statistic
                (first 500 examples only).
        """
        def get_minibatch(dataset, start_index, end_index):
            batch = dataset[start_index:end_index]
            vectors = np.float32(np.vstack([example['vector'] for example in batch]))
            labels = [example['label'] for example in batch]
            return vectors, labels

        print('Training.')

        # Built once per training run rather than once per minibatch.
        optimizer = tf.keras.optimizers.SGD(self.learning_rate)

        # Shuffle a private copy so the caller's list keeps its order.
        shuffled_data = list(training_data)
        # Only full batches are used; leftover examples at the end of an
        # epoch's ordering are skipped for that epoch.
        total_batch = int(len(shuffled_data) / self.batch_size)

        # Training cycle
        for epoch in range(self.training_epochs):
            random.shuffle(shuffled_data)
            avg_cost = 0.

            # Loop over all batches in epoch
            for i in range(total_batch):
                # Assemble a minibatch of the next B examples
                minibatch_vectors, minibatch_labels = get_minibatch(shuffled_data,
                                                                    self.batch_size * i,
                                                                    self.batch_size * (i + 1))

                # Record the forward pass so the cost can be differentiated
                with tf.GradientTape() as tape:
                    logits = self.model(minibatch_vectors, self.rate)
                    # Define the cost function (here, the exp and sum are built in)
                    cost = tf.reduce_mean(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                       labels=minibatch_labels))

                # Optionally you could add L2 regularization term

                # Main SGD update
                gradients = tape.gradient(cost, self.trainable_variables)
                optimizer.apply_gradients(zip(gradients, self.trainable_variables))

                # Compute average loss
                avg_cost += cost / total_batch

            # Display some statistics about the epoch; accuracies are computed
            # with dropout disabled (see classify) on the first 500 examples.
            if (epoch + 1) % self.display_epoch_freq == 0:
                tf.print("Epoch:", (epoch + 1), "Cost:", avg_cost,
                         "Dev acc:", evaluate_classifier(self.classify, dev_set[0:500]),
                         "Train acc:", evaluate_classifier(self.classify, shuffled_data[0:500]))

    def classify(self, examples):
        """Return the predicted label index for each example in ``examples``.

        Args:
            examples: List of example dicts with a 'vector' key.

        Returns:
            Array with the index of the largest logit for every example.
        """
        vectors = np.float32(np.vstack([example['vector'] for example in examples]))
        # Rate 0.0: no units are dropped at prediction time.
        logits = self.model(vectors, 0.0)
        return np.argmax(logits, axis=1)

Now let's train it.

In [10]:
# Build the model for `dim` input features and train it; train() prints the
# cost and the dev/train accuracies while it runs.
classifier = logistic_regression_classifier(dim)
classifier.train(training_set, dev_set)

Training.
Epoch: 1 Cost: 0.692445576 Dev acc: 0.562 Train acc: 0.584
Epoch: 2 Cost: 0.686476648 Dev acc: 0.602 Train acc: 0.586
Epoch: 3 Cost: 0.679325342 Dev acc: 0.636 Train acc: 0.596
Epoch: 4 Cost: 0.670828223 Dev acc: 0.63 Train acc: 0.618
Epoch: 5 Cost: 0.667153478 Dev acc: 0.642 Train acc: 0.624
Epoch: 6 Cost: 0.657815397 Dev acc: 0.66 Train acc: 0.636
Epoch: 7 Cost: 0.654222 Dev acc: 0.668 Train acc: 0.698
Epoch: 8 Cost: 0.642935634 Dev acc: 0.692 Train acc: 0.666
Epoch: 9 Cost: 0.633350194 Dev acc: 0.682 Train acc: 0.676
Epoch: 10 Cost: 0.62144649 Dev acc: 0.692 Train acc: 0.676
Epoch: 11 Cost: 0.608016551 Dev acc: 0.694 Train acc: 0.732
Epoch: 12 Cost: 0.596590638 Dev acc: 0.716 Train acc: 0.724
Epoch: 13 Cost: 0.586231411 Dev acc: 0.716 Train acc: 0.75
Epoch: 14 Cost: 0.571956515 Dev acc: 0.71 Train acc: 0.744
Epoch: 15 Cost: 0.557533503 Dev acc: 0.688 Train acc: 0.698
Epoch: 16 Cost: 0.543159187 Dev acc: 0.724 Train acc: 0.742
Epoch: 17 Cost: 0.532618 Dev acc: 0.714 Train a

And evaluate it.

In [11]:
# Accuracy of the trained classifier on the full test set (fraction correct).
evaluate_classifier(classifier.classify, test_set)

0.7138934651290499

# Attribution:
Adapted by Oier Lopez de Lacalle, Olatz Perez de Viñaspre and Ander Barrena, based on a notebook by Sam Bowman at NYU