# Assignment1: Logistic Regression, SGD, and Regularization

First, let's load the data.

In [1]:
sst_home = '../trees'

import re
import random

# Let's do 2-way positive/negative classification instead of 5-way
easy_label_map = {0:0, 1:0, 2:None, 3:1, 4:1}

def load_sst_data(path):
    data = []
    with open(path) as f:
        for i, line in enumerate(f): 
            example = {}
            example['label'] = easy_label_map[int(line[1])]
            if example['label'] is None:
                continue
            
            # Strip out the parse information and the phrase labels---we don't need those here
            text = re.sub(r'\s*(\(\d)|(\))\s*', '', line)
            example['text'] = text[1:]
            data.append(example)

    random.seed(1)
    random.shuffle(data)
    return data
     
training_set = load_sst_data(sst_home + '/train.txt')
dev_set = load_sst_data(sst_home + '/dev.txt')
test_set = load_sst_data(sst_home + '/test.txt')

#print(training_set[0])
#print(dev_set)
#print(test_set)

# Note: Unlike with feature based classifiers, evaluation here should be fast, 
# and we don't need to trim down the dev and test sets. 

And extract bag-of-words feature vectors.

In [2]:
import collections
import numpy as np

def feature_function(datasets):
    '''Annotates datasets with feature vectors.'''
    
    # Extract vocabulary
    def tokenize(string):
        return string.split()
    
    word_counter = collections.Counter()
    for example in datasets[0]:
        word_counter.update(tokenize(example['text']))
    
    vocabulary = set([word for word in word_counter])
                                
    feature_names = set()
    for i, dataset in enumerate(datasets):
        for example in dataset:
            example['features'] = collections.defaultdict(float)
            
            # Extract features (by name) for one example
            word_counter = collections.Counter(tokenize(example['text']))
            for x in word_counter.items():
                if x[0] in vocabulary:
                    example["features"]["word_count_for_" + x[0]] = x[1]
            
            feature_names.update(example['features'].keys())
                            
    # By now, we know what all the features will be, so we can
    # assign indices to them.
    feature_indices = dict(zip(feature_names, range(len(feature_names))))
    indices_to_features = {v: k for k, v in feature_indices.items()}
    dim = len(feature_indices)
                
    # Now we create actual vectors from those indices.
    for dataset in datasets:
        for example in dataset:
            example['vector'] = np.zeros((dim))
            for feature in example['features']:
                example['vector'][feature_indices[feature]] = example['features'][feature]
    return indices_to_features, dim
    
indices_to_features, dim = feature_function([training_set, dev_set, test_set])



And define an evalution function. This is a bit different, since it's designed to let us test an entire big batch of examples at once with the classifier, rather than passing them in one by one. (For larger models or larger training sets, this could run out of memory, but it should be fine for now.)

In [3]:
def evaluate_classifier(classifier, eval_set):
    correct = 0
    hypotheses = classifier(eval_set)
    for i, example in enumerate(eval_set):
        hypothesis = hypotheses[i]
        if hypothesis == example['label']:
            correct += 1        
    return correct / float(len(eval_set))

Now for the fun part! The below should be a working implementation of logistic regression in TensorFlow.

In [4]:
import tensorflow as tf

In [10]:
class logistic_regression_classifier:
    def __init__(self, dim, lr=1.0, epchs=50, l2b=0.01):
        # Define the hyperparameters
        self.learning_rate = lr  # Maybe? Let's tune this --> BEST learning rate: 0.9
        self.training_epochs = epchs  # How long to train for - chosen to fit within class time
        self.display_epoch_freq = 1  # How often to test and print out statistics
        self.l2_weight=l2b # l2 regularization weight beta --> Best 0.001
        self.dim = dim  # The number of features
        self.batch_size = 256  # Somewhat arbitrary - can be tuned, but often tune for speed, not accuracy
        
        # Define the inputs
        self.x = tf.placeholder(tf.float32, [None, dim])
        self.y = tf.placeholder(tf.int32, [None])
        
        # Define (most of) the model
        self.W = tf.Variable(tf.zeros([self.dim, 2]))
        self.b = tf.Variable(tf.zeros([2]))
        self.logits = tf.matmul(self.x, self.W) + self.b

        # Define the cost function (here, the exp and sum are built in)
        # Original no regularization
        ##self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y))
        # Loss function with L2 Regularization with beta=0.01
        regularizer = tf.nn.l2_loss(self.W)
        # my initial solution:
        ## self.cost = tf.reduce_mean(self.cost+l2b*regularizer) #0.7694954128440367 - 0.7752293577981652
        # second solution: 
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)
                                   +self.l2_weight*regularizer) #0.7706422018348624 - 0.7763761467889908
        
        #self.cost = self.cost+l2b*regularizer #0.7568807339449541
        
        # This library call performs the main SGD update equation
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.cost)
        
        # Create an operation to fill zero values in for W and b
        self.init = tf.global_variables_initializer()
        
        # Create a placeholder for the session that will be shared between training and evaluation
        self.sess = None
        
    def train(self, training_data, dev_set):
        def get_minibatch(dataset, start_index, end_index):
            indices = range(start_index, end_index)
            vectors = np.vstack([dataset[i]['vector'] for i in indices])
            labels = [dataset[i]['label'] for i in indices]
            return vectors, labels
        
        self.sess = tf.Session()
        
        self.sess.run(self.init)
        print('Training.')
        scores={} 
        scores['train']=[]
        scores['dev']=[]
        scores['cost']=[]
        # Training cycle
        for epoch in range(self.training_epochs):
            random.shuffle(training_set)
            avg_cost = 0.
            total_batch = int(len(training_set) / self.batch_size)
            
            # Loop over all batches in epoch
            for i in range(total_batch):
                # Assemble a minibatch of the next B examples
                minibatch_vectors, minibatch_labels = get_minibatch(training_set, 
                                                                    self.batch_size * i, 
                                                                    self.batch_size * (i + 1))

                # Run the optimizer to take a gradient step, and also fetch the value of the 
                # cost function for logging
                _, c = self.sess.run([self.optimizer, self.cost], 
                                     feed_dict={self.x: minibatch_vectors,
                                                self.y: minibatch_labels})
                                                                    
                # Compute average loss
                avg_cost += c / (total_batch * self.batch_size)
                
            
            # Display some statistics about the step
            if (epoch+1) % self.display_epoch_freq == 0:
                scores['cost'].append(avg_cost)
                scores['train'].append(evaluate_classifier(self.classify, training_set[0:500]))
                scores['dev'].append(evaluate_classifier(self.classify, dev_set[0:500]))
                print("Epoch:", (epoch+1), "Cost:", avg_cost,
                      "Dev acc:", evaluate_classifier(self.classify, dev_set[0:500]), 
                      "Train acc:", evaluate_classifier(self.classify, training_set[0:500]))
            
        return scores
    
    def classify(self, examples):
        # This classifies a list of examples
        vectors = np.vstack([example['vector'] for example in examples])
        logits = self.sess.run(self.logits, feed_dict={self.x: vectors})
        return np.argmax(logits, axis=1)

Now let's train it.

In [6]:
import matplotlib.pyplot as plt

def plot_learning_curve(title, data, ylim=None, xlabel="epochs", ylabel="Accuracy"):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid()
    
        
    train_scores=data['train']
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    
    train_sizes=len(train_scores)
    
    test_scores=data['cost']
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    dev_scores=data['dev']
    dev_scores_mean = np.mean(dev_scores, axis=1)
    dev_scores_std = np.std(dev_scores, axis=1)
    
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, dev_scores_mean - dev_scores_std,
                     dev_scores_mean + dev_scores_std, alpha=0.1, color="b")
    
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, dev_scores_mean, 'o-', color="b",
             label="Developement score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cost")

    plt.legend(loc="best")
    return plt



Bad key "patch.force_edgecolor" on line 33 in
/usr/local/lib/python3.4/dist-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution

Bad key "lines.dotted_pattern" on line 20 in
/usr/local/lib/python3.4/dist-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution

Bad key "xtick.major.top" on line 253 in
/usr/local/lib/python3.4/dist-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution

Bad key "date.autoformatter.month" on line 231 in
/usr/local

In [15]:
#with tf.device('/cpu:0'):
classifier = logistic_regression_classifier(dim, 0.9,50,0.001)
scores = classifier.train(training_set, dev_set)
#    plot_learning_curve("testing plot",scores)

Training.
Epoch: 1 Cost: 0.00331546048013 Dev acc: 0.55 Train acc: 0.546
Epoch: 2 Cost: 0.00263478529329 Dev acc: 0.514 Train acc: 0.496
Epoch: 3 Cost: 0.00249056254203 Dev acc: 0.744 Train acc: 0.726
Epoch: 4 Cost: 0.00224212899425 Dev acc: 0.662 Train acc: 0.682
Epoch: 5 Cost: 0.00228739895478 Dev acc: 0.696 Train acc: 0.75
Epoch: 6 Cost: 0.00219461704708 Dev acc: 0.728 Train acc: 0.758
Epoch: 7 Cost: 0.00224260316248 Dev acc: 0.754 Train acc: 0.808
Epoch: 8 Cost: 0.00215469083438 Dev acc: 0.7 Train acc: 0.73
Epoch: 9 Cost: 0.00199595871554 Dev acc: 0.766 Train acc: 0.83
Epoch: 10 Cost: 0.00202861852961 Dev acc: 0.684 Train acc: 0.71
Epoch: 11 Cost: 0.00198929677545 Dev acc: 0.76 Train acc: 0.802
Epoch: 12 Cost: 0.00194010694718 Dev acc: 0.762 Train acc: 0.816
Epoch: 13 Cost: 0.00202407771118 Dev acc: 0.78 Train acc: 0.844
Epoch: 14 Cost: 0.00194643082149 Dev acc: 0.776 Train acc: 0.866
Epoch: 15 Cost: 0.00191508912637 Dev acc: 0.734 Train acc: 0.816
Epoch: 16 Cost: 0.00197387474192 

And evaluate it.

In [16]:
evaluate_classifier(classifier.classify, dev_set)

0.7672018348623854

## Assignments

### Our goals
  1. **Pick an effective learning rate**:
      - You could set up the learning rate value by passing it as argument (e.g. in `__init__ (self, dim, lr=1.0, ...)` )
      - Try small and larger values to see the behavior of the model.
  
  2. **Implement L2 regularization:**
      - Hint: Add regularization term to overal cost (`self.cost`)
      - Tensorflow already built in method for this. Check the API to find out. 
      - (Optionaly) Code it without using the built in tool for it

  3. **Pick an effective L2 weight:**
      - You could set up the learning rate value by passing it as argument (e.g. in `__init__ (self, dim, lw=1.0, ...)` )
      - Try small and larger values to see the behavior of the model.
  
  4. **Look at some learning curves:**
      - This code might be helpful: http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
