# Multitask Learning

Use a single shared representation to predict both gender and phase 2.

### Load Data

In [4]:
%store -r embeddings_info

abstracts = embeddings_info['abstracts']
abstracts_padded = embeddings_info['abstracts_padded']
embeddings = embeddings_info['embeddings']
word_dim = embeddings_info['word_dim']
word2idx, idx2word = embeddings_info['word2idx'], embeddings_info['idx2word']
maxlen = embeddings_info['maxlen']
vocab_size = embeddings_info['vocab_size']

In [5]:
%store -r pruned_dataset binarized_dataset

ys = np.array(binarized_dataset).T

### Train - Validation Split

In [6]:
# No hold-out in this cell: the training and validation names alias the same
# full dataset, so "validation" metrics computed from them are training metrics.
X_train = X_val = abstracts_padded
ys_train = ys_val = ys

In [None]:
from sklearn.cross_validation import KFold

# Shuffled 5-fold partition over example indices; only the first fold is consumed here.
fold = KFold(len(abstracts_padded), n_folds=5, shuffle=True)
p = iter(fold)

train_idxs, val_idxs = next(p)

# ys is laid out as (num_objectives, num_examples), so examples must be selected
# along axis 1. Indexing axis 0 with example indices would select tasks instead.
X_train, ys_train = abstracts_padded[train_idxs], ys[:, train_idxs]
X_val, ys_val = abstracts_padded[val_idxs], ys[:, val_idxs]

num_train, num_val = len(X_train), len(X_val)

### Hyperparameters

In [7]:
# Convolution: number of filters and the window width in tokens.
nb_filter, filter_length = 20, 2
# Width of the shared dense layer that feeds both task heads.
hidden_dims = 32
# nb_epoch is not referenced in the visible cells; batch_size is reassigned
# in the minibatch-training cell before it is used.
nb_epoch, batch_size = 35, 10

### Define Model

In [8]:
from keras.models import Graph
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

# Two-headed network: embedding -> conv -> max-over-time pooling -> shared dense
# layer, followed by one softmax head per task ('gender' and 'phase_2').
model = Graph()
# dtype='int' is required here; presumably because the input holds token ids
# that the Embedding layer uses as lookup indices.
model.add_input(name='input', input_shape=[maxlen], dtype='int')
# Embedding weights are initialised from `embeddings` and marked non-trainable.
model.add_node(Embedding(input_dim=vocab_size, output_dim=word_dim, weights=[embeddings],
                         input_length=maxlen, trainable=False),
               name='embedding', input='input')
model.add_node(Dropout(0.25), name='dropout1', input='embedding')

model.add_node(Convolution1D(nb_filter=nb_filter,
                             filter_length=filter_length,
                             activation='relu'),
               name='conv',
               input='dropout1')
# Pool over every convolution position so each filter contributes a single
# maximum (max-over-time pooling). A 'valid' convolution over maxlen steps
# produces maxlen - filter_length + 1 positions; deriving the window from
# filter_length keeps this correct if the hyperparameter changes
# (it equals the previous maxlen - 1 when filter_length == 2).
model.add_node(MaxPooling1D(pool_length=maxlen - filter_length + 1), name='pool', input='conv')
model.add_node(Flatten(), name='flat', input='pool')
model.add_node(Dense(hidden_dims), name='z', input='flat')
# 'shared' is the representation both task heads read from (via dropout2).
model.add_node(Activation('relu'), name='shared', input='z')
model.add_node(Dropout(0.25), name='dropout2', input='shared')

# Task head 1: two-way softmax for gender.
model.add_node(Dense(output_dim=2, activation='softmax'), name='gender_probs', input='dropout2')
model.add_output(name='gender', input='gender_probs')

# Task head 2: two-way softmax for phase 2.
model.add_node(Dense(output_dim=2, activation='softmax'), name='phase_2_probs', input='dropout2')
model.add_output(name='phase_2', input='phase_2_probs')

# One categorical cross-entropy loss per named output.
model.compile(optimizer='rmsprop',
              loss={'gender': 'categorical_crossentropy',
                    'phase_2': 'categorical_crossentropy'})

Using Theano backend.


### Random Balanced Minibatch Generator

In [9]:
def batch_generator(ys, batch_size, balanced=True):
    """Endlessly yield arrays of random example indices for training.

    The generator never terminates; advance it with next() instead of
    iterating it to exhaustion.

    Each yielded array holds `batch_size` indices drawn with
    np.random.choice from range(num_train), where num_train is the second
    dimension of the 2-D array `ys`.

    NOTE(review): `balanced` is accepted but never consulted, so no
    per-class balancing is performed despite the parameter name.

    """
    _, pool_size = ys.shape

    while 1:
        indices = np.random.choice(pool_size, size=batch_size)
        yield indices

### Minibatch Training

In [11]:
num_train = len(X_train)

# Each update trains on a random sample half the size of the training set;
# this overrides the batch_size chosen in the hyperparameters cell.
batch_size = num_train // 2

# Endless source of random index batches; advanced with next() in the training loop.
example = batch_generator(ys_train, batch_size)

# Task names, in the same order as the rows of ys.
labels = ['gender', 'phase_2']

# Map each task name to its row of validation targets.
val_dict = dict(zip(labels, ys_val))

In [19]:
def produce_labels(labels, ys, batch_idxs, class_sizes):
    """Yield (task_name, one_hot_targets) pairs for one minibatch.

    `labels`, `class_sizes` and the rows of `ys` are walked in parallel, one
    entry per task. For each task the class ids at `batch_idxs` are turned
    into a (len(batch_idxs), num_classes) array of zeros with a single 1 per
    row, placed in the column given by that example's class id.
    """
    n_rows = len(batch_idxs)
    row_positions = np.arange(n_rows)

    for task_name, n_classes, task_targets in zip(labels, class_sizes, ys):
        one_hot = np.zeros([n_rows, n_classes])
        # One fancy-index assignment sets exactly one entry per row.
        one_hot[row_positions, task_targets[batch_idxs]] = 1

        yield (task_name, one_hot)

for i in range(100):
    batch_idxs = next(example)
    
    X = X_train[batch_idxs]
    train_dict = dict(produce_labels(labels, ys, batch_idxs, class_sizes=[2, 2]))
    train_dict.update({'input': X})

    train_error = model.train_on_batch(train_dict)

    if not i % 10:
        print train_error
        
        predictions = model.predict({'input': X_val})
        for label in labels:
            ys_pred = predictions[label]
            ys_pred = ys_pred.argmax(axis=1)
            
            print '{} accuracy:'.format(label), np.mean(ys_pred == val_dict[label])

[array(0.48505547642707825, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 0.9
[array(0.31257328391075134, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 0.9
[array(0.3063952624797821, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0
[array(0.37981748580932617, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0
[array(0.22755445539951324, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0
[array(0.26175355911254883, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0
[array(0.422007292509079, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0
[array(0.2125508189201355, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0
[array(0.07638401538133621, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0
[array(0.08776175230741501, dtype=float32)]
gender accuracy: 1.0
phase_2 accuracy: 1.0


### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

from support import plot_confusion_matrix

# The Graph model takes a dict of named inputs and returns a dict of named
# outputs (one block of class probabilities per task), as in the training loop.
predictions = model.predict({'input': X_val})

# One confusion matrix per task: true class ids against the arg-max prediction.
for label in labels:
    cm = confusion_matrix(val_dict[label], predictions[label].argmax(axis=1))

    # NOTE(review): `label_map` is not defined in any visible cell; confirm it
    # exists (and suits every task) before running this cell.
    plot_confusion_matrix(cm, label_map)

### Examine the Bigrams Each Filter Fires On

In [None]:
# Display the first entry of dataset.abstract.
# NOTE(review): `dataset` is not defined in any visible cell — confirm it is
# loaded earlier (or restored via %store) before relying on this lookup.
dataset.abstract.iloc[0]

In [None]:
# Evaluate the weight tensor of the layer at position 2 and drop its singleton axes.
# NOTE(review): assumes model.layers[2] is the 'conv' node — confirm.
filters = np.squeeze(model.layers[2].W.eval())
# Transpose each filter, presumably so it lines up element-wise with the stacked
# pair of embedding rows it is multiplied with below — TODO confirm the shapes.
# The loop variable is deliberately not called `filter`: in Python 2 a
# list-comprehension variable leaks into the enclosing scope and would replace
# the builtin of that name for the rest of the notebook.
filters = [kernel.T for kernel in filters]

def activation_generator(filter, abstract):
    """Yield (score, (w1, w2)) for every adjacent pair of entries in `abstract`.

    The score is the sum of the element-wise product between `filter` and the
    two rows of the global `embeddings` matrix selected by w1 and w2.
    """
    for left, right in zip(abstract[:-1], abstract[1:]):
        bigram_vectors = embeddings[[left, right]]
        score = np.sum(bigram_vectors * filter)
        yield score, (left, right)
        
def activations_generator(filters, abstract):
    """Yield, for each filter in turn, the list of its bigram activations on `abstract`."""
    for kernel in filters:
        scored_bigrams = list(activation_generator(kernel, abstract))
        yield scored_bigrams
        
def show_activations(filters, abstract):        
    activations = list(activations_generator(filters, abstract))

    for activation in activations:
        for score, (w1, w2) in sorted(activation, reverse=True)[:10]:
            print score, idx2word[w1], idx2word[w2]

        print
        
for gender, idx in zip(['Both', 'Female', 'Male'], [0, 50, 100]):
    print '*'*10, gender, '*'*10
    show_activations(filters, abstracts_padded[idx])