# Gender Classifier

### Load Data

In [116]:
%store -r cnn_model

dataset = cnn_model['dataset'].reset_index(drop=True)

abstracts_padded = cnn_model['abstracts_padded']
labels, ys = cnn_model['labels'], cnn_model['ys'].reset_index(drop=True)
num_classes = cnn_model['num_classes']

embeddings = cnn_model['embeddings']
word_dim = cnn_model['word_dim']
word2idx, idx2word = cnn_model['word2idx'], cnn_model['idx2word']
maxlen = cnn_model['maxlen']
vocab_size = cnn_model['vocab_size']
num_train = cnn_model['num_train']

### Hyperparameters

In [117]:
nb_filter = 5
filter_length = 2
hidden_dims = 32
nb_epoch = 35

### Define Model

In [79]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

print('Build model...')
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=word_dim, weights=[embeddings], input_length=maxlen))
model.add(Dropout(0.25))

model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        activation='relu'))
model.add(MaxPooling1D(pool_length=2))

model.add(Flatten())
model.add(Dense(hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))

model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

Using Theano backend.


Build model...


In [119]:
def labelidx_generator():
    """Generate a list where each element is all the indexes corresponding to a class"""

    for label in labels:
        yield ys[ys == label].index.tolist()

def batch_generator(batch_size):
    """Yield successive batches for training
    
    This generator is not meant to be exhausted, but rather called by next()
    
    """
    assert not batch_size % num_classes
    
    labels_idxs = list(labelidx_generator())
    
    while True:
        idxs_lists = [np.random.choice(label_idxs, size=batch_size/num_classes) for label_idxs in labels_idxs]
        idxs = [idx for idxs_list in idxs_lists for idx in idxs_list]
        
        yield idxs

In [None]:
batch_size = 15

example = batch_generator(batch_size=15)

In [146]:
for i in range(50):
    batch = next(example)

    abstracts_batch = abstracts_padded[batch]

    batch_size = 15

    ys_batch = np.zeros([batch_size, num_classes])
    ys_batch[np.arange(batch_size), ys[batch]] = 1

    print model.train_on_batch(abstracts_batch, ys_batch)

[array(0.4703173339366913, dtype=float32)]
[array(0.30346372723579407, dtype=float32)]
[array(0.3935213088989258, dtype=float32)]
[array(0.26881158351898193, dtype=float32)]
[array(0.4369662404060364, dtype=float32)]
[array(0.2765965163707733, dtype=float32)]
[array(0.4600931406021118, dtype=float32)]
[array(0.46159836649894714, dtype=float32)]
[array(0.37763652205467224, dtype=float32)]
[array(0.18277865648269653, dtype=float32)]
[array(0.203164741396904, dtype=float32)]
[array(0.16951055824756622, dtype=float32)]
[array(0.5674611330032349, dtype=float32)]
[array(0.3887283504009247, dtype=float32)]
[array(0.34243258833885193, dtype=float32)]
[array(0.4788208305835724, dtype=float32)]
[array(0.2791329026222229, dtype=float32)]
[array(0.27685075998306274, dtype=float32)]
[array(0.3246164619922638, dtype=float32)]
[array(0.4835849702358246, dtype=float32)]
[array(0.4905903935432434, dtype=float32)]
[array(0.31166955828666687, dtype=float32)]
[array(0.24679981172084808, dtype=float32)]
[a

In [150]:
predictions = model.predict(abstracts_padded)

np.mean(predictions.argmax(axis=1) == ys)

0.92546583850931674

### Examine Bigrams Which Filters Fire on

In [149]:
filters = model.layers[2].W.eval()
filters = np.squeeze(filters)
filters = [filter.T for filter in filters]

abstract = abstracts_padded[0]

def activation_generator(filter):
    for w1, w2 in zip(abstract, abstract[1:]):
        yield np.sum(embeddings[[w1, w2]] * filter), (w1, w2)
        
def activations_generator(filters):
    for filter in filters:
        yield list(activation_generator(filter))
        
activations = list(activations_generator(filters))

for activation in activations:
    for score, (w1, w2) in sorted(activation, reverse=True)[:10]:
        print score, idx2word[w1], idx2word[w2]
        
    print

0.439572675754 trials of
0.351481750361 range of
0.342615633999 symptoms of
0.318224560524 magnitude of
0.318224560524 magnitude of
0.317734985361 Serum D-cycloserine
0.30851242987 impairment of
0.297891491023 trial .
0.290294148023 dose of
0.279616767301 D-serine .

0.786158382607 of the
0.758244342999 assess the
0.738976918683 for the
0.738976918683 for the
0.715640993039 reflecting the
0.690424686484 completed the
0.68217528849 at the
0.664111372325 by the
0.637346373683 , the
0.555191480812 . The

0.557287294566 . Fifty-five
0.557004542489 . Twenty-six
0.402393250002 . The
0.365552048554 groups .
0.342196002817 patients with
0.338112218278 8 or
0.335882991987 efficacy for
0.335882991987 efficacy for
0.332375408683 placebo for
0.328273904131 effect for

0.545715160704 trial .
0.470450553283 design .
0.464260040643 schizophrenia .
0.429435329372 trials of
0.420994493496 in this
0.420819714946 in a
0.420819714946 in a
0.412282548534 antipsychotics in
0.412282548534 antipsychotics in
0