# Gender Classifier

### Load Data

In [8]:
%store -r cnn_model

dataset = cnn_model['dataset']

abstracts_padded = cnn_model['abstracts_padded']
labels, ys = cnn_model['labels'], cnn_model['ys']
num_classes = cnn_model['num_classes']

embeddings = cnn_model['embeddings']
word_dim = cnn_model['word_dim']
word2idx, idx2word = cnn_model['word2idx'], cnn_model['idx2word']
maxlen = cnn_model['maxlen']
vocab_size = cnn_model['vocab_size']
num_train = cnn_model['num_train']

### Train - Validation Split

In [9]:
from sklearn.cross_validation import KFold

fold = KFold(len(abstracts_padded), n_folds=5)
p = iter(fold)

train_idxs, val_idxs = next(p)

X_train, ys_train = abstracts_padded[train_idxs], ys[train_idxs]
X_val, ys_val = abstracts_padded[val_idxs], ys[val_idxs]

num_train, num_val = len(X_train), len(X_val)

### Hyperparameters

In [10]:
nb_filter = 5
filter_length = 2
hidden_dims = 32
nb_epoch = 35

### Define Model

In [11]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

print('Build model...')
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=word_dim, weights=[embeddings], input_length=maxlen))
model.add(Dropout(0.25))

model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        activation='relu'))
model.add(MaxPooling1D(pool_length=2))

model.add(Flatten())
model.add(Dense(hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))

model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

Using Theano backend.


Build model...


In [28]:
def labelidx_generator(ys):
    """Generate a list of lists, where each list contains all the indices corresponding to a class"""

    for label in labels:
        idxs = np.argwhere(ys == label).flatten()
        yield idxs

def batch_generator(ys, batch_size):
    """Yield successive batches for training
    
    This generator is not meant to be exhausted, but rather called by next()
    
    """
    assert not batch_size % num_classes
    
    labels_idxs = list(labelidx_generator(ys))
    
    while True:
        idxs_lists = [np.random.choice(label_idxs, size=batch_size/num_classes) for label_idxs in labels_idxs]
        idxs = [idx for idxs_list in idxs_lists for idx in idxs_list]
        
        yield idxs

In [37]:
batch_size = 15

example = batch_generator(ys_train, batch_size)

for i in range(50):
    batch = next(example)
    
    X = X_train[batch]
    ys = np.zeros([batch_size, num_classes])
    ys[np.arange(batch_size), ys_train[batch]] = 1

    print model.train_on_batch(X, ys)
    
    if not i % 10:
        predictions = model.predict(X_val)

        ysval_block = np.zeros([num_val, num_classes])
        ysval_block[np.arange(num_val), ys_val] = 1

        print 'Validation accuracy', np.mean(predictions.argmax(axis=1) == ys_val)

[array(0.15675826370716095, dtype=float32)]
Validation accuracy 0.797136038186
[array(0.28385475277900696, dtype=float32)]
[array(0.13814355432987213, dtype=float32)]
[array(0.20715422928333282, dtype=float32)]
[array(0.28599366545677185, dtype=float32)]
[array(0.3846203684806824, dtype=float32)]
[array(0.17764458060264587, dtype=float32)]
[array(0.3291880786418915, dtype=float32)]
[array(0.12966813147068024, dtype=float32)]
[array(0.17874804139137268, dtype=float32)]
[array(0.17367976903915405, dtype=float32)]
Validation accuracy 0.751789976134
[array(0.21907272934913635, dtype=float32)]
[array(0.24582543969154358, dtype=float32)]
[array(0.2880464792251587, dtype=float32)]
[array(0.36144405603408813, dtype=float32)]
[array(0.17732064425945282, dtype=float32)]
[array(0.24093575775623322, dtype=float32)]
[array(0.34361809492111206, dtype=float32)]
[array(0.3272550404071808, dtype=float32)]
[array(0.21755382418632507, dtype=float32)]
[array(0.11952842772006989, dtype=float32)]
Validation

### Examine Bigrams Which Filters Fire on

In [38]:
filters = model.layers[2].W.eval()
filters = np.squeeze(filters)
filters = [filter.T for filter in filters]

abstract = abstracts_padded[0]

def activation_generator(filter):
    for w1, w2 in zip(abstract, abstract[1:]):
        yield np.sum(embeddings[[w1, w2]] * filter), (w1, w2)
        
def activations_generator(filters):
    for filter in filters:
        yield list(activation_generator(filter))
        
activations = list(activations_generator(filters))

for activation in activations:
    for score, (w1, w2) in sorted(activation, reverse=True)[:10]:
        print score, idx2word[w1], idx2word[w2]
        
    print

0.50322654533 may be
0.474932492351 to treatment
0.356489096645 6-month trial
0.356489096645 6-month trial
0.342796811713 in this
0.338289636034 to conventional
0.330906883058 in trials
0.326156170715 Twenty-six subjects
0.324985800699 group design
0.315732342744 this trial

0.764217942926 weeks ,
0.762268309115 double-blind ,
0.741424259313 antipsychotics ,
0.697436696345 trial ,
0.689886737123 symptoms ,
0.680677173386 D-Cycloserine ,
0.659943370502 agonists ,
0.593603398568 rate ,
0.573663654616 measures ,
0.558188986289 site ,

0.710394467062 . Twenty-six
0.693357548702 . Fifty-five
0.609068424439 . D-Cycloserine
0.609068424439 . D-Cycloserine
0.608119034006 . The
0.607546749587 for 6
0.605416647719 . To
0.575126949295 . Because
0.549679709433 for efficacy
0.516910220145 or 24

0.499560751384 serum concentrations
0.32207013346 D-cycloserine concentrations
0.241639645219 . Serum
0.230938948244 50 mg/day
0.230938948244 50 mg/day
0.228223810593 Serum D-cycloserine
0.226470604321 were 