# Gender Classifier

### Load Data

In [1]:
%store -r cnn_model

dataset = cnn_model['dataset']

abstracts_padded = cnn_model['abstracts_padded']
label_map, ys = cnn_model['label_map'], cnn_model['ys']
labels = [i for gender, i in label_map.items()]
num_classes = cnn_model['num_classes']

embeddings = cnn_model['embeddings']
word_dim = cnn_model['word_dim']
word2idx, idx2word = cnn_model['word2idx'], cnn_model['idx2word']
maxlen = cnn_model['maxlen']
vocab_size = cnn_model['vocab_size']
num_train = cnn_model['num_train']

### Train - Validation Split

In [2]:
# Use every padded abstract and its label as the training set
# (the K-fold cell below narrows this to a single fold).
X_train = abstracts_padded
ys_train = ys

num_train = len(X_train)

In [3]:
import numpy as np
from sklearn.cross_validation import KFold

# Build a 5-fold partition over the sample positions and keep only the
# first (train, validation) index pair it produces.
fold = KFold(len(abstracts_padded), n_folds=5)
p = iter(fold)
train_idxs, val_idxs = next(p)

# Training portion
X_train = abstracts_padded[train_idxs]
ys_train = ys[train_idxs]

# Validation portion
X_val = abstracts_padded[val_idxs]
ys_val = ys[val_idxs]

num_train = len(X_train)
num_val = len(X_val)

### Hyperparameters

In [3]:
# Convolution: number of filters and how many consecutive words each spans.
nb_filter, filter_length = 5, 2
# Width of the dense layer that is currently commented out in the model cell.
hidden_dims = 32
# Not referenced by the cells visible in this notebook.
nb_epoch = 35

### Define Model

In [23]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

model = Sequential()

# Embedding layer initialised from the `embeddings` matrix and frozen
# (trainable=False), so training only updates the layers after it.
model.add(Embedding(input_dim=vocab_size, output_dim=word_dim, weights=[embeddings], input_length=maxlen,
                   trainable=False))
# model.add(Dropout(0.25))

model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        activation='relu'))

# Max-over-time pooling: a single pooling window covering every convolution
# position, so each filter contributes exactly one value per abstract.
# The window is derived from `filter_length` instead of being hard-coded to
# `maxlen-1` (which is only right for filter_length == 2). This assumes the
# convolution's default 'valid' border mode, i.e. an output of
# maxlen - filter_length + 1 steps -- consistent with the (None, 1, nb_filter)
# shape checked at the end of this cell.
model.add(MaxPooling1D(pool_length=maxlen - filter_length + 1))

model.add(Flatten())
# model.add(Dense(hidden_dims))
# model.add(Dropout(0.25))
# model.add(Activation('relu'))

model.add(Dense(num_classes))
# NOTE(review): sigmoid outputs paired with categorical_crossentropy; softmax
# is the conventional pairing for mutually exclusive classes. Left unchanged
# here so the recorded results stay comparable -- confirm before changing.
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# layers[3] is the Flatten layer; its input should be (None, 1, nb_filter),
# confirming the pooling collapsed the sequence axis to a single step.
model.layers[3].input_shape

(None, 1, 5)

### Helper Functions for Generating Minibatches

In [24]:
def labelidx_generator(ys):
    """Yield one flat index array per class label.

    For every entry of the notebook-level ``labels`` (in that order), yields
    the positions in ``ys`` whose value equals that label.
    """
    for label in labels:
        matches = (ys == label)
        positions = np.argwhere(matches)
        yield positions.flatten()

def batch_generator(ys, batch_size):
    """Yield class-balanced minibatches of sample indices, indefinitely.

    Every batch contains ``batch_size // num_classes`` indices for each class,
    drawn independently with ``np.random.choice`` (default settings, i.e. with
    replacement). Indices are grouped class by class, in the order
    ``labelidx_generator`` yields the classes.

    This generator is not meant to be exhausted, but rather called by next()

    Parameters
    ----------
    ys : array of class labels, as accepted by ``labelidx_generator``.
    batch_size : int
        Total number of indices per batch; must be a multiple of the
        notebook-level ``num_classes``.

    Yields
    ------
    list
        ``batch_size`` indices into ``ys``.

    Raises
    ------
    AssertionError
        On the first ``next()`` call, if ``batch_size`` is not a multiple of
        ``num_classes``.
    """
    assert not batch_size % num_classes, 'batch_size must be a multiple of num_classes'

    labels_idxs = list(labelidx_generator(ys))

    # Floor division keeps the per-class count an integer even when `/` is
    # true division (Python 3, or `from __future__ import division`);
    # np.random.choice rejects a float `size`.
    per_class = batch_size // num_classes

    while True:
        idxs_lists = [np.random.choice(label_idxs, size=per_class) for label_idxs in labels_idxs]
        idxs = [idx for idxs_list in idxs_lists for idx in idxs_list]

        yield idxs

In [27]:
batch_size = 50

example = batch_generator(ys_train, batch_size)

for i in range(100):
    # Indices of a class-balanced minibatch drawn from the training split.
    batch = next(example)

    X = X_train[batch]

    # One-hot encode this minibatch's labels. Deliberately NOT named `ys`:
    # that name holds the full label vector loaded in the first cell, and
    # rebinding it here breaks any re-run of the train/validation split cells.
    ys_batch = np.zeros([batch_size, num_classes])
    ys_batch[np.arange(batch_size), ys_train[batch]] = 1

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(model.train_on_batch(X, ys_batch))

    if not i % 10:
        # Validation-set variant, currently disabled:
        #     predictions = model.predict(X_val)
        #     print('Validation accuracy %s' % np.mean(predictions.argmax(axis=1) == ys_val))

        # This accuracy is measured on the training split, so it is labelled
        # as such (it was previously printed as "Validation accuracy").
        predictions = model.predict(X_train)

        print('Training accuracy %s' % np.mean(predictions.argmax(axis=1) == ys_train))

[array(0.5766856670379639, dtype=float32)]
Validation accuracy 0.94
[array(0.5705587267875671, dtype=float32)]
[array(0.5815173983573914, dtype=float32)]
[array(0.5725276470184326, dtype=float32)]
[array(0.5817862749099731, dtype=float32)]
[array(0.5765678286552429, dtype=float32)]
[array(0.5860778093338013, dtype=float32)]
[array(0.5810268521308899, dtype=float32)]
[array(0.5705460906028748, dtype=float32)]
[array(0.559622585773468, dtype=float32)]
[array(0.573646605014801, dtype=float32)]
Validation accuracy 0.95
[array(0.5710346102714539, dtype=float32)]
[array(0.568739652633667, dtype=float32)]
[array(0.5614373683929443, dtype=float32)]
[array(0.550750732421875, dtype=float32)]
[array(0.5688070058822632, dtype=float32)]
[array(0.5472411513328552, dtype=float32)]
[array(0.548345685005188, dtype=float32)]
[array(0.5563122034072876, dtype=float32)]
[array(0.5607635378837585, dtype=float32)]
[array(0.570620596408844, dtype=float32)]
Validation accuracy 0.96
[array(0.5596684217453003, d

### Examine the Bigrams Each Filter Fires On

In [28]:
# Display the raw abstract at position 3 -- the same position the next cell
# reads from `abstracts_padded`.
# NOTE(review): assumes `dataset` rows and `abstracts_padded` are aligned -- confirm.
dataset.abstract.iloc[3]

'High-grade prostatic intraepithelial neoplasia (HGPIN) is generally regarded as a premalignant lesion that progresses toward prostate cancer. In light of the significant sequelae of prostate cancer treatment, prevention is desirable, and men with HGPIN would be suitable, high-risk subjects. There is in vitro, in vivo, epidemiologic, and human experimental evidence that selenium supplementation may protect against prostate cancer. This article introduces the rationale for, and progress to date, of a double-blind, randomized, placebo-controlled trial of selenium supplementation (200 mug/d in the form of selenomethionine), to prevent the development of prostate cancer among men with HGPIN. The trial, Southwest Oncology Group Protocol 9917, funded by a National Cancer Institute program supporting pivotal prevention trials has registered 537 patients and has randomized >380 to date. Subject accrual is expected to be completed by the fall of 2006, with trial completion in 2009.'

In [29]:
# Pull the convolution weights out of the Convolution1D layer (layers[1]),
# drop singleton axes, and transpose each filter so it can be multiplied
# elementwise with the stacked embeddings of a word pair.
# The comprehension variable is intentionally not called `filter`: under
# Python 2 a module-level list comprehension leaks its loop variable, which
# would rebind the builtin `filter` for the rest of the notebook.
filters = np.squeeze(model.layers[1].W.eval())
filters = [kernel.T for kernel in filters]

abstract = abstracts_padded[3]

def activation_generator(filter):
    """Score every adjacent word pair of the notebook-level ``abstract`` against one filter.

    Yields ``(score, (w1, w2))`` where ``w1, w2`` are consecutive word indices
    and ``score`` is the sum of the elementwise product between their stacked
    embedding rows and ``filter``. Only the filter weights are applied here;
    the layer's relu (and any bias) is not.
    """
    for w1, w2 in zip(abstract, abstract[1:]):
        yield np.sum(embeddings[[w1, w2]] * filter), (w1, w2)

def activations_generator(filters):
    """Yield, for each filter in ``filters``, the list produced by ``activation_generator``."""
    for filter in filters:
        yield list(activation_generator(filter))

activations = list(activations_generator(filters))

# For each filter, print the 10 highest-scoring word pairs (score first),
# followed by a blank line. Single-argument print(...) keeps the output
# identical under Python 2 while remaining valid Python 3.
for activation in activations:
    for score, (w1, w2) in sorted(activation, reverse=True)[:10]:
        print('%s %s %s' % (score, idx2word[w1], idx2word[w2]))

    print('')

0.191696263346 a premalignant
0.186450032254 ) is
0.175216290962 <MASK> High-grade
0.147563402171 generally regarded
0.13583212792 , with
0.131544865071 accrual is
0.127488682589 Cancer Institute
0.125468068412 as a
0.121252249208 to be
0.120584162286 a National

1.13796707745 537 patients
0.517942211254 high-risk subjects
0.491557046927 among men
0.397863521459 significant sequelae
0.379810744895 and men
0.319021065859 > 380
0.316419756531 registered 537
0.274620137653 intraepithelial neoplasia
0.262729626166 Group Protocol
0.22434247734 200 mug/d

0.315147620768 registered 537
0.300667027119 2006 ,
0.260423278511 is generally
0.244999932357 high-risk subjects
0.234140123521 9917 ,
0.22159237578 prostatic intraepithelial
0.209101468584 cancer among
0.208085673979 evidence that
0.206715391268 significant sequelae
0.200033676969 is expected

0.296779350679 mug/d in
0.274641513498 trials has
0.267795878017 subjects .
0.237284318822 Protocol 9917
0.232917135325 placebo-controlled trial
0.