# Preprocess Abstracts

Create the embedding matrix and the index-encoded, padded abstracts that are fed into a Keras character-level language model.

In [49]:
# Restore `abstracts_targets_collapsed` from IPython's %store persistence
# (it must have been saved with `%store` in an earlier session/notebook).
%store -r abstracts_targets_collapsed

# Only the `abstract` attribute is used below.
# NOTE(review): presumably a pandas column of abstract strings -- confirm.
abstracts = abstracts_targets_collapsed.abstract

### Extract Characters

In [50]:
# Collect every distinct character that occurs in the corpus.  The abstracts
# are joined with a space, so the space character is part of the vocabulary
# whenever there are at least two abstracts.
text = ' '.join(abstracts)
chars = set(text)

# Indices 0-2 are reserved for the special tokens, so real characters start
# at 3.  Enumerate in sorted order: iteration order of a set of strings can
# differ between interpreter runs (string hash randomisation), which would
# make the index assignment -- and the embeddings / padded data derived from
# it -- irreproducible from one run to the next.
char2idx = {c: i for i, c in enumerate(sorted(chars), start=3)}

# Add in special tokens.  Their keys are multi-character strings, so they
# cannot collide with the single-character keys above.
char2idx.update({'MASK': 0, 'START': 1, 'END': 2})

# The reverse lookup is the exact inverse of char2idx, so the two mappings
# cannot drift apart.
idx2char = {i: c for c, i in char2idx.items()}

# `chars` also counts the special tokens so that len(chars) is the full
# vocabulary size.
chars.update({'START', 'END', 'MASK'})

### Create Embedding Matrix

In [45]:
# Corpus-wide constants.
MASK_VALUE = 0
# NOTE(review): presumably 1000 characters plus one extra position -- confirm
# what the "+1" is meant to account for.
MAX_LENGTH = 1000 + 1
NUM_ABSTRACTS = len(abstracts)
num_chars = len(chars)

# One-hot embedding: row i is the unit vector for vocabulary index i ...
embeddings = np.identity(num_chars)
# ... except the mask row, which is all zeros because the mask is not a real
# character.
embeddings[MASK_VALUE, :] = 0

### Indexify and Pad Abstracts

In [46]:
def abstracts_idxified_generator(abstracts, max_length, vocab=None):
    """Yield each abstract as a list of character indices, cut to `max_length`.

    Every abstract is encoded as ``START + characters + END`` and the result
    is then sliced to its first `max_length` entries.  An abstract that is too
    long therefore loses its tail, *including* the END index.

    Args:
        abstracts: Iterable of strings; consumed lazily, one item per yield.
        max_length: Slice bound applied to each encoded sequence.
        vocab: Mapping from each character, plus the special keys ``'START'``
            and ``'END'``, to an integer index.  Defaults to the
            notebook-global ``char2idx`` so that existing two-argument calls
            behave as before.

    Yields:
        list: the index sequence for one abstract.

    Raises:
        KeyError: if an abstract contains a character missing from `vocab`.
    """
    if vocab is None:
        vocab = char2idx  # fall back to the notebook-global mapping

    # The special-token indices do not change per abstract; look them up once.
    start_idx, end_idx = vocab['START'], vocab['END']

    for abstract in abstracts:
        indexed = [start_idx]
        indexed.extend(vocab[char] for char in abstract)
        indexed.append(end_idx)

        yield indexed[:max_length]

# Materialise the generator: one index list (at most MAX_LENGTH long) per abstract.
abstracts_idxed = list(abstracts_idxified_generator(abstracts, MAX_LENGTH))

from keras.preprocessing.sequence import pad_sequences

# Bring every sequence to exactly MAX_LENGTH entries.
# NOTE(review): this relies on pad_sequences' defaults; per the Keras docs
# these pad at the front with 0, which is the MASK index -- confirm against
# the installed Keras version.
abstracts_padded = pad_sequences(abstracts_idxed, maxlen=MAX_LENGTH)

### Store Abstracts to Disk

In [47]:
abstracts_char_lm = {'abstracts_padded': abstracts_padded, 'embeddings': embeddings,
                     'char2idx': char2idx, 'idx2char': idx2char}

%store abstracts_char_lm

import pickle

pickle.dump(abstracts_char_lm, open('data.p', 'wb'))

Stored 'abstracts_char_lm' (dict)
