# Create Abstracts Embeddings with `word2vec`

- Pull out `word2vec` vectors for every word in the corpus and make an embedding matrix
- Transform each abstract into a list of indexes into the embedding matrix
- Save it!

In [1]:
%store -r abstracts_targets_collapsed

dataset = abstracts_targets_collapsed

dataset = dataset

### Extract Abstracts

In [2]:
abstracts = dataset.abstract
abstracts = abstracts.map(lambda abstract: abstract.decode('utf-8')) # decode character encoding

### Put All Unique Words in Abstracts into a Set

In [3]:
from nltk import sent_tokenize, word_tokenize

words = set()
for abstract in abstracts:
    for word in word_tokenize(abstract):
        words.add(word)

### Load Pre-Trained Pubmed `word2vec` Vectors

In [4]:
import gensim

model = gensim.models.Word2Vec.load_word2vec_format('/home/ebanner/Research/data/word2vec/PubMed-w2v.bin', binary=True)  # C binary format

### Generate word $\rightarrow$ index Mapping for Mini-Embedding Matrix

In [5]:
word2idx = {word: idx for idx, word in enumerate(words)} # ignore unknown words for now

word2idx = {word: idx+1 for word, idx in word2idx.items()} # bump every word up to make room for the mask
word2idx['<MASK>'] = 0

### Generate Mini-Embedding Matrix

In [6]:
import operator

def w2v_generator(word2idx):
    """Generate word2vec vectors for each word
    
    It's assumed that every word in word2idx contains in word2vec
    
    """
    for word, _ in sorted(word2idx.items(), key=operator.itemgetter(1)):
        if word not in model:
            yield np.zeros(model.vector_size) # just yield all zeros for OOV words (including the mask)
        else:
            yield model[word]
        
W = np.array(list(w2v_generator(word2idx)))

### Convert Each Abstract to a List of Indices

In [7]:
def abstracts2idxs_generator(abstracts):
    for i, abstract in enumerate(abstracts):
        yield list(abstract2idxs(abstract))
    
def abstract2idxs(abstract):
    for word in word_tokenize(abstract):
        yield word2idx[word]
        
abstracts_idxed = list(abstracts2idxs_generator(abstracts))

### Pad the Abstracts to a Fixed Length

Figure Out Which Length we Should Cap At. This length should be long enough to not cut off most abstracts, but not be too big as to overweight long outliers.

In [8]:
from nltk import word_tokenize

def length_generator():
    for abstract in abstracts:
        words = word_tokenize(abstract)
        yield len(words)
    
lengths = pd.Series(list(length_generator()))

def best_maxlen(ratio=.8):
    """Return the smallest maxlen such that ratio < num_lengths/num_total"""

    num_total = len(lengths)

    for length in range(min(lengths), max(lengths)):
        num_lengths = len(lengths[lengths <= length])

        if num_lengths / float(num_total) >= ratio:
            return length

def best_maxlens():
    for ratio in (.2, .4, .6, .8, .95, 1.):
        yield best_maxlen(ratio)
        
list(best_maxlens())

[252, 292, 320, 364, 464, None]

In [9]:
from keras.preprocessing import sequence

maxlen = best_maxlen(ratio=.95)

abstracts_padded = sequence.pad_sequences(abstracts_idxed, maxlen=maxlen)

### Check Yourself

In [10]:
idx2word = {idx: word for word, idx in word2idx.items()}

for abstract_padded in abstracts_padded[:10]:
    print ' '.join([idx2word[idx] for idx in abstract_padded])
    print

<MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK> <MASK>

### Save the Following:

- Abstracts represented as indices into embedding matrix
- Embedding matrix

In [11]:
labels = dataset.gender.unique()
gender_map = {gender: i for i, gender in enumerate(labels)}
ys = dataset.gender.map(gender_map)

embeddings_info = {
            'abstracts': abstracts,
            'abstracts_padded': abstracts_padded,
            'embeddings': W,
            'word_dim': model.vector_size,
            'maxlen': maxlen,
            'vocab_size': len(W),
            'word2idx': word2idx,
            'idx2word': idx2word,
}

%store embeddings_info

import pickle
pickle.dump(embeddings_info, open('embeddings_info.p', 'wb'))

Stored 'embeddings_info' (dict)
