# `word2vec`

- Pull out `word2vec` vectors for every word in the corpus and make an embedding matrix
- Transform each abstract into a list of indexes into the embedding matrix
- Save it!

In [1]:
# Restore the variable previously persisted with `%store` (IPython storemagic).
%store -r abstracts_targets_collapsed

# Shorter working alias for the restored object; used as a DataFrame below.
dataset = abstracts_targets_collapsed

### Take Only a Minimal Subset of the Dataset

In [2]:
def examples_generator(dataset, target='gender'):
    """Yield the index label of the first row for each distinct `target` value.

    Distinct values are visited in the order `Series.unique()` reports them.
    What is yielded is the row's index label (the `.name` of the row), not
    its integer position.
    """
    target_column = dataset[target]

    for class_value in target_column.unique():
        rows_in_class = dataset[target_column == class_value]
        first_row = rows_in_class.iloc[0]
        yield first_row.name

# `examples_generator` yields index *labels*, so rows must be selected with the
# label-based `.loc`. The positional `.iloc` only agrees with it while the index
# happens to be 0..n-1, and it breaks when this cell is re-run on the reduced frame.
# NOTE(review): assumes index labels are unique -- confirm for the stored dataset.
dataset = dataset.loc[list(examples_generator(dataset))]

### Extract Abstracts

In [3]:
# Decode every abstract from UTF-8 so later cells work with text, not raw bytes.
abstracts = dataset['abstract'].map(lambda raw_abstract: raw_abstract.decode('utf-8'))

### Put All Unique Words in Abstracts into a Set

In [4]:
from nltk import sent_tokenize, word_tokenize

# Vocabulary: every distinct token across all abstracts.
# The abstracts were already decoded from UTF-8 when they were extracted from the
# dataset, so they are tokenized as-is. Calling `.decode` a second time on decoded
# text raises for non-ASCII abstracts under Python 2, and would also tokenize
# differently from `abstract2idxs`, which looks these same tokens up later.
words = {word for abstract in abstracts for word in word_tokenize(abstract)}

### Load Pre-Trained Pubmed `word2vec` Vectors

In [5]:
import gensim

# Location of the pre-trained PubMed vectors (word2vec C binary format).
PUBMED_W2V_PATH = '/home/ebanner/Research/data/word2vec/PubMed-w2v.bin'

model = gensim.models.Word2Vec.load_word2vec_format(PUBMED_W2V_PATH, binary=True)

### Generate word $\rightarrow$ index Mapping for Mini-Embedding Matrix

Additionally, add the `<<<MASK>>>` token as the zeroth entry of the embedding matrix, so that index 0 is reserved for the mask and every real word gets an index of 1 or greater.

In [6]:
# Index 0 is reserved for the mask token, so real words are numbered from 1.
# Words unknown to word2vec are not filtered out at this stage.
word2idx = {word: idx for idx, word in enumerate(words, start=1)}
word2idx['<<<MASK>>>'] = 0

### Generate Mini-Embedding Matrix

In [7]:
import operator

import numpy as np  # used below; it was never imported, so a fresh kernel raised NameError

def w2v_generator(word2idx):
    """Yield one embedding vector per vocabulary entry, in index order.

    Vectors are read from the module-level `model`.

    Parameters
    ----------
    word2idx : dict
        Maps each word to its row index in the embedding matrix.

    Yields
    ------
    vector
        `model[word]` when the word is present in `model`; otherwise an
        all-zeros vector of length `model.vector_size`. The zero vector covers
        out-of-vocabulary words, including the mask token.
    """
    # Sort by index so the i-th yielded vector is row i of the matrix.
    for word, _ in sorted(word2idx.items(), key=operator.itemgetter(1)):
        if word in model:
            yield model[word]
        else:
            yield np.zeros(model.vector_size)

# Stack the vectors into the (vocabulary size x vector size) embedding matrix.
W = np.array(list(w2v_generator(word2idx)))

### Convert Each Abstract to a List of Indices

In [8]:
def abstracts2idxs_generator(abstracts):
    """Yield, for each abstract in turn, the list of its tokens' vocabulary indices."""
    for abstract in abstracts:
        yield list(abstract2idxs(abstract))
    
def abstract2idxs(abstract):
    """Yield the `word2idx` index of every token in `abstract`, in order.

    Tokens come from `word_tokenize`; a token missing from the module-level
    `word2idx` raises KeyError.
    """
    tokens = word_tokenize(abstract)
    for token in tokens:
        yield word2idx[token]
        
# Materialize the generator: one list of vocabulary indices per abstract.
idx_lists = abstracts2idxs_generator(abstracts)
abstracts_idxed = list(idx_lists)

### Pad the Abstracts to a Fixed Length

Every abstract is padded to the same length: the length (in tokens) of the longest abstract.

In [9]:
from keras.preprocessing import sequence

# Target length for padding: the token count of the longest abstract.
maxlen = max(map(len, abstracts_idxed))

abstracts_padded = sequence.pad_sequences(abstracts_idxed, maxlen=maxlen)

### Check Yourself

In [10]:
# Invert the vocabulary so indices can be mapped back to words.
idx2word = dict((idx, word) for word, idx in word2idx.items())

# Print each padded abstract as text, followed by a blank line.
for abstract_padded in abstracts_padded:
    decoded_words = [idx2word[idx] for idx in abstract_padded]
    print(' '.join(decoded_words))
    print('')

<<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>> <<<MASK>>>

### Save the Following:

- Abstracts represented as indices into embedding matrix
- Embedding matrix

In [15]:
genders = dataset.gender
ys = genders.map({gender: i for i, gender in enumerate(genders)})

cnn_model = {
            'dataset': dataset,
            'abstracts_padded': abstracts_padded,
            'ys': ys,
            'embeddings': W,
            'word_dim': model.vector_size,
            'maxlen': maxlen,
            'vocab_size': len(W),
            'num_classes': len(genders),
            'num_train': len(dataset),
            'word2idx': word2idx,
            'idx2word': idx2word,
}

%store cnn_model

Stored 'cnn_model' (dict)
