In [None]:
# Imported in this cell so it runs on a fresh kernel: get_sentences() and
# extract_vocabulary() below both use Path.
from pathlib import Path

# Directory the dataset files are read from and vocab.words.txt is written to.
DATADIR = "data/nerhtml"

def get_sentences(filename):
    """Read a blank-line-delimited token file from DATADIR.

    The file is expected to hold one token per line, with the token's fields
    separated by whitespace, and sentences separated by blank lines.

    Args:
        filename: Name of the file, relative to the module-level DATADIR.

    Returns:
        A list of sentences. Each sentence is a list of tokens, and each
        token is the list of whitespace-separated fields found on its line.
        Blank or whitespace-only lines never produce a token, and runs of
        several blank lines never produce an empty sentence.
    """
    sentences = []
    current = []
    with Path(DATADIR, filename).open('r', encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if fields:
                current.append(fields)
            elif current:
                # A blank line closes the sentence collected so far; extra
                # blank lines in a row are ignored.
                sentences.append(current)
                current = []
    # The file may not end with a blank line: flush the last sentence.
    if current:
        sentences.append(current)
    return sentences

def extract_vocabulary(filenames):
    """Write the set of distinct words found in the given files to disk.

    The first field of every token returned by get_sentences() is taken as
    the word. The distinct words are written, one per line and in sorted
    order, to 'vocab.words.txt' inside DATADIR. Sorting keeps the line
    numbers stable between runs, which plain set iteration does not.

    Args:
        filenames: A single file name, or a list/tuple of file names, each
            relative to DATADIR.
    """
    # Accept a single name as well as a list or tuple of names.
    if not isinstance(filenames, (list, tuple)):
        filenames = [filenames]

    vocab = set()
    for name in filenames:
        vocab.update(
            token[0]
            for sentence in get_sentences(name)
            for token in sentence
            if token  # ignore tokens with no fields
        )

    with Path(DATADIR, 'vocab.words.txt').open('w', encoding='utf-8') as f:
        f.writelines(word + '\n' for word in sorted(vocab))
               
# Collect the words of the 'train', 'valid' and 'test' files under DATADIR
# and write them to DATADIR/vocab.words.txt.
extract_vocabulary(['train', 'valid', 'test'])

In [3]:
from pathlib import Path

import numpy as np

if __name__ == '__main__':
  # Builds an embedding matrix aligned with vocab.words.txt: row i holds the
  # GloVe vector of the word on line i of the vocab file, or zeros when the
  # word has no usable GloVe entry. The matrix is saved as DATADIR/glove.npz.
  #
  # NOTE: this rebinds the notebook-wide DATADIR, which get_sentences() and
  # extract_vocabulary() also read.
  DATADIR = "data/conll2003"
  EMBEDDING_DIM = 300  # values expected per row of glove.840B.300d.txt

  # Load vocab. Decode it as UTF-8 explicitly rather than with the platform
  # default, matching how this notebook writes the vocab file and reads GloVe.
  with Path(DATADIR, 'vocab.words.txt').open(encoding='utf-8') as f:
    word_to_idx = {line.strip(): idx for idx, line in enumerate(f)}
  size_vocab = len(word_to_idx)

  # Array of zeros; rows of words missing from GloVe stay all-zero.
  embeddings = np.zeros((size_vocab, EMBEDDING_DIM))

  # Get relevant glove vectors
  found = 0
  print('Reading GloVe file (may take a while)')
  with open('data/glove.840B.300d.txt', encoding="utf-8") as f:
    for line_idx, line in enumerate(f):
      if line_idx % 100000 == 0:
        print('- At line {}'.format(line_idx))
      fields = line.strip().split()
      # Skip rows that do not split into exactly one word plus EMBEDDING_DIM
      # values (this happens e.g. when the token itself contains whitespace).
      if len(fields) != EMBEDDING_DIM + 1:
        continue
      word_idx = word_to_idx.get(fields[0])
      if word_idx is not None:  # index 0 is a valid row, so test against None
        found += 1
        # Convert the textual components to numbers explicitly.
        embeddings[word_idx] = np.asarray(fields[1:], dtype=embeddings.dtype)
  print('- done. Found {} vectors for {} words'.format(found, size_vocab))

  # Save np.array to file
  np.savez_compressed(DATADIR + '/glove.npz', embeddings=embeddings)

Reading GloVe file (may take a while)
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
- done. Found 26890 vectors for 30290 words
