In [4]:
from pathlib import Path

DATADIR = "data/nerhtml"

def get_sentences(filename):    
    with Path(DATADIR, filename).open('r', encoding="utf-8") as f:
        sentences = f.read().strip().split('\n\n')
        return [[t.split() for t in s.split('\n')] for s in sentences if len(s) > 0] 

def extract_vocabulary(filenames):
    if not isinstance(filenames, list):
        filenames = [filenames]
        
    words = []
    for f in filenames:
        words = words + [w[0] for s in get_sentences(f) for w in s]
    
    words = list(set(words))
    
    with Path(DATADIR, 'vocab.words.txt').open('w', encoding='utf8') as f:
        for w in words:
            f.write(w + '\n')

    print('Found %d words.' % (len(words)))            
extract_vocabulary(['train', 'valid', 'test'])

Found 35214 words.


In [None]:
# -*- coding: utf-8 -*-

LOWERCASE = False

from pathlib import Path

import numpy as np

if __name__ == '__main__':
  DATADIR = "data"
    
  # Load vocab.
  with Path(DATADIR, 'conll2003/vocab.words.txt').open(encoding='utf-8') as f:
    word_to_idx = {line.strip(): idx for idx, line in enumerate(f)}
  size_vocab = len(word_to_idx)

  word_to_lowercase = {}
  for key in word_to_idx:
    lkey = key.lower()
    if not lkey in word_to_lowercase:
      word_to_lowercase[lkey] = []
    word_to_lowercase[lkey].append(word_to_idx[key])
    
  # Array of zeros
  embeddings = np.zeros((size_vocab, 300))

  # Get relevant glove vectors
  found = 0
  print('Reading GloVe file (may take a while)')
  with open('data/glove.840B.300d.txt', encoding="utf-8") as f:
    for line_idx, line in enumerate(f):
      if line_idx % 100000 == 0:
        print('- At line {}'.format(line_idx))
      line = line.strip().split()
      if len(line) != 300 + 1:
        continue
      word = line[0]
      embedding = line[1:]
        
      if LOWERCASE:
        word = word.lower()
        if word in word_to_lowercase:
          for word_idx in word_to_lowercase[word]:
            found += 1
            embeddings[word_idx] = embedding
      else:  
        if word in word_to_idx:
          found += 1
          word_idx = word_to_idx[word]
          embeddings[word_idx] = embedding
  print('- done. Found {} vectors for {} words'.format(found, size_vocab))

  ASHES = embeddings[22491]
  ashes = embeddings[2500]
  print(ASHES)
  print(ashes)  

  # Save np.array to file
  np.savez_compressed(DATADIR + '/glove2.npz', embeddings=embeddings)

Reading GloVe file (may take a while)
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000


In [38]:
# -*- coding: utf-8 -*-

LOWERCASE = False

from pathlib import Path
import re
import numpy as np

if __name__ == '__main__':
  DATADIR = "data"
    
  # Load vocab.
  with Path(DATADIR, 'conll2003/vocab.chars.txt').open(encoding='utf-8') as f:
    char_to_idx = {line.strip(): idx for idx, line in enumerate(f)}
  size_vocab = len(char_to_idx)
  print(size_vocab)
    
  # Array of zeros
  embeddings = np.zeros((size_vocab, 300))
  # Get relevant glove vectors
  found = 0
  print('Reading GloVe file (may take a while)')
  with open('data/char_embeddings.txt', encoding="utf-8") as f:
    for line_idx, line in enumerate(f):
      line = line.strip().split()
      if len(line) != 300 + 1:
        continue  
    
      char = line[0]
      embedding = line[1:]

      if char in char_to_idx:
        found += 1
        char_idx = char_to_idx[char]
        embeddings[char_idx] = embedding
  print('- done. Found {} vectors for {} words'.format(found, size_vocab))

  # Save np.array to file
  np.savez_compressed(DATADIR + '/char_embeddings.npz', embeddings=embeddings)

89
Reading GloVe file (may take a while)
- done. Found 89 vectors for 89 words


In [75]:
import tensorflow as tf

def create_position_embeddings(self, max_length, emb_dim):
    
max_length=1600
emb_dim=    
position_emb = np.array([
    [(pos+1) / np.power(10000, 2 * (j // 2) / emb_dim) for j in range(emb_dim)]
    for pos in range(max_length)
])

position_emb[:,0::2] = np.sin(position_emb[:,0::2]) # dim 2i
position_emb[:,1::2] = np.cos(position_emb[:,1::2]) # dim 2i+1
position_emb = np.vstack([position_emb, [[0.] * emb_dim]])

print(position_emb)
 
    # variable = np.vstack([position_emb, [[0.] * emb_dim]])
    # variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
 
    # seq = tf.constant(np.arange(1600), dtype=tf.int32)
    # seq = tf.nn.embedding_lookup(variable, seq)
    # pos_embeddings = tf.slice(seq, [0], [tf.shape(inputs)[1]])
    # return inputs + pos_embeddings

[[  8.41470985e-01   5.40302306e-01   5.15138753e-01 ...,   9.99999942e-01
    1.84784979e-04   9.99999983e-01]
 [  9.09297427e-01  -4.16146837e-01   8.83057855e-01 ...,   9.99999767e-01
    3.69569951e-04   9.99999932e-01]
 [  1.41120008e-01  -9.89992497e-01   9.98611031e-01 ...,   9.99999475e-01
    5.54354911e-04   9.99999846e-01]
 ..., 
 [ -7.68254661e-01  -6.40144339e-01   7.46905556e-01 ...,   9.99865690e-01
    8.86956273e-03   9.99960665e-01]
 [ -9.53752653e-01   3.00592544e-01   9.82709113e-01 ...,   9.99860035e-01
    9.05434029e-03   9.99959009e-01]
 [ -2.62374854e-01   9.64966028e-01   9.37667769e-01 ...,   9.99854264e-01
    9.23911754e-03   9.99957318e-01]]


array([[  8.41470985e-01,   5.40302306e-01,   5.15138753e-01, ...,
          9.99999942e-01,   1.84784979e-04,   9.99999983e-01],
       [  9.09297427e-01,  -4.16146837e-01,   8.83057855e-01, ...,
          9.99999767e-01,   3.69569951e-04,   9.99999932e-01],
       [  1.41120008e-01,  -9.89992497e-01,   9.98611031e-01, ...,
          9.99999475e-01,   5.54354911e-04,   9.99999846e-01],
       ..., 
       [ -7.68254661e-01,  -6.40144339e-01,   7.46905556e-01, ...,
          9.99865690e-01,   8.86956273e-03,   9.99960665e-01],
       [ -9.53752653e-01,   3.00592544e-01,   9.82709113e-01, ...,
          9.99860035e-01,   9.05434029e-03,   9.99959009e-01],
       [ -2.62374854e-01,   9.64966028e-01,   9.37667769e-01, ...,
          9.99854264e-01,   9.23911754e-03,   9.99957318e-01]])