In [1]:
import torch 
import numpy as np 
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token 
from allennlp.data.vocabulary import Vocabulary

In [2]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import TextFieldEmbedder 
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper

In [3]:
from allennlp.training.trainer import Trainer 

In [4]:
from pathlib import Path
path = Path('Lesson11/data/translate')
en_path = path/'europarlv7-en-fr.en'
fr_path = path/'europarlv7-fr-en.fr'

In [5]:
import re 
re_eq = re.compile('^(Wh[^?.!]+\?)')
re_fq = re.compile('^([^?.!]+\?)')

lines = ((re_eq.search(eq), re_fq.search(fq)) 
         for eq, fq in zip(open(en_path, encoding='utf-8'), open(fr_path, encoding='utf-8')))

qs = [(e.group(), f.group()) for e,f in lines if e and f]

In [6]:
qs[:5] # en-fr questions as pairs in a tuple

[('Why has no air quality test been done on this particular building since we were elected?',
  "Comment se fait-il qu'aucun test de qualité de l'air n'ait été réalisé dans ce bâtiment depuis notre élection ?"),
 ('Why has there been no Health and Safety Committee meeting since 1998?',
  "Comment se fait-il que le comité de santé et d'hygiène ne se soit plus réuni depuis 1998 ?"),
 ('Why has there been no fire drill, either in the Brussels Parliament buildings or the Strasbourg Parliament buildings?',
  "Comment se fait-il que nous n'ayons jamais fait d'exercice d'évacuation dans les bâtiments du Parlement de Bruxelles et de Strasbourg ?"),
 ('Why are there no fire instructions?',
  "Comment se fait-il qu'il n'y ait pas de consignes en cas d'incendie ?"),
 ('Why have the staircases not been improved since my accident?',
  "Comment se fait-il que les escaliers n'aient pas été améliorés depuis mon accident ?")]

In [7]:
import pickle
pickle.dump(qs, (path/'fr-en-qs.pkl').open('wb'))
qs = pickle.load((path/'fr-en-qs.pkl').open('rb'))

In [8]:
en_qs, fr_qs = zip(*qs)

In [9]:
en_tok = WordTokenizer().batch_tokenize(list(en_qs))  # return each sentence as a list of tokens

In [10]:
fr_tok = WordTokenizer().batch_tokenize(list(fr_qs))  # return each sentence as a list of tokens

In [11]:
print(len(en_tok), len(fr_tok))

14803 14803


In [12]:
print(np.percentile([len(o) for o in en_tok], 90),
np.percentile([len(o) for o in fr_tok], 90))
keep = np.array([len(o)<35 for o in en_tok])
en_tok = np.array(en_tok)[keep]
fr_tok = np.array(fr_tok)[keep]

31.0 34.0


In [13]:
print(len(en_tok), len(fr_tok))  # total number of question pairs 

13683 13683


In [18]:
# create vocabulary by mapping strings to integers 
# include only frequently occuring words in vocabulary 
import collections
def toks2ids(tok,pre):
    freq = collections.Counter(p for o in tok for p in o)
    itos = [o for o,c in freq.most_common(40000)]
    itos.insert(0, '_bos_')
    itos.insert(1, '_pad_')
    itos.insert(2, '_eos_')
    itos.insert(3, '_unk')
    stoi = collections.defaultdict(lambda: 3, 
                                   {v:k for k,v in enumerate(itos)})
    ids = np.array([([stoi[o] for o in p] + [2]) for p in tok])
    np.save(path/f'{pre}_ids.npy', ids)
    # pickle.dump(itos, open(path/f'{pre}_itos.pkl', 'wb'))
    return ids,itos,stoi


In [19]:
en_ids,en_itos,en_stoi = toks2ids(en_tok,'en')
fr_ids,fr_itos,fr_stoi = toks2ids(fr_tok,'fr')

In [20]:
[fr_itos[o] for o in fr_ids[0]], len(fr_itos), len(en_itos)

([Comment,
  se,
  fait,
  -,
  il,
  qu'aucun,
  test,
  de,
  qualité,
  de,
  l'air,
  n'ait,
  été,
  réalisé,
  dans,
  ce,
  bâtiment,
  depuis,
  notre,
  élection,
  ?,
  '_eos_'],
 40004,
 40004)

In [32]:
# ! pip install git+https://github.com/facebookresearch/fastText.git
import fastText as ft

In [33]:
def sq2sqDataset(x, y):
    def __init__(self, x, y): 
        self.x = x
        self.y = y
    def __getitem__(self, idx):
        return A(self.x[idx], self.y[idx])
    def __len__(self):
        return len(self.x)

In [None]:
class Sq2sq(nn.Module):
    super.__init__():
        def
    