In [33]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab, build_vocab_from_iterator
from torchtext.utils import download_from_url, extract_archive
import io
import time

# Location of the raw Multi30k task-1 archives and the (German, English)
# archive names for each split.
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')


def _fetch_and_extract(archive_names):
    """Download every archive under ``url_base`` and return the first extracted path of each."""
    extracted_paths = []
    for archive_name in archive_names:
        archive_path = download_from_url(url_base + archive_name)
        extracted_paths.append(extract_archive(archive_path)[0])
    return extracted_paths


# Each list holds two paths, in the same (de, en) order as the URL tuples.
train_filepaths = _fetch_and_extract(train_urls)
val_filepaths = _fetch_and_extract(val_urls)
test_filepaths = _fetch_and_extract(test_urls)

# spaCy-backed tokenizers, one per language.
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')


In [41]:

def build_vocab(filepath, tokenizer):
    """Build a torchtext vocabulary from a UTF-8 text file with one sentence per line.

    Args:
        filepath: Path of the text file to read.
        tokenizer: Callable that turns one line (``str``) into a list of tokens.

    Returns:
        The vocabulary created by ``build_vocab_from_iterator`` with the
        specials ``<unk>``, ``<blank>``, ``<bos>`` and ``<eos>``. ``<unk>`` is
        registered as the default index, so looking up a token that never
        occurred in ``filepath`` yields the ``<unk>`` id instead of raising.
    """
    with io.open(filepath, encoding="utf-8") as f:
        # NOTE(review): each line reaches the tokenizer with its trailing
        # newline; the spaCy tokenizers used in this notebook emit it as a
        # separate "\n" token, so it becomes a vocabulary entry. Left
        # unchanged here because data_process tokenizes lines the same way.
        tokens = [tokenizer(string_) for string_ in f]
    print("len of lines: ", len(tokens))
    vocab = build_vocab_from_iterator(tokens, specials=['<unk>', '<blank>', '<bos>', '<eos>'])
    # Without a default index, vocab[token] raises RuntimeError for tokens not
    # seen in this file (e.g. words that only occur in validation/test data).
    vocab.set_default_index(vocab['<unk>'])
    print(vocab)
    return vocab

# train_filepaths holds exactly two paths: German first, English second.
de_train_path, en_train_path = train_filepaths
de_vocab = build_vocab(de_train_path, de_tokenizer)
en_vocab = build_vocab(en_train_path, en_tokenizer)

len of lines:  29000
Vocab()
len of lines:  29000
Vocab()


In [38]:
# Generate a vocabulary from a text file
import io
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(file_path, tokenizer):
    """Lazily yield ``tokenizer(line)`` for every line of a UTF-8 text file.

    Lines are handed to ``tokenizer`` exactly as read, i.e. including their
    trailing newline. The file stays open only while the generator is being
    consumed.
    """
    with io.open(file_path, encoding='utf-8') as handle:
        yield from map(tokenizer, handle)
# Stream tokenized English training sentences straight into the vocabulary builder.
en_token_stream = yield_tokens(train_filepaths[1], en_tokenizer)
vocab = build_vocab_from_iterator(en_token_stream, specials=["<unk>"])
print(vocab)

Vocab()


In [40]:
# Vocabulary sizes as a (German, English) tuple.
tuple(map(len, (de_vocab, en_vocab)))

(19215, 10838)

In [None]:

def data_process(filepaths):
  """Convert a parallel German/English corpus into pairs of token-id tensors.

  Args:
    filepaths: Sequence whose element 0 is the German text file and whose
      element 1 is the English text file; the files are read line by line
      in parallel.

  Returns:
    A list with one ``(de_tensor, en_tensor)`` tuple per line pair. Each
    tensor is a 1-D ``torch.long`` tensor of the ids that ``de_vocab`` /
    ``en_vocab`` assign to the tokens produced by ``de_tokenizer`` /
    ``en_tokenizer``.
  """
  data = []
  # Open both files in a single ``with`` so the handles are closed even if
  # tokenization or a vocabulary lookup raises part-way through.
  with io.open(filepaths[0], encoding="utf8") as raw_de_iter, \
       io.open(filepaths[1], encoding="utf8") as raw_en_iter:
    # zip stops at the shorter file, so trailing unpaired lines are ignored.
    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
      de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                              dtype=torch.long)
      en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                              dtype=torch.long)
      data.append((de_tensor_, en_tensor_))
  return data

# Numericalize every split with the vocabularies built from the training data.
# Validation/test sentences may contain tokens the training vocabularies never
# saw, so these calls depend on how de_vocab / en_vocab handle unknown tokens.
train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

In [10]:
# Print each English test sentence next to its tokenization.
with open(test_filepaths[1], encoding="utf-8") as en_file:
    for line_no, raw_line in enumerate(en_file, start=1):
        # Pause briefly after every 10000th line before printing it.
        if line_no % 10000 == 0:
            time.sleep(3)
        print(raw_line, en_tokenizer(raw_line))

A man in an orange hat starring at something.
 ['A', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.', '\n']
A Boston Terrier is running on lush green grass in front of a white fence.
 ['A', 'Boston', 'Terrier', 'is', 'running', 'on', 'lush', 'green', 'grass', 'in', 'front', 'of', 'a', 'white', 'fence', '.', '\n']
A girl in karate uniform breaking a stick with a front kick.
 ['A', 'girl', 'in', 'karate', 'uniform', 'breaking', 'a', 'stick', 'with', 'a', 'front', 'kick', '.', '\n']
Five people wearing winter jackets and helmets stand in the snow, with snowmobiles in the background.
 ['Five', 'people', 'wearing', 'winter', 'jackets', 'and', 'helmets', 'stand', 'in', 'the', 'snow', ',', 'with', 'snowmobiles', 'in', 'the', 'background', '.', '\n']
People are fixing the roof of a house.
 ['People', 'are', 'fixing', 'the', 'roof', 'of', 'a', 'house', '.', '\n']
A man in light colored clothing photographs a group of men wearing dark suits and hats standing around a woman

In [12]:
# Print each German test sentence next to its tokenization.
with open(test_filepaths[0], encoding="utf-8") as de_file:
    for line_no, raw_line in enumerate(de_file, start=1):
        # Pause briefly after every 10000th line before printing it.
        if line_no % 10000 == 0:
            time.sleep(3)
        print(raw_line, de_tokenizer(raw_line))

Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
 ['Ein', 'Mann', 'mit', 'einem', 'orangefarbenen', 'Hut', ',', 'der', 'etwas', 'anstarrt', '.', '\n']
Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun.
 ['Ein', 'Boston', 'Terrier', 'läuft', 'über', 'saftig-grünes', 'Gras', 'vor', 'einem', 'weißen', 'Zaun', '.', '\n']
Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt.
 ['Ein', 'Mädchen', 'in', 'einem', 'Karateanzug', 'bricht', 'ein', 'Brett', 'mit', 'einem', 'Tritt', '.', '\n']
Fünf Leute in Winterjacken und mit Helmen stehen im Schnee mit Schneemobilen im Hintergrund.
 ['Fünf', 'Leute', 'in', 'Winterjacken', 'und', 'mit', 'Helmen', 'stehen', 'im', 'Schnee', 'mit', 'Schneemobilen', 'im', 'Hintergrund', '.', '\n']
Leute Reparieren das Dach eines Hauses.
 ['Leute', 'Reparieren', 'das', 'Dach', 'eines', 'Hauses', '.', '\n']
Ein hell gekleideter Mann fotografiert eine Gruppe von Männern in dunklen Anzügen und mit Hüten, die um eine Frau in eine

In [24]:
# List every English vocabulary entry as "<id> <token>", in id order.
vocab_size = len(en_vocab)
for token_id in range(vocab_size):
    token = en_vocab.lookup_token(token_id)
    print(token_id, token)

0 <unk>
1 <pad>
2 <bos>
3 <eos>
4 a
5 

6 .
7 A
8 in
9 the
10 on
11 is
12 and
13 man
14 of
15 with
16 ,
17 woman
18 are
19 to
20 Two
21 at
22 wearing
23 people
24 shirt
25 white
26 young
27 black
28 his
29 an
30 while
31 blue
32 red
33 sitting
34 girl
35 dog
36 boy
37 men
38 standing
39 playing
40 group
41 street
42 down
43 walking
44 -
45 front
46 her
47 holding
48 water
49 by
50 The
51 up
52 green
53 women
54 An
55 one
56 for
57 looking
58 outside
59 child
60 Three
61 as
62 little
63 large
64 through
65 yellow
66 brown
67 two
68 from
69 hat
70 ball
71 their
72 into
73 person
74 children
75 next
76 other
77 dressed
78 small
79 out
80 over
81 building
82 riding
83 running
84 People
85 near
86 jacket
87 another
88 around
89 some
90 sidewalk
91 field
92 orange
93 beach
94 crowd
95 stands
96 pink
97 sits
98 jumping
99 behind
100 table
101 snow
102 grass
103 hair
104 background
105 stand
106 bike
107 's
108 air
109 city
110 player
111 girls
112 Man
113 looks
114 top
115 wall
116 off
117 th

In [27]:
import pickle
# Load a reference vocabulary from disk for comparison with en_vocab
# (presumably a set of token strings, since it is later set-subtracted - confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# created yourself. The hard-coded /tmp path is produced outside this notebook.
with open("/tmp/vocab_src_a.pkl", "rb") as pickled_vocab:
    vocab_src_a = pickle.load(pickled_vocab)

In [30]:
# Every token string known to en_vocab, collected into a set.
vocab_src_b = {en_vocab.lookup_token(token_id) for token_id in range(len(en_vocab))}

In [42]:
# Tokens present in the reference vocabulary but absent from en_vocab.
vocab_src_a.difference(vocab_src_b)

{"another's",
 "Levi's",
 '281.',
 'knee-length',
 'protests',
 "80's",
 'hand-truck',
 'red-brick',
 'yo-yo',
 "there's",
 'P-shaped',
 'Skynyrd',
 'gangplank',
 'Dali',
 'impresses',
 'wooden-paneled',
 "river's",
 'sun-bathing',
 'mini-gold',
 'avoids',
 "Veteran's",
 "doctor's",
 "boy's",
 'Asian-inspired',
 'high-altitude',
 'autograph',
 'sports-bra',
 'face-painting',
 'non-grassy',
 "1950's",
 'roller-skaters',
 'youngest',
 'duty',
 'll',
 'negro',
 "catcher's",
 "ship's",
 'high-fashion',
 'thumbs-up',
 'pick-up',
 'sun-lit',
 'loop-the-loop',
 'star-shaped',
 'ninety-six',
 'dimly-lit',
 'Families',
 'canoe-type',
 'orthodontist',
 "Bird's",
 'spider-patterned',
 "Ruby's",
 'mini-skirt',
 'happy-appearing',
 'short-haired',
 "He's",
 'dark-haired',
 'blue-checkered',
 '5-person',
 'Corazon',
 'Mrs',
 'Bali',
 'clients',
 'sword-fighting',
 'unto',
 "grill's",
 'Professional',
 'V-sign',
 "Harrod's",
 'birds-eye',
 'face-off',
 "There's",
 'high-flying',
 'cross-legged',
 'pi

In [2]:
# Display the English vocabulary object. This raises NameError on a fresh
# kernel unless the cell that assigns en_vocab (via build_vocab) has run first.
en_vocab

NameError: name 'en_vocab' is not defined