In [8]:
import torch
import torchtext
from torchtext.datasets import Multi30k
from torchtext.vocab import build_vocab_from_iterator

from tqdm import tqdm

import spacy

In [2]:
!python -m spacy download en_core_web_md de_core_news_md
!pip install portalocker

2023-12-01 01:54:29.984194: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 01:54:29.984282: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 01:54:29.984327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now l

In [3]:
# load data sets
train_data, val_data, test_data = Multi30k(language_pair= ('de', 'en'))
next(iter(train_data))

('Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'Two young, White males are outside near many bushes.')

In [4]:
# define tokenizers
nlp_de = spacy.load("de_core_news_md")
nlp_en = spacy.load("en_core_web_md")

# create token iterables for each language
def get_de_tokens():
  for de_phrase, _ in iter(train_data):
    yield [token.text for token in nlp_en.tokenizer(de_phrase)]

def get_en_tokens():
  for _, en_phrase in iter(train_data):
    yield [token.text for token in nlp_en.tokenizer(en_phrase)]

tokens_de = get_de_tokens()
print(f"deutch tokens: {next(tokens_de)}")

# create deutch vocabulary
vocab_de = build_vocab_from_iterator(tokens_de, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

tokens_en = get_en_tokens()
print(f"english tokens: {next(tokens_en)}")

# create english vocabulary
vocab_en = build_vocab_from_iterator(tokens_en, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

# set unknown token index
vocab_de.set_default_index(vocab_de['<unk>'])
vocab_en.set_default_index(vocab_en['<unk>'])




deutch tokens: ['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'i', 'm', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']
english tokens: ['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [5]:
print(f"hello world I am being tokenized ->", end=" ")

[vocab_en[token.text] for token in nlp_en.tokenizer("hello world I am being tokenized")]

hello world I am being tokenized -> 

[5465, 1870, 1166, 3426, 194, 0]

In [9]:
# convert text to vectors

def data_process(texts: torch.utils.data.IterDataPipe):
  data = []
  for (raw_de, raw_en) in tqdm(iter(texts)):
    de_tensor_ = torch.tensor([vocab_de[token.text] for token in nlp_de(raw_de)],
                            dtype=torch.long)
    en_tensor_ = torch.tensor([vocab_en[token.text] for token in nlp_en(raw_en)],
                            dtype=torch.long)
    data.append((de_tensor_, en_tensor_))
  return data

train_data = data_process(train_data)
val_data = data_process(val_data)
test_data = data_process(test_data)

#TODO: iterable data loader
# https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset

5409it [02:06, 42.75it/s]


KeyboardInterrupt: ignored