# In [None]:
import pandas as pd
import spacy
from spacy.tokenizer import Tokenizer
from torchtext.data import Field, BucketIterator
import torchtext
import torch
import re
# Select the compute device once at import time: CUDA when torch reports it
# as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


class DataFrameDataset(torchtext.data.Dataset):
    """torchtext dataset built from a DataFrame with ``de`` and ``en`` columns.

    Every row becomes one example: the ``de`` value is bound to the source
    field and the ``en`` value to the target field.
    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        """Turn each row of ``df`` into an example and initialise the dataset.

        Args:
            df: DataFrame whose rows expose ``de`` and ``en`` attributes.
            src_field: field paired with the ``de`` value of each row.
            target_field: field paired with the ``en`` value of each row.
            is_test: accepted for callers that pass it; not read here.
            **kwargs: forwarded unchanged to the parent ``Dataset``.
        """
        field_pairs = [('de', src_field), ('en', target_field)]
        make_example = torchtext.data.Example.fromlist
        built = [
            make_example([record.de, record.en], field_pairs)
            for _, record in df.iterrows()
        ]

        super().__init__(built, field_pairs, **kwargs)

# Load the small German spaCy pipeline; in this section only its vocab is used.
deNLP = spacy.load("de_core_news_sm")
# Tokenizer constructed from the vocab alone: no prefix/suffix/infix rules are
# passed, so none of the pipeline's own tokenization rules are attached here.
# NOTE(review): presumably this splits on whitespace only -- confirm against
# the spaCy Tokenizer documentation.
deTokenizer = Tokenizer(deNLP.vocab)

# Same setup for the English side.
enNLP = spacy.load('en_core_web_sm')
enTokenizer = Tokenizer(enNLP.vocab)

# Characters replaced by a single space before tokenizing: . ' ` " + CR LF
# NOTE(review): '+' is stripped as well. It has always been part of this
# character class (where '+' is a literal, not a quantifier); it is kept so
# that text containing '+' tokenizes as before -- confirm this is intended.
_PUNCT_TO_SPACE = re.compile(r"[.'`\"+\r\n]")

# Any run of whitespace, including a single tab or non-breaking space, so
# that only plain single spaces remain between words.
_WHITESPACE_RUN = re.compile(r"\s+")


def _normalize_text(x):
    """Lower-case ``x``, blank out selected punctuation and squeeze whitespace.

    Args:
        x: raw sentence as a ``str``.

    Returns:
        The cleaned sentence with words separated by single spaces and no
        leading or trailing whitespace.
    """
    lowered = x.lower()
    without_punct = _PUNCT_TO_SPACE.sub(" ", lowered)
    return _WHITESPACE_RUN.sub(" ", without_punct).strip()


def myTokenizerDE(x):
    """Return the token texts of German sentence ``x`` after normalization."""
    return [word.text for word in deTokenizer(_normalize_text(x))]


def myTokenizerEN(x):
    """Return the token texts of English sentence ``x`` after normalization."""
    return [word.text for word in enTokenizer(_normalize_text(x))]

def get_fields(fixed_length=200):
    """Build the German source field and the English target field.

    Args:
        fixed_length: value passed as ``fix_length`` to both fields.

    Returns:
        Tuple ``(SRC, TARGET)``: ``SRC`` tokenizes with ``myTokenizerDE`` and
        ``TARGET`` with ``myTokenizerEN``; all other options are identical.
    """
    shared_options = dict(
        lower=True,
        batch_first=False,
        init_token="<sos>",
        eos_token="<eos>",
        fix_length=fixed_length,
    )
    # Use torchtext.data like the rest of this file (Dataset, Example, Field
    # and BucketIterator all come from there). The torchtext.legacy namespace
    # only exists in releases where those classes are no longer available
    # under torchtext.data, so mixing the two cannot work on any one version.
    SRC = torchtext.data.Field(tokenize=myTokenizerDE, **shared_options)
    TARGET = torchtext.data.Field(tokenize=myTokenizerEN, **shared_options)

    return SRC, TARGET

def preprocess_data(data_dir="/content/drive/MyDrive/Colab Notebooks/WMT2016",
                    batch_size=8, fixed_length=200, min_freq=2):
    """Load the train/valid/test TSV files and wrap them in batch iterators.

    Args:
        data_dir: directory containing ``<split>/<split>.tsv`` for the
            ``train``, ``valid`` and ``test`` splits.
        batch_size: batch size used by all three iterators.
        fixed_length: value passed as ``fix_length`` to both fields.
        min_freq: ``min_freq`` used when building both vocabularies.

    Returns:
        Tuple ``(train_iterator, valid_iterator, test_iterator, SRC, TARGET)``.
        Only the training iterator shuffles.
    """
    split_names = ("train", "valid", "test")
    # Read every split up front so a missing file fails before any dataset
    # is built.
    frames = [
        pd.read_csv(f"{data_dir}/{name}/{name}.tsv", sep='\t')
        for name in split_names
    ]

    field_options = dict(
        lower=True,
        batch_first=False,
        init_token="<sos>",
        eos_token="<eos>",
        fix_length=fixed_length,
    )
    SRC = Field(tokenize=myTokenizerDE, **field_options)
    TARGET = Field(tokenize=myTokenizerEN, **field_options)

    train_dataset, valid_dataset, test_dataset = [
        DataFrameDataset(frame, SRC, TARGET) for frame in frames
    ]

    # Both vocabularies are built from the training split only.
    SRC.build_vocab(train_dataset, min_freq=min_freq)
    TARGET.build_vocab(train_dataset, min_freq=min_freq)

    # Decide the device once and share it across the three iterators.
    run_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def make_iterator(dataset, shuffle):
        """Create an unsorted BucketIterator over ``dataset``."""
        return BucketIterator(
            dataset,
            batch_size=batch_size,
            device=run_device,
            sort=False,
            sort_within_batch=False,
            shuffle=shuffle,
        )

    train_iterator = make_iterator(train_dataset, shuffle=True)
    valid_iterator = make_iterator(valid_dataset, shuffle=False)
    test_iterator = make_iterator(test_dataset, shuffle=False)

    return train_iterator, valid_iterator, test_iterator, SRC, TARGET