In [1]:
import torch
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm, tqdm_notebook
from transformers import AdamW
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [2]:
use_sample = True
valid_step = 50

In [3]:
config = GPT2Config(n_positions=1024//2, n_ctx=1024//2, n_embd=768//6, n_layer=12//6, n_head=12//6)

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [5]:
model = GPT2LMHeadModel(config).cuda()
# model = GPT2LMHeadModel.from_pretrained('./gpt2')

In [6]:
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# outputs = model(**inputs, labels=inputs["input_ids"], output_attentions=True)
# loss = outputs.loss
# logits = outputs.logits

In [7]:
# ! wget http://www.statmt.org/europarl/v7/fr-en.tgz

In [8]:
# ! tar zxvf fr-en.tgz

In [9]:
! ls -al *.en *.fr

-rw-r--r-- 1 jkfirst deep-learners  30869430 10월  9 17:07 europarl-v7.fr-en.test.en
-rw-r--r-- 1 jkfirst deep-learners  35007266 10월  9 17:07 europarl-v7.fr-en.test.fr
-rw-r--r-- 1 jkfirst deep-learners 209887699 10월  9 17:07 europarl-v7.fr-en.train.en
-rw-r--r-- 1 jkfirst deep-learners 242204275 10월  9 17:08 europarl-v7.fr-en.train.fr
-rw-r--r-- 1 jkfirst deep-learners  60766172 10월  9 17:08 europarl-v7.fr-en.valid.en
-rw-r--r-- 1 jkfirst deep-learners  69708260 10월  9 17:08 europarl-v7.fr-en.valid.fr
-rw-r--r-- 1 jkfirst deep-learners     13743 10월 11 07:46 sample.test.en
-rw-r--r-- 1 jkfirst deep-learners     15524 10월 11 07:46 sample.test.fr
-rw-r--r-- 1 jkfirst deep-learners    119064 10월 11 07:46 sample.train.en
-rw-r--r-- 1 jkfirst deep-learners    134973 10월 11 07:46 sample.train.fr
-rw-r--r-- 1 jkfirst deep-learners     32738 10월 11 07:46 sample.valid.en
-rw-r--r-- 1 jkfirst deep-learners     35821 10월 11 07:46 sample.valid.fr


In [10]:
! wc -l europarl-v7.fr-en.*

   207723 europarl-v7.fr-en.test.en
   207723 europarl-v7.fr-en.test.fr
  1400000 europarl-v7.fr-en.train.en
  1400000 europarl-v7.fr-en.train.fr
   400000 europarl-v7.fr-en.valid.en
   400000 europarl-v7.fr-en.valid.fr
  4015446 total


In [11]:
! wc -l sample.*

   100 sample.test.en
   100 sample.test.fr
   700 sample.train.en
   700 sample.train.fr
   200 sample.valid.en
   200 sample.valid.fr
  2000 total


In [12]:
! head -n 1 europarl-v7.fr-en.*

==> europarl-v7.fr-en.test.en <==
Having personally participated in a parliamentary delegation led by the Member, Gutiérrez Díaz, at the Intergovernmental Conference in Malta in 1997, we recognize that a lot of problems, including political ones, in certain areas of the Mediterranean basin are an obstacle or at least a brake on the launching of substantial cooperation.

==> europarl-v7.fr-en.test.fr <==
Ayant personnellement participé à la délégation parlementaire présidée par M. Gutiérrez Díaz à la conférence intergouvernementale de Malte en 1997, nous reconnaissons que de nombreux problèmes, d'ordre politique aussi, constituent, dans certaines zones du bassin méditerranéen, un obstacle, ou pour le moins un frein à la mise en oeuvre d'une véritable coopération.

==> europarl-v7.fr-en.train.en <==
Resumption of the session

==> europarl-v7.fr-en.train.fr <==
Reprise de la session

==> europarl-v7.fr-en.valid.en <==
Indeed, it helps clarify the means of redress available in

In [13]:
! head -n 1 sample.*

==> sample.test.en <==
In six months' time, it may be appropriate to carry out an analysis of the outcome and also to look more closely at the new situation' s effects upon the Commission' s role.

==> sample.test.fr <==
Il pourrait être judicieux de procéder, vers le milieu de l'année, à une analyse des résultats, mais aussi d' examiner de plus près l' incidence de la nouvelle situation sur le rôle de la Commission.

==> sample.train.en <==
Resumption of the session

==> sample.train.fr <==
Reprise de la session

==> sample.valid.en <==
I should also like to make a few comments, firstly, Mr Berend, regarding the assessment you have made of this sixth periodic report.

==> sample.valid.fr <==
Je voudrais à mon tour faire quelques observations, d'abord sur le jugement que vous portez, Monsieur le Rapporteur, sur ce sixième rapport périodique.


In [14]:
if use_sample:
    total_rows = 1000
else:
    total_rows = 2007723

In [15]:
n_train = total_rows * 70 // 100
n_valid = total_rows * 20 // 100
n_test = total_rows - n_train - n_valid

In [16]:
n_train, n_valid, n_test, n_train+n_valid+n_test==total_rows

(700, 200, 100, True)

In [17]:
n_support = 3

In [18]:
class TranslationDataset():
    def __init__(self, src_filename, tgt_filename, tokenizer, n_support):
        '''
        src_filename: french
        tgt_filename: english
        '''
        self.tokenizer = tokenizer
        self.n_support = n_support
        self.task = 'Translate French to English'
        self.task_delim = ' : '
        self.data_delim = ' => '
        
        print(f'reading src: {src_filename}')
        src = self.read_file(src_filename)
        print(f'reading tgt: {tgt_filename}')
        tgt = self.read_file(tgt_filename)
        self.text = self.concatenate_with_delim(src, tgt)
        self.text = [self.task + self.task_delim + t for t in self.text]
        
        # make input_ids and attention_mask
        self.input_tensor = list(map(lambda t: self.tokenizer(t, return_tensors='pt'), self.text))
        # self.input_tensor = []
        # for t in tqdm(self.text, desc='making tensor'):
        #     self.input_tensor.append(tokenizer(t, return_tensors='pt'))
        # self.input_ids = [tensor['input_ids'] for tensor in input_tensor]
        # self.attention_mask = [tensor['attention_mask'] for tensor in input_tensor]
        
#         self.src = src
#         self.src_attention = src_attention
#         self.tgt = tgt
#         self.tgt_attention = tgt_attention
        
    def concatenate_with_delim(self, src, tgt):
        text_list = []
        cnt = 0
        text = ''
        for s, t in zip(src, tgt):
            text += f'{s} => {t}'
            if cnt%self.n_support == self.n_support - 1:
                text_list.append(text)
                cnt = -1
                text = ''
            else:
                text += ' | '
            
            cnt += 1
        return text_list
            
    def read_file(self, filename):
        text_list = []
        with open(filename, 'r') as f:
            for oneline in tqdm(f, desc=f'reading {filename}'):
                oneline = oneline.rstrip()
                text_list.append(oneline)
        return text_list
    
    def __getitem__(self, idx):
        return self.input_tensor[idx]
    
    def __len__(self):
        return len(self.input_tensor)

In [19]:
if use_sample:
    train_dataset = TranslationDataset('sample.train.fr', 'sample.train.en', tokenizer, n_support)
    valid_dataset = TranslationDataset('sample.valid.fr', 'sample.valid.en', tokenizer, n_support)
    test_dataset = TranslationDataset('sample.test.fr', 'sample.test.en', tokenizer, n_support)
else:
    train_dataset = TranslationDataset('europarl-v7.fr-en.train.fr', 'europarl-v7.fr-en.train.en', tokenizer, n_support)
    valid_dataset = TranslationDataset('europarl-v7.fr-en.valid.fr', 'europarl-v7.fr-en.valid.en', tokenizer, n_support)
    test_dataset = TranslationDataset('europarl-v7.fr-en.test.fr', 'europarl-v7.fr-en.test.en', tokenizer, n_support)

reading sample.train.fr: 700it [00:00, 334131.42it/s]
reading sample.train.en: 700it [00:00, 464706.05it/s]

reading src: sample.train.fr
reading tgt: sample.train.en



reading sample.valid.fr: 200it [00:00, 322019.50it/s]
reading sample.valid.en: 200it [00:00, 442203.90it/s]
reading sample.test.fr: 100it [00:00, 205603.14it/s]
reading sample.test.en: 100it [00:00, 285910.29it/s]

reading src: sample.valid.fr
reading tgt: sample.valid.en
reading src: sample.test.fr
reading tgt: sample.test.en





In [20]:
train_dataset.text[-1]

"Translate French to English : Concernant la fiscalité, les aides d' État, les Fonds structurels, la défense de nos productions traditionnelles il est urgent d' imaginer des mesures concrètes marquées par l' audace et par l' ambition. => As regards taxation, state aid, the Structural Funds and defending our traditional products, practical measures characterised by daring and ambition must be planned as a matter of urgency. | Faute de quoi, la convergence et la cohésion ne resteront malheureusement pour nous que des mots et il est à craindre que la politique structurelle menée dans nos régions, malgré l' importance des sommes engagées, se soldera par un échec. => If these do not materialise, then, unfortunately, convergence and cohesion will remain no more than words for us, and it is to be feared that the structural policy undertaken in our regions, despite the size of the amounts committed, will end in failure. | Monsieur le Président, à mon tour je voudrais, comme tous les orateurs l

In [21]:
# Token indices sequence length is longer than the specified maximum sequence length for this model (1049 > 1024). Running this sequence through the model will result in indexing errors

In [22]:
def collate_fn(batch):
    input_ids = []
    attention_mask = []
    for b in batch:
        input_ids.append(b['input_ids'][0][:512])
        attention_mask.append(b['attention_mask'][0][:512])
    
    # padding
    input_ids = pad_sequence(input_ids, batch_first=True)
    attention_mask = pad_sequence(attention_mask, batch_first=True)

    # make return dict
    ret = {
        'input_ids': input_ids.cuda(),
        'attention_mask': attention_mask.cuda()
    }
    return ret

In [23]:
# collate_fn은 batch 단위의 데이터에 적용해야 하는 작업을 수행할 때 사용하면 된다.
# 가령, 모델의 입력 데이터 사이즈는 일정해야 하기 때문에 pad_sequence 등의 함수를 통해 길이를 맞춰줘야 한다.
# 이 작업을 Dataset에서 할 경우 불필요하게 메모리를 많이 사용하게 되기 때문에
# collate_fn을 이용해서 각 batch가 생성될 때마다 pad_sequence를 적용해주는 것이다.
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=8, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, collate_fn=collate_fn, batch_size=4, shuffle=False)
test_dataloader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=4, shuffle=False)

In [24]:
for i, batch in enumerate(train_dataloader):
    if i > 100:
        break
    print(batch)

{'input_ids': tensor([[ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,   290,  5120,   287],
        ...,
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}
{'input_ids': tensor([[ 8291, 17660,  4141,  ...,  8475, 15221,    13],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        ...,
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,

In [25]:
# out = model(**batch, labels=batch['input_ids'], output_attentions=True)

In [26]:
# att = out.attentions[-1]
# len(att)

In [27]:
# Optimizer와 Loss 함수는 가장 일반적인 것으로 정의했다.
# 이 노트북 파일의 목적은 BERT를 이용해서 높은 성능의 모델을 간편하게 만들 수 있다는 것을 보여주기 위함이다.
# Optimizer와 Loss를 최적화할 경우 좋은 성능이 나온 이유를 잘 설명할 수 없다.
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
loss = nn.CrossEntropyLoss()

In [28]:
def train(dataloader, epoch):
    model.train()
    cnt = 0
    total_loss = 0
    save_step = 100
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        out = model(**batch, labels=batch['input_ids'])
        loss = out.loss
        loss.backward()
        optimizer.step()
        
        if cnt and (cnt % save_step == 0):
            torch.save(model.state_dict(), f'__very-small-gpt2-{epoch}.bin')
            
        cnt += 1
        total_loss += loss.data.item()
    return total_loss/cnt

In [29]:
def valid(dataloader):
    model.eval()
    cnt = 0
    total_loss = 0
    for batch in tqdm(dataloader):
        out = model(**batch, labels=batch['input_ids'])
        loss = out.loss.data.item()
        cnt += 1
        total_loss += loss
    return total_loss/cnt

In [30]:
def test(dataloader):
    model.eval()
    cnt = 0
    total_loss = 0
    for batch in tqdm(dataloader):
        out = model(**batch, labels=batch['input_ids'])
        loss = out.loss.data.item()
        cnt += 1
        total_loss += loss
    return total_loss/cnt

In [31]:
train_dataset.text[0]

'Translate French to English : Reprise de la session => Resumption of the session | Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. | Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => Although, as you will have seen, the dreaded \'millennium bug\' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.'

In [32]:
valid_dataset.text[0]

"Translate French to English : Je voudrais à mon tour faire quelques observations, d'abord sur le jugement que vous portez, Monsieur le Rapporteur, sur ce sixième rapport périodique. => I should also like to make a few comments, firstly, Mr Berend, regarding the assessment you have made of this sixth periodic report. | Vous en avez souligné la qualité et vous avez même écrit, si je ne me trompe, que par rapport à ceux qui le précédaient, il marquait une vraie amélioration. => You pointed out the quality of the report and you even wrote, if I am not mistaken, that it marked a real improvement in comparison with previous reports. | Au nom de tous les fonctionnaires de la Commission et de mon prédécesseur, Mme Wulf-Mathies, je tiens à vous dire que nous avons été très sensibles à cette appréciation portée par votre Assemblée et par vous-même. => On behalf of all the officials of the Commission and my predecessor, Mrs Wulf-Mathies, I must inform you that we were very alert to the evaluatio

In [33]:
test_dataset.text[0]

"Translate French to English : Il pourrait être judicieux de procéder, vers le milieu de l'année, à une analyse des résultats, mais aussi d' examiner de plus près l' incidence de la nouvelle situation sur le rôle de la Commission. => In six months' time, it may be appropriate to carry out an analysis of the outcome and also to look more closely at the new situation' s effects upon the Commission' s role. | La question de savoir comment nous procéderions par la suite a, jusqu' à nouvel ordre, trouvé sa réponse dans l' idée d' organiser un congrès institutionnel global, où serait ouvert un débat sans conditions et aux perspectives les plus larges, entre des représentants des différents intérêts existants. => The question of how best to make further progress has so far been solved through the idea of holding an inter-institutional congress which will open up an unbiased debate adopting a broad perspective and involving representatives of different interests. | L' on aura alors l' occasion

In [34]:
tbar = tqdm(range(100000))
best_loss = 10000
for i in tbar:
    train_loss = train(train_dataloader, i)
    
    if i % valid_step == 0:
        valid_loss = valid(valid_dataloader)
        print(f'validation loss: {valid_loss}')
        
        if valid_loss < best_loss:
            torch.save(model.state_dict(), '__very-small-gpt2.bin')
            best_loss = valid_loss
            print(f'best_loss: {best_loss}')
        
    tbar.set_postfix(loss=train_loss)

  0%|          | 0/100000 [00:00<?, ?it/s]

0





In [42]:
# for i in range(1000):
#     train(train_dataloader)
    
#     if i % valid_step == 0:
#         valid_loss = valid(valid_dataloader)
#         print(f'validation loss: {valid_loss}')

In [43]:
# torch.save(model.state_dict(), 'very-small-gpt2.bin')

### test

In [7]:
import torch
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm, tqdm_notebook
from transformers import AdamW
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel

In [3]:
config = GPT2Config(n_positions=1024//2, n_ctx=1024//2, n_embd=768//6, n_layer=12//6, n_head=12//6)

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [5]:
model = GPT2LMHeadModel(config)

In [8]:
model = DistributedDataParallel(model)

RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

In [6]:
# 학습한 모델 로딩
model.load_state_dict(torch.load('__very-small-gpt2-valid-900.bin', map_location='cpu'))
model.eval()

RuntimeError: Error(s) in loading state_dict for GPT2LMHeadModel:
	Missing key(s) in state_dict: "transformer.wte.weight", "transformer.wpe.weight", "transformer.h.0.ln_1.weight", "transformer.h.0.ln_1.bias", "transformer.h.0.attn.bias", "transformer.h.0.attn.masked_bias", "transformer.h.0.attn.c_attn.weight", "transformer.h.0.attn.c_attn.bias", "transformer.h.0.attn.c_proj.weight", "transformer.h.0.attn.c_proj.bias", "transformer.h.0.ln_2.weight", "transformer.h.0.ln_2.bias", "transformer.h.0.mlp.c_fc.weight", "transformer.h.0.mlp.c_fc.bias", "transformer.h.0.mlp.c_proj.weight", "transformer.h.0.mlp.c_proj.bias", "transformer.h.1.ln_1.weight", "transformer.h.1.ln_1.bias", "transformer.h.1.attn.bias", "transformer.h.1.attn.masked_bias", "transformer.h.1.attn.c_attn.weight", "transformer.h.1.attn.c_attn.bias", "transformer.h.1.attn.c_proj.weight", "transformer.h.1.attn.c_proj.bias", "transformer.h.1.ln_2.weight", "transformer.h.1.ln_2.bias", "transformer.h.1.mlp.c_fc.weight", "transformer.h.1.mlp.c_fc.bias", "transformer.h.1.mlp.c_proj.weight", "transformer.h.1.mlp.c_proj.bias", "transformer.ln_f.weight", "transformer.ln_f.bias", "lm_head.weight". 
	Unexpected key(s) in state_dict: "module.transformer.wte.weight", "module.transformer.wpe.weight", "module.transformer.h.0.ln_1.weight", "module.transformer.h.0.ln_1.bias", "module.transformer.h.0.attn.bias", "module.transformer.h.0.attn.masked_bias", "module.transformer.h.0.attn.c_attn.weight", "module.transformer.h.0.attn.c_attn.bias", "module.transformer.h.0.attn.c_proj.weight", "module.transformer.h.0.attn.c_proj.bias", "module.transformer.h.0.ln_2.weight", "module.transformer.h.0.ln_2.bias", "module.transformer.h.0.mlp.c_fc.weight", "module.transformer.h.0.mlp.c_fc.bias", "module.transformer.h.0.mlp.c_proj.weight", "module.transformer.h.0.mlp.c_proj.bias", "module.transformer.h.1.ln_1.weight", "module.transformer.h.1.ln_1.bias", "module.transformer.h.1.attn.bias", "module.transformer.h.1.attn.masked_bias", "module.transformer.h.1.attn.c_attn.weight", "module.transformer.h.1.attn.c_attn.bias", "module.transformer.h.1.attn.c_proj.weight", "module.transformer.h.1.attn.c_proj.bias", "module.transformer.h.1.ln_2.weight", "module.transformer.h.1.ln_2.bias", "module.transformer.h.1.mlp.c_fc.weight", "module.transformer.h.1.mlp.c_fc.bias", "module.transformer.h.1.mlp.c_proj.weight", "module.transformer.h.1.mlp.c_proj.bias", "module.transformer.ln_f.weight", "module.transformer.ln_f.bias", "module.lm_head.weight". 

In [45]:
def one_data_test(src, example_tuple):
    # set support dataset
    text = f'Translate French to English : '
    for i, tup in enumerate(example_tuple):
        text += f'{tup} | '
    
    # make prompt
    text += f'{src} => '
    
    # make input tensor
    inputs = tokenizer(text, return_tensors='pt')
    inputs['input_ids'] = inputs['input_ids'].cuda()
    inputs['attention_mask'] = inputs['attention_mask'].cuda()
    print(inputs['input_ids'].shape)
    
    # generate text
    greedy_output = model.generate(**inputs, max_length=512)
    

    print("Output:\n" + 100 * '-')
    tgt = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
    outputs = tokenizer(tgt, return_tensors='pt')
    print(outputs['input_ids'].shape)
#     out = model(**inputs)
    return tgt

In [48]:
example_tuple = (
    'Reprise de la session => Resumption of the session',
    'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
    #'Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => '
)
#src = "Il pourrait être judicieux de procéder, vers le milieu de l'année, à une analyse des résultats, mais aussi d' examiner de plus près l' incidence de la nouvelle situation sur le rôle de la Commission."
#src = 'Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => Although, as you will have seen, the dreaded \'millennium bug\' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.'
src = 'Reprise de la session'
out = one_data_test(src, example_tuple)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 135])
Output:
----------------------------------------------------------------------------------------------------
torch.Size([1, 531])


In [49]:
out

'Translate French to English : Reprise de la session => Resumption of the session | Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. | Reprise de la session =>  theorem FOR AmarReasonReasonoman intellectuals Clash Clash." booking incom incomasticalastical TW TW TW TW TW TW�Remember Claud reass reass reass reass worldly Kuhoscopeoscopeoscopeeffeffbrainer Southwest Southwest Southwest Southwest Southwest Southwest Southwest FIFA ger orally.(.(.(.(.(.(.( Debatequalityqualityqualityquality Like tamptrialprop Ministry moltenaer basin 1929However FSA caffe […] […] […] […]packed markers markers Yusinterstitial Vegeta troubles troubles troub

In [40]:
for batch in valid_dataloader:
    tgt = tokenizer.decode(batch['input_ids'][0], skip_special_tokens=True)
    break

In [41]:
tgt

"Translate French to English : Je voudrais à mon tour faire quelques observations, d'abord sur le jugement que vous portez, Monsieur le Rapporteur, sur ce sixième rapport périodique. => I should also like to make a few comments, firstly, Mr Berend, regarding the assessment you have made of this sixth periodic report. | Vous en avez souligné la qualité et vous avez même écrit, si je ne me trompe, que par rapport à ceux qui le précédaient, il marquait une vraie amélioration. => You pointed out the quality of the report and you even wrote, if I am not mistaken, that it marked a real improvement in comparison with previous reports. | Au nom de tous les fonctionnaires de la Commission et de mon prédécesseur, Mme Wulf-Mathies, je tiens à vous dire que nous avons été très sensibles à cette appréciation portée par votre Assemblée et par vous-même. => On behalf of all the officials of the Commission and my predecessor, Mrs Wulf-Mathies, I must inform you that we were very alert to the evaluatio

In [43]:
train_dataset.text[0]

'Translate French to English : Reprise de la session => Resumption of the session | Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. | Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => Although, as you will have seen, the dreaded \'millennium bug\' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.'