In [1]:
import torch
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
from transformers import AdamW
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [2]:
use_sample = False
valid_step = 50

In [3]:
config = GPT2Config(n_positions=1024//2, n_ctx=1024//2, n_embd=768//6, n_layer=12//6, n_head=12//6)

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2')

In [5]:
model = GPT2LMHeadModel(config).cuda()
# model = GPT2LMHeadModel.from_pretrained('./gpt2')

In [6]:
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# outputs = model(**inputs, labels=inputs["input_ids"], output_attentions=True)
# loss = outputs.loss
# logits = outputs.logits

In [7]:
# ! wget http://www.statmt.org/europarl/v7/fr-en.tgz

In [8]:
# ! tar zxvf fr-en.tgz

In [9]:
! ls -al *.en *.fr

-rw-rw-r-- 1 jkfirst jkfirst  30869430 10월  8 12:30 europarl-v7.fr-en.test.en
-rw-rw-r-- 1 jkfirst jkfirst  35007266 10월  8 12:31 europarl-v7.fr-en.test.fr
-rw-rw-r-- 1 jkfirst jkfirst 209887699 10월  8 12:26 europarl-v7.fr-en.train.en
-rw-rw-r-- 1 jkfirst jkfirst 242204275 10월  8 12:26 europarl-v7.fr-en.train.fr
-rw-rw-r-- 1 jkfirst jkfirst  60766172 10월  8 12:30 europarl-v7.fr-en.valid.en
-rw-rw-r-- 1 jkfirst jkfirst  69708260 10월  8 12:31 europarl-v7.fr-en.valid.fr
-rw-rw-r-- 1 jkfirst jkfirst     13743 10월  8 12:21 sample.test.en
-rw-rw-r-- 1 jkfirst jkfirst     15524 10월  8 12:22 sample.test.fr
-rw-rw-r-- 1 jkfirst jkfirst    119064 10월  8 12:20 sample.train.en
-rw-rw-r-- 1 jkfirst jkfirst    134973 10월  8 12:21 sample.train.fr
-rw-rw-r-- 1 jkfirst jkfirst     32738 10월  8 12:21 sample.valid.en
-rw-rw-r-- 1 jkfirst jkfirst     35821 10월  8 12:22 sample.valid.fr


In [10]:
! wc -l europarl-v7.fr-en.*

   207723 europarl-v7.fr-en.test.en
   207723 europarl-v7.fr-en.test.fr
  1400000 europarl-v7.fr-en.train.en
  1400000 europarl-v7.fr-en.train.fr
   400000 europarl-v7.fr-en.valid.en
   400000 europarl-v7.fr-en.valid.fr
  4015446 total


In [11]:
! wc -l sample.*

   100 sample.test.en
   100 sample.test.fr
   700 sample.train.en
   700 sample.train.fr
   200 sample.valid.en
   200 sample.valid.fr
  2000 total


In [12]:
! head -n 1 europarl-v7.fr-en.*

==> europarl-v7.fr-en.test.en <==
Having personally participated in a parliamentary delegation led by the Member, Gutiérrez Díaz, at the Intergovernmental Conference in Malta in 1997, we recognize that a lot of problems, including political ones, in certain areas of the Mediterranean basin are an obstacle or at least a brake on the launching of substantial cooperation.

==> europarl-v7.fr-en.test.fr <==
Ayant personnellement participé à la délégation parlementaire présidée par M. Gutiérrez Díaz à la conférence intergouvernementale de Malte en 1997, nous reconnaissons que de nombreux problèmes, d'ordre politique aussi, constituent, dans certaines zones du bassin méditerranéen, un obstacle, ou pour le moins un frein à la mise en oeuvre d'une véritable coopération.

==> europarl-v7.fr-en.train.en <==
Resumption of the session

==> europarl-v7.fr-en.train.fr <==
Reprise de la session

==> europarl-v7.fr-en.valid.en <==
Indeed, it helps clarify the means of redress available in

In [13]:
! head -n 1 sample.*

==> sample.test.en <==
In six months' time, it may be appropriate to carry out an analysis of the outcome and also to look more closely at the new situation' s effects upon the Commission' s role.

==> sample.test.fr <==
Il pourrait être judicieux de procéder, vers le milieu de l'année, à une analyse des résultats, mais aussi d' examiner de plus près l' incidence de la nouvelle situation sur le rôle de la Commission.

==> sample.train.en <==
Resumption of the session

==> sample.train.fr <==
Reprise de la session

==> sample.valid.en <==
I should also like to make a few comments, firstly, Mr Berend, regarding the assessment you have made of this sixth periodic report.

==> sample.valid.fr <==
Je voudrais à mon tour faire quelques observations, d'abord sur le jugement que vous portez, Monsieur le Rapporteur, sur ce sixième rapport périodique.


In [14]:
if use_sample:
    total_rows = 1000
else:
    total_rows = 2007723

In [15]:
n_train = total_rows * 70 // 100
n_valid = total_rows * 20 // 100
n_test = total_rows - n_train - n_valid

In [16]:
n_train, n_valid, n_test, n_train+n_valid+n_test==total_rows

(1405406, 401544, 200773, True)

In [17]:
n_support = 3

In [18]:
class TranslationDataset():
    def __init__(self, src_filename, tgt_filename, tokenizer, n_support):
        '''
        src_filename: french
        tgt_filename: english
        '''
        self.tokenizer = tokenizer
        self.n_support = n_support
        self.task = 'Translate French to English'
        self.task_delim = ' : '
        self.data_delim = ' => '
        
        print(f'reading src: {src_filename}')
        src = self.read_file(src_filename)
        print(f'reading tgt: {tgt_filename}')
        tgt = self.read_file(tgt_filename)
        self.text = self.concatenate_with_delim(src, tgt)
        self.text = [self.task + self.task_delim + t for t in self.text]
        
        # make input_ids and attention_mask
        self.input_tensor = list(map(lambda t: self.tokenizer(t, return_tensors='pt'), self.text))
        # self.input_ids = [tensor['input_ids'] for tensor in input_tensor]
        # self.attention_mask = [tensor['attention_mask'] for tensor in input_tensor]
        
#         self.src = src
#         self.src_attention = src_attention
#         self.tgt = tgt
#         self.tgt_attention = tgt_attention
        
    def concatenate_with_delim(self, src, tgt):
        text_list = []
        cnt = 0
        text = ''
        for s, t in zip(src, tgt):
            text += f'{s} => {t}'
            if cnt%self.n_support == self.n_support - 1:
                text_list.append(text)
                cnt = -1
                text = ''
            else:
                text += ' | '
            
            cnt += 1
        return text_list
            
    def read_file(self, filename):
        text_list = []
        with open(filename, 'r') as f:
            for oneline in tqdm(f, desc=f'reading {filename}'):
                oneline = oneline.rstrip()
                text_list.append(oneline)
        return text_list
    
    def __getitem__(self, idx):
        return self.input_tensor[idx]
    
    def __len__(self):
        return len(self.input_tensor)

In [19]:
if use_sample:
    train_dataset = TranslationDataset('sample.train.fr', 'sample.train.en', tokenizer, n_support)
    valid_dataset = TranslationDataset('sample.valid.fr', 'sample.valid.en', tokenizer, n_support)
    test_dataset = TranslationDataset('sample.test.fr', 'sample.test.en', tokenizer, n_support)
else:
    train_dataset = TranslationDataset('europarl-v7.fr-en.train.fr', 'europarl-v7.fr-en.train.en', tokenizer, n_support)
    valid_dataset = TranslationDataset('europarl-v7.fr-en.valid.fr', 'europarl-v7.fr-en.valid.en', tokenizer, n_support)
    test_dataset = TranslationDataset('europarl-v7.fr-en.test.fr', 'europarl-v7.fr-en.test.en', tokenizer, n_support)

reading src: europarl-v7.fr-en.train.fr


reading europarl-v7.fr-en.train.fr: 1400000it [00:01, 845276.33it/s]


reading tgt: europarl-v7.fr-en.train.en


reading europarl-v7.fr-en.train.en: 1400000it [00:01, 1281332.30it/s]


reading src: europarl-v7.fr-en.valid.fr


reading europarl-v7.fr-en.valid.fr: 400000it [00:00, 971274.11it/s] 


reading tgt: europarl-v7.fr-en.valid.en


reading europarl-v7.fr-en.valid.en: 400000it [00:00, 1450023.03it/s]


reading src: europarl-v7.fr-en.test.fr


reading europarl-v7.fr-en.test.fr: 207723it [00:00, 1117934.63it/s]


reading tgt: europarl-v7.fr-en.test.en


reading europarl-v7.fr-en.test.en: 207723it [00:00, 1440752.69it/s]


In [20]:
train_dataset.text[-1]

'Translate French to English : La résolution n\'a aucun effet formel mais elle exprime tout simplement les sentiments du Parlement. => The resolution has no formal effect at all, but is merely an expression of how Parliament feels. | Le vote décisif aura (probablement) lieu vers le début de l\'année prochaine lorsque le Parlement sera invité à donner son approbation à l\'accord. => The deciding vote will (probably) be some time early next year, when Parliament will be asked to give its consent to the agreement. | Si le "non" l\'emporte, l\'accord sera relégué aux oubliettes. => If we get a No vote then, the agreement will be scrapped.'

In [21]:
def collate_fn(batch):
    input_ids = []
    attention_mask = []
    for b in batch:
        input_ids.append(b['input_ids'][0][:512])
        attention_mask.append(b['attention_mask'][0][:512])
    
    # padding
    input_ids = pad_sequence(input_ids, batch_first=True)
    attention_mask = pad_sequence(attention_mask, batch_first=True)

    # make return dict
    ret = {
        'input_ids': input_ids.cuda(),
        'attention_mask': attention_mask.cuda()
    }
    return ret

In [22]:
# collate_fn은 batch 단위의 데이터에 적용해야 하는 작업을 수행할 때 사용하면 된다.
# 가령, 모델의 입력 데이터 사이즈는 일정해야 하기 때문에 pad_sequence 등의 함수를 통해 길이를 맞춰줘야 한다.
# 이 작업을 Dataset에서 할 경우 불필요하게 메모리를 많이 사용하게 되기 때문에
# collate_fn을 이용해서 각 batch가 생성될 때마다 pad_sequence를 적용해주는 것이다.
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=6, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, collate_fn=collate_fn, batch_size=4, shuffle=False)
test_dataloader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=4, shuffle=False)

In [23]:
for i, batch in enumerate(train_dataloader):
    if i > 100:
        break
    print(batch)

{'input_ids': tensor([[ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,  3151,  1866,   286],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}
{'input_ids': tensor([[ 8291, 17660,  4141,  ...,  2932,  5472,    13],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0]], device='cuda:0'

{'input_ids': tensor([[ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,  3031,   287,   597],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}
{'input_ids': tensor([[ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,  5260, 21846,    13],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0],
        [ 8291, 17660,  4141,  ...,     0,     0,     0]], device='cuda:0'

In [24]:
# out = model(**batch, labels=batch['input_ids'], output_attentions=True)

In [25]:
# att = out.attentions[-1]
# len(att)

In [26]:
# Optimizer와 Loss 함수는 가장 일반적인 것으로 정의했다.
# 이 노트북 파일의 목적은 BERT를 이용해서 높은 성능의 모델을 간편하게 만들 수 있다는 것을 보여주기 위함이다.
# Optimizer와 Loss를 최적화할 경우 좋은 성능이 나온 이유를 잘 설명할 수 없다.
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
loss = nn.CrossEntropyLoss()

In [27]:
def train(dataloader):
    model.train()
    cnt = 0
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        out = model(**batch, labels=batch['input_ids'])
        loss = out.loss
        loss.backward()
        optimizer.step()
        cnt += 1
        total_loss += loss.data.item()
        # tbar.set_postfix(loss=loss.data.item())
    return total_loss/cnt

In [28]:
def valid(dataloader):
    model.eval()
    cnt = 0
    total_loss = 0
    for batch in tqdm(dataloader):
        out = model(**batch, labels=batch['input_ids'])
        loss = out.loss.data.item()
        cnt += 1
        total_loss += loss
    return total_loss/cnt

In [29]:
def test(dataloader):
    model.eval()
    cnt = 0
    total_loss = 0
    for batch in tqdm(dataloader):
        out = model(**batch, labels=batch['input_ids'])
        loss = out.loss.data.item()
        cnt += 1
        total_loss += loss
    return total_loss/cnt

In [30]:
train_dataset.text[0]

'Translate French to English : Reprise de la session => Resumption of the session | Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. | Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => Although, as you will have seen, the dreaded \'millennium bug\' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.'

In [31]:
valid_dataset.text[0]

"Translate French to English : En effet, il permet de clarifier les recours possible en cas de violation flagrante de la propriété intellectuelle dans un des pays Partie à l'accord. => Indeed, it helps clarify the means of redress available in the event of flagrant breaches of intellectual property rights in one of the countries party to the agreement. | La Commission a été transparente, l'ACAC ne peut pas aller plus loin que l'acquis communautaire, ne peut pas dépasser les mesures prises dans le cadre des directives européennes, ne peut en aucun cas violer les droits fondamentaux. => The Commission has been transparent, ACTA cannot go beyond the acquis communautaire, cannot exceed any of the measures taken within the framework of the European directives, and cannot under any circumstances violate fundamental rights. | Il a été négocié en dehors des instances internationales traditionnelles (OMC...) car la Chine et l'Inde refusaient tout accord! => It has been negotiated outside the tr

In [32]:
test_dataset.text[0]

"Translate French to English : Ayant personnellement participé à la délégation parlementaire présidée par M. Gutiérrez Díaz à la conférence intergouvernementale de Malte en 1997, nous reconnaissons que de nombreux problèmes, d'ordre politique aussi, constituent, dans certaines zones du bassin méditerranéen, un obstacle, ou pour le moins un frein à la mise en oeuvre d'une véritable coopération. => Having personally participated in a parliamentary delegation led by the Member, Gutiérrez Díaz, at the Intergovernmental Conference in Malta in 1997, we recognize that a lot of problems, including political ones, in certain areas of the Mediterranean basin are an obstacle or at least a brake on the launching of substantial cooperation. | Nous estimons toutefois que ces obstacles ne doivent pas devenir un alibi pour arrêter un processus de collaboration qui devient de plus en plus urgent et pressant. => Nonetheless, we believe that these obstacles must not become an excuse to halt a process of 

In [None]:
tbar = tqdm(range(10000))
best_loss = 10000
for i in tbar:
    train_loss = train(train_dataloader)
    
    if i % valid_step == 0:
        valid_loss = valid(valid_dataloader)
        print(f'validation loss: {valid_loss}')
        
        if valid_loss < best_loss:
            torch.save(model.state_dict(), '_very-small-gpt2.bin')
            best_loss = valid_loss
        
    tbar.set_postfix(loss=train_loss)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [41]:
tbar = tqdm(range(1000))
for i in tbar:
    train_loss = train(train_dataloader)
    
    if i % valid_step == 0:
        valid_loss = valid(valid_dataloader)
        print(f'validation loss: {valid_loss}')
        torch.save(model.state_dict(), '_very-small-gpt2.bin')
        
    tbar.set_postfix(loss=train_loss)

3.9875934684977814

In [42]:
# for i in range(1000):
#     train(train_dataloader)
    
#     if i % valid_step == 0:
#         valid_loss = valid(valid_dataloader)
#         print(f'validation loss: {valid_loss}')

In [43]:
# torch.save(model.state_dict(), 'very-small-gpt2.bin')

### test

In [5]:
import torch

In [6]:
model = GPT2LMHeadModel(config)

In [7]:
# 학습한 모델 로딩
model.load_state_dict(torch.load('_very-small-gpt2.bin', map_location='cpu'))
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 128)
    (wpe): Embedding(512, 128)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Layer

In [45]:
def one_data_test(src, example_tuple):
    # set support dataset
    text = f'Translate French to English : '
    for i, tup in enumerate(example_tuple):
        text += f'{tup} | '
    
    # make prompt
    text += f'{src} => '
    
    # make input tensor
    inputs = tokenizer(text, return_tensors='pt')
    inputs['input_ids'] = inputs['input_ids'].cuda()
    inputs['attention_mask'] = inputs['attention_mask'].cuda()
    print(inputs['input_ids'].shape)
    
    # generate text
    greedy_output = model.generate(**inputs, max_length=512)
    

    print("Output:\n" + 100 * '-')
    tgt = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
    outputs = tokenizer(tgt, return_tensors='pt')
    print(outputs['input_ids'].shape)
#     out = model(**inputs)
    return tgt

In [48]:
example_tuple = (
    'Reprise de la session => Resumption of the session',
    'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
    #'Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => '
)
#src = "Il pourrait être judicieux de procéder, vers le milieu de l'année, à une analyse des résultats, mais aussi d' examiner de plus près l' incidence de la nouvelle situation sur le rôle de la Commission."
#src = 'Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => Although, as you will have seen, the dreaded \'millennium bug\' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.'
src = 'Reprise de la session'
out = one_data_test(src, example_tuple)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 135])
Output:
----------------------------------------------------------------------------------------------------
torch.Size([1, 531])


In [49]:
out

'Translate French to English : Reprise de la session => Resumption of the session | Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. | Reprise de la session =>  theorem FOR AmarReasonReasonoman intellectuals Clash Clash." booking incom incomasticalastical TW TW TW TW TW TW�Remember Claud reass reass reass reass worldly Kuhoscopeoscopeoscopeeffeffbrainer Southwest Southwest Southwest Southwest Southwest Southwest Southwest FIFA ger orally.(.(.(.(.(.(.( Debatequalityqualityqualityquality Like tamptrialprop Ministry moltenaer basin 1929However FSA caffe […] […] […] […]packed markers markers Yusinterstitial Vegeta troubles troubles troub

In [40]:
for batch in valid_dataloader:
    tgt = tokenizer.decode(batch['input_ids'][0], skip_special_tokens=True)
    break

In [41]:
tgt

"Translate French to English : Je voudrais à mon tour faire quelques observations, d'abord sur le jugement que vous portez, Monsieur le Rapporteur, sur ce sixième rapport périodique. => I should also like to make a few comments, firstly, Mr Berend, regarding the assessment you have made of this sixth periodic report. | Vous en avez souligné la qualité et vous avez même écrit, si je ne me trompe, que par rapport à ceux qui le précédaient, il marquait une vraie amélioration. => You pointed out the quality of the report and you even wrote, if I am not mistaken, that it marked a real improvement in comparison with previous reports. | Au nom de tous les fonctionnaires de la Commission et de mon prédécesseur, Mme Wulf-Mathies, je tiens à vous dire que nous avons été très sensibles à cette appréciation portée par votre Assemblée et par vous-même. => On behalf of all the officials of the Commission and my predecessor, Mrs Wulf-Mathies, I must inform you that we were very alert to the evaluatio

In [43]:
train_dataset.text[0]

'Translate French to English : Reprise de la session => Resumption of the session | Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances. => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. | Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles. => Although, as you will have seen, the dreaded \'millennium bug\' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.'