In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('../artifacts/data/wmt/processed/train/en-fr.csv')
data.head()

Unnamed: 0,source_text,target_text
0,"Bonjour,","Hello,"
1,Mes livres #PRS_ORG# empruntés restent toujour...,My borrowed #PRS_ORG# books are still protecte...
2,Ça ne fonctionne pas et ça fait plusieurs fois...,It doesn't work and I've been through this pro...
3,Merci.,Thank you.
4,#EMAIL#,#EMAIL#


In [3]:
train_data, val_data = train_test_split(data, test_size=0.2)

In [4]:
from transformers import MarianTokenizer
model_token = 'Helsinki-NLP/opus-mt-fr-en'
tokenizer = MarianTokenizer.from_pretrained(model_token,return_tensors = 'pt')

In [5]:
with tokenizer.as_target_tokenizer():
        encodings = tokenizer('hi bub', max_length = 128,
                                         truncation=True, padding=True,
                                         return_tensors = 'pt')
        
encodings



{'input_ids': tensor([[10648,   383,  2949,     0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [6]:
tokenizer(text_target='hi bub', max_length = 128,
                                         truncation=True, padding=True,
                                         return_tensors = 'pt')

{'input_ids': tensor([[10648,   383,  2949,     0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [18]:
max_length=128

def preprocess_function(example):
        # print(example)
        try:
            inputs = example['source_text']
            targets = example['target_text']
            
            if not isinstance(inputs,(str,list)):
                inputs = str(inputs)
            if not isinstance(targets,(str,list)):
                targets = str(targets)
            
            input_encodings = tokenizer(inputs, max_length = max_length,
                                             truncation=True, padding=True,
                                             return_tensors = 'pt')
            
            target_encodings= tokenizer(max_length = max_length,
                                             truncation=True, padding=True,
                                             return_tensors = 'pt', text_target=targets)
            
            return {
                'input_ids' : input_encodings['input_ids'],
                'attention_mask' : input_encodings['attention_mask'],
                'labels': target_encodings['input_ids']       
            }
        except Exception as e:
            print('---------------------------------')
            print(e)
            print(example)
            print(type(example['source_text']), type(example['target_text']))
            
            

In [19]:
from datasets import load_dataset, load_from_disk

data = load_dataset('csv', data_files='../artifacts/data/wmt/processed/train/en-fr.csv')
data_pt = data.map(preprocess_function, batched=True)
data_pt.save_to_disk('./processed_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/15025 [00:00<?, ? examples/s]

In [20]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
model_fr_en = AutoModelForSeq2SeqLM.from_pretrained(model_token)

In [21]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer,model = model_fr_en)
        
data_pt = load_from_disk('./processed_dataset')

In [22]:
from transformers import TrainingArguments, Trainer

In [23]:
trainer_args = TrainingArguments(
            output_dir='.output',
            num_train_epochs = 3,
            learning_rate = 2e-5,
            per_device_train_batch_size = 32,
            per_device_eval_batch_size= 32,
            weight_decay= 0.02
        )
        
trainer = Trainer(
            model = model_fr_en, args = trainer_args,
            tokenizer = tokenizer, data_collator = seq2seq_data_collator,
            train_dataset = data_pt['train'],
            
        )
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

## Configured till above

In [20]:
import evaluate
metric = evaluate.load('sacrebleu')

In [34]:
from torch.utils.data import DataLoader
train_dataloader=DataLoader(tokenized_datasets,shuffle =True,collate_fn = data_collator,batch_size=8)
eval_dataloader=DataLoader(test_datasets,collate_fn = data_collator,batch_size=8)

In [35]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x30c640710>

In [36]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_token)

In [37]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)

In [38]:
from accelerate import Accelerator
accelerator = Accelerator()
model,optimizer,train_dataloader,eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

In [39]:
from transformers import get_scheduler
num_train_epochs =3
num_steps = len(train_dataloader)
num_train_steps = num_train_epochs * num_steps

lr_scheduler = get_scheduler('linear', optimizer =optimizer,num_warmup_steps =0,num_training_steps=num_train_steps)

In [40]:
import numpy as np

def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

In [41]:
output_dir = './saved_model/'

In [42]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_train_steps))


for epoch in range(num_train_epochs):
    
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)


  0%|          | 0/4509 [00:00<?, ?it/s]

RuntimeError: each element in list of batch should be of equal size