# Data Augmentation in NLP Using Back Translation With MarianMT       
> Author: https://zoumanakeita.medium.com/

In [None]:
!pip install transformers
!pip install sentencepiece
from transformers import MarianMTModel, MarianTokenizer

**Note**: 
After the installation, restart the runtime (**Runtime > Restart runtime**) so that the newly installed packages are taken into account.

# Models Configuration

## Configuration of the first model 
This model translates from English to French

In [None]:
# Name of the pretrained checkpoint used for the English -> French step
first_model_name = 'Helsinki-NLP/opus-mt-en-fr'

# Tokenizer loaded from the same checkpoint name, so it matches the model below
first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)

# Pretrained translation model for the English -> French step
first_model = MarianMTModel.from_pretrained(first_model_name)

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

## Configuration of the second model 
This model translates from French to English

In [None]:
# Name of the pretrained checkpoint used for the French -> English step
second_model_name = 'Helsinki-NLP/opus-mt-fr-en'

# Tokenizer loaded from the same checkpoint name, so it matches the model below
second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)

# Pretrained translation model for the French -> English step
second_model = MarianMTModel.from_pretrained(second_model_name)

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

In [None]:
# Small batch of English sentences used as the running example in this notebook
original_texts = [
    "This article aims to perform the back translation for text data augmentation",
    "It is the 25th article by Zoumana on Medium. He loves to give back to the community",
    "The first model translates from English to French, which is a temporary process",
    "The second model finally translates back all the temporary french text into English",
]

original_texts

['This article aims to perform the back translation for text data augmentation',
 'It is the 25th article by Zoumana on Medium. He loves to give back to the community',
 'The first model translates from English to French, which is a temporary process',
 'The second model finally translates back all the temporary french text into English']

In [None]:
def format_batch_texts(language_code, batch_texts):
    """Prefix every text of a batch with a ``>>language_code<<`` tag.

    Args:
        language_code: Code placed between ``>>`` and ``<<`` (e.g. ``"fr"``).
        batch_texts: Iterable of texts to tag.

    Returns:
        A new list holding one tagged string per input text, in input order.
    """
    tagged_texts = []
    for sentence in batch_texts:
        # The tag and the sentence are separated by a single space
        tagged_texts.append(">>{}<< {}".format(language_code, sentence))

    return tagged_texts


In [None]:
# Quick check: every sample sentence should come back prefixed with ">>fr<< "
format_batch_texts("fr", original_texts)

['>>fr<< This article aims to perform the back translation for text data augmentation',
 '>>fr<< It is the 25th article by Zoumana on Medium. He loves to give back to the community',
 '>>fr<< The first model translates from English to French, which is a temporary process',
 '>>fr<< The second model finally translates back all the temporary french text into English']

In [None]:
def perform_translation(batch_texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with the given model / tokenizer pair.

    Args:
        batch_texts: Texts to translate.
        model: Object whose ``generate`` method receives the tokenizer output
            as keyword arguments.
        tokenizer: Callable that encodes the tagged texts; its ``decode``
            method turns each generated sequence back into a string.
        language: Language code handed to ``format_batch_texts`` to build the
            ``>>code<<`` prefix (defaults to ``"fr"``).

    Returns:
        A list with one decoded string per generated sequence.
    """
    # Tag each text with the language code before encoding
    tagged_texts = format_batch_texts(language, batch_texts)

    # Encode the whole batch at once, padded, as PyTorch tensors
    model_inputs = tokenizer(tagged_texts, return_tensors="pt", padding=True)

    # Let the model generate the translated token sequences
    generated_sequences = model.generate(**model_inputs)

    # Decode sequence by sequence, dropping special tokens
    decoded_texts = []
    for sequence in generated_sequences:
        decoded_texts.append(tokenizer.decode(sequence, skip_special_tokens=True))

    return decoded_texts


In [None]:
# English -> French step; the language argument is left at its default ("fr")
translated_texts = perform_translation(original_texts, first_model, first_model_tkn)

In [None]:
# Show the intermediate French translations
translated_texts

["Cet article vise à effectuer la traduction arrière pour l'augmentation de données texte",
 "C'est le 25ème article de Zoumana sur Medium. Il aime rendre à la communauté",
 "Le premier modèle traduit de l'anglais au français, qui est un processus temporaire",
 'Le deuxième modèle traduit enfin tout le texte temporaire français en anglais']

Now we can translate the French texts above back into English by providing the second model and its tokenizer.

In [None]:
# French -> English step. The target language is passed explicitly: relying on
# the default ("fr") would tag the inputs with the wrong language code, and
# perform_back_translation also uses "en" for this step.
back_translated_texts = perform_translation(translated_texts, second_model, second_model_tkn, language="en")

In [None]:
# Show the sentences after the round trip English -> French -> English
back_translated_texts

['This article aims to perform back translation for text data increase',
 "This is Zoumana's 25th article on Medium. He likes to give back to the community.",
 'The first model translates from English to French, which is a temporary process',
 'The second model finally translates all the temporary French text into English']

Instead of performing each step separately, it is more convenient to create a function that generates the back translation right away.

In [None]:
def perform_back_translation(batch_texts, original_language="en", temporary_language="fr"):
    """Back-translate a batch: original -> temporary language -> original.

    The two steps always use the module-level ``first_model`` /
    ``first_model_tkn`` and ``second_model`` / ``second_model_tkn``; the
    language arguments only set the language code passed to
    ``perform_translation`` for each step.

    Args:
        batch_texts: Texts written in the original language.
        original_language: Language code used for the return step (default ``"en"``).
        temporary_language: Language code used for the first step (default ``"fr"``).

    Returns:
        The list of back-translated texts produced by the second step.
    """
    # Step 1: original language -> temporary language
    pivot_texts = perform_translation(
        batch_texts, first_model, first_model_tkn, temporary_language
    )

    # Step 2: temporary language -> original language
    return perform_translation(
        pivot_texts, second_model, second_model_tkn, original_language
    )


In [None]:
def combine_texts(original_texts, back_translated_batch):
    """Merge original and back-translated texts, dropping duplicates.

    Args:
        original_texts: Sequence of original texts.
        back_translated_batch: Sequence of back-translated texts; it is
            concatenated to ``original_texts`` with ``+``.

    Returns:
        A set containing every distinct text from both inputs (unordered).
    """
    all_texts = original_texts + back_translated_batch

    # Building a set keeps a single copy of texts present in both inputs
    return set(all_texts)

In [None]:
# Full round trip in one call, using the default language codes ("en" / "fr")
back_translated_batch = perform_back_translation(original_texts)
back_translated_batch

['This article aims to perform back translation for text data increase',
 "This is Zoumana's 25th article on Medium. He likes to give back to the community.",
 'The first model translates from English to French, which is a temporary process',
 'The second model finally translates all the temporary French text into English']

Notice that the third sentence in the original batch is exactly the same as its back-translated version. This is where duplicate suppression comes in: combining both batches into a set keeps only one copy of such sentences.

### Final Augmented Text data. 
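The back translation function is modified below so that it directly returns the augmented data: the original texts combined with their back translations, without duplicates.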

In [None]:
def perform_back_translation_with_augmentation(batch_texts, original_language="en", temporary_language="fr"):
    """Back-translate a batch and merge the result with the input texts.

    Args:
        batch_texts: Texts written in the original language.
        original_language: Language code used for the return step (default ``"en"``).
        temporary_language: Language code used for the first step (default ``"fr"``).

    Returns:
        A set with the distinct texts of ``batch_texts`` and their back translations.
    """
    # Materialise the batch once so it can be both translated and concatenated
    # as a list inside combine_texts
    batch_texts = list(batch_texts)

    # Reuse the existing round-trip helper instead of duplicating its two steps
    back_translated_batch = perform_back_translation(
        batch_texts, original_language, temporary_language
    )

    # Combine with the texts that were actually passed in, not with the
    # notebook-level ``original_texts`` variable
    return combine_texts(batch_texts, back_translated_batch)

In [None]:
# Augmented data: the result is a set, so the displayed order is not the input order
final_augmented = perform_back_translation_with_augmentation(original_texts)
final_augmented

{'It is the 25th article by Zoumana on Medium. He loves to give back to the community',
 'The first model translates from English to French, which is a temporary process',
 'The second model finally translates all the temporary French text into English',
 'The second model finally translates back all the temporary french text into English',
 'This article aims to perform back translation for text data increase',
 'This article aims to perform the back translation for text data augmentation',
 "This is Zoumana's 25th article on Medium. He likes to give back to the community."}