# Background

Author: Christian Resch
Date: January 2024

Google Colab notebook that uses the distilled 600M-parameter NLLB model to translate benchmark datasets, so that various LLMs can be tested in Swahili, Kikuyu (Gikuyu), Hindi and Bhojpuri.

The setup follows this [Medium blog post](https://medium.com/@perezogayo/translating-text-using-meta-ais-nllb-fb189f3a946c).

# 1 Set up NLLB

In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Tokenizer and model weights are both loaded from the same Hub checkpoint.
nllb_checkpoint = 'facebook/nllb-200-distilled-600M'

tokenizer = AutoTokenizer.from_pretrained(nllb_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(nllb_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

# 2 Set up pipelines

In [None]:
#Set up pipelines

def make_translator(tgt_code, src_code='eng_Latn', max_length=200):
  """Build a translation pipeline on top of the already loaded model/tokenizer.

  tgt_code and src_code are passed to the pipeline as tgt_lang / src_lang
  (NLLB language codes such as 'kik_Latn'); max_length is forwarded unchanged.
  """
  return pipeline('translation', model=model, tokenizer=tokenizer,
                  src_lang=src_code, tgt_lang=tgt_code, max_length=max_length)

# Swahili translator
#translator_en_swh = make_translator('swh_Latn')

# Kikuyu translator
translator_en_kik = make_translator('kik_Latn')

# Luo translator (code luo_Latn -- this is not Bhojpuri)
translator_en_luo = make_translator('luo_Latn')

# Hindi translator
#translator_en_hin = make_translator('hin_Deva')

# Bhojpuri translator
#translator_en_bho = make_translator('bho_Deva')



# 3 Load data

In [None]:
import os
from google.colab import drive
import json

In [None]:
# Mount Google Drive at /content/drive so files stored there can be accessed.
drive.mount('/content/drive') # see here for details https://colab.research.google.com/notebooks/io.ipynb#scrollTo=RWSJpsyKqHjH
# Last expression of the cell: shows the current working directory.
os.getcwd()

Mounted at /content/drive


'/content'

In [None]:
# Absolute path: independent of the current directory, so this cell can be re-run safely.
os.chdir('/content/drive/MyDrive/Data/X-CSR_datasets/en')

In [None]:
ls

dev.jsonl  test.jsonl  train.jsonl


In [None]:
# dev.jsonl holds one JSON object per line, so it is parsed line by line.
with open('dev.jsonl', encoding='utf-8') as f:
  en = [json.loads(line) for line in f if line.strip()] # blank lines are skipped instead of crashing json.loads

print("First two entries in English dev dataset")
print(en[:2]) #list of dictionaries
print(en[0].keys())

print('en has length: ' + str(len(en)))

First entry in English dev dataset
[{'id': 'd3845adc08414fda', 'lang': 'en', 'question': {'stem': 'The dental office handled a lot of patients who experienced traumatic mouth injury, where were these patients coming from?', 'choices': [{'label': 'A', 'text': 'town'}, {'label': 'B', 'text': 'michigan'}, {'label': 'C', 'text': 'hospital'}, {'label': 'D', 'text': 'schools'}, {'label': 'E', 'text': 'office building'}]}, 'answerKey': 'C'}, {'id': '35677bbcf00c4e3a', 'lang': 'en', 'question': {'stem': 'Where can you find bald eagles and cheese in the midwest?', 'choices': [{'label': 'A', 'text': 'colorado'}, {'label': 'B', 'text': 'currency'}, {'label': 'C', 'text': 'iowa'}, {'label': 'D', 'text': 'arctic'}, {'label': 'E', 'text': 'wisconsin'}]}, 'answerKey': 'E'}]
dict_keys(['id', 'lang', 'question', 'answerKey'])
en has length: 1000


# 4 Translate English into the target language (`tgt_lang`)

In [None]:
#Import further packages
import time

In [None]:
#Define target language
# tgt_lang names the output sub-directory and is stored in the 'lang' field
# of every translated record.

#tgt_lang = 'kik'
tgt_lang = 'bho'

In [None]:
#Create directory for new language

path = '/content/drive/MyDrive/Data/X-CSR_datasets/'
target_dir = os.path.join(path, tgt_lang)

# isdir (not a name lookup in os.listdir) so a plain file called tgt_lang is
# not mistaken for the output directory; makedirs then fails loudly on it.
if os.path.isdir(target_dir):
  print('Directory ' + tgt_lang + ' already exists in ' + path +'.')
else:
  os.makedirs(target_dir, exist_ok=True)
  print('Created directory ' + tgt_lang + ' in ' + path +'.')

Directory bho already exists in /content/drive/MyDrive/Data/X-CSR_datasets/.


In [None]:
#Set up new dataset
# Accumulator for the translated records; the translation loop appends one dict per entry.

translated_data = []

In [None]:
#translate
# The translator is built from tgt_lang, so the language the model produces
# always matches the 'lang' field of the records and the output directory.

# Short language code (as used for tgt_lang / directory names) -> NLLB target code
nllb_target_codes = {
  'swh': 'swh_Latn',
  'kik': 'kik_Latn',
  'luo': 'luo_Latn',
  'hin': 'hin_Deva',
  'bho': 'bho_Deva',
}

if tgt_lang not in nllb_target_codes:
  raise ValueError('No NLLB language code configured for tgt_lang=' + repr(tgt_lang))

translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang='eng_Latn', tgt_lang=nllb_target_codes[tgt_lang], max_length = 200)


def translate_text(text):
  """Return the translation of one English string into tgt_lang."""
  return translator(text)[0]['translation_text']


def translate_entry(english_dict):
  """Translate one dataset entry.

  The stem and every choice text are translated; id, choice labels and
  answerKey are copied unchanged and 'lang' is set to tgt_lang.
  """
  # Start from the English keys so the record layout and key order are kept.
  translated_dict = dict.fromkeys(english_dict)

  translated_dict['id'] = english_dict['id']
  translated_dict['lang'] = tgt_lang

  #question is a dictionary with stem: text, choices: list of dictionaries
  question = english_dict['question']
  translated_dict['question'] = {
    'stem': translate_text(question['stem']),
    'choices': [
      {'label': choice['label'], 'text': translate_text(choice['text'])}
      for choice in question['choices']
    ],
  }

  translated_dict['answerKey'] = english_dict['answerKey']

  return translated_dict


# Rebind instead of extending, so re-running this cell does not append duplicates.
translated_data = []

for english_dict in en:
  start_time = time.time()

  translated_dict = translate_entry(english_dict)
  translated_data.append(translated_dict)

  elapsed_seconds = round(time.time() - start_time)
  print('Successfully translated ' + str(translated_dict['id']) + ' from en to ' + tgt_lang + '. Time needed: ' + str(elapsed_seconds) + ' seconds')

# Serializing json (done before opening the file, so a failure leaves an existing file untouched)
json_object = json.dumps(translated_data, indent=4)

# Writing to <path>/<tgt_lang>/dev.json
with open(path + tgt_lang + '/dev.json', "w") as outfile:
    outfile.write(json_object)

Successfully translated d3845adc08414fda from en to bho. Time needed: 11 seconds
Successfully translated 35677bbcf00c4e3a from en to bho. Time needed: 9 seconds
Successfully translated 048b0565dc77a993 from en to bho. Time needed: 8 seconds
Successfully translated cbb8e2554edeed45 from en to bho. Time needed: 12 seconds
Successfully translated 055e75f54b8e913c from en to bho. Time needed: 9 seconds
Successfully translated f8876108403b66d6 from en to bho. Time needed: 9 seconds
Successfully translated 6029c72b4deda756 from en to bho. Time needed: 9 seconds
Successfully translated a44cdb09ea0f1be1 from en to bho. Time needed: 9 seconds
Successfully translated 2a7d31879031d457 from en to bho. Time needed: 8 seconds
Successfully translated 47eb76d3df2644cd from en to bho. Time needed: 9 seconds
Successfully translated c1eede6bb0f42589 from en to bho. Time needed: 7 seconds
Successfully translated b8bcb55951e48208 from en to bho. Time needed: 8 seconds
Successfully translated 53042df5a9772d

# 5 Save translated data

In [None]:
# Turn the list of translated records into an indented JSON string.
json_object = json.dumps(translated_data, indent=4)

# Store it as dev.json inside the directory of the target language.
dev_json_path = path + tgt_lang + '/dev.json'
with open(dev_json_path, "w") as outfile:
    outfile.write(json_object)