# Background

Author: Christian Resch
Date: February 2024

Google Colab notebook that uses the 600M (or a larger) NLLB model to translate benchmark datasets, so that various LLMs can be tested on Swahili, Gikuyu, Luo, Hindi and Bhojpuri.

Set up as in this [medium blog post](https://medium.com/@perezogayo/translating-text-using-meta-ais-nllb-fb189f3a946c).

In [1]:
import json
# The dataset is in JSON Lines format (one JSON object per line), so it is
# parsed line by line instead of with a single json.load call.
# The explicit encoding keeps decoding independent of the platform default,
# and blank lines (e.g. a trailing newline) are skipped because json.loads
# would raise on them.
with open('../data/raw/test.jsonl', encoding='utf-8') as f:
    en = [json.loads(line) for line in f if line.strip()]

print('Data loaded with length ' + str(len(en)))




Data loaded with length 1


In [5]:
en

[{'id': '95b2fec4a492244a',
  'lang': 'en',
  'question': {'stem': 'The people wanted to stop the parade, so what did they set up to thwart it?',
   'choices': [{'label': 'A', 'text': 'carnival'},
    {'label': 'B', 'text': 'public place'},
    {'label': 'C', 'text': 'apartment'},
    {'label': 'D', 'text': 'roadblock'},
    {'label': 'E', 'text': 'space shuttle'}]}}]

In [7]:
def save_translated_data(translated_data, tgt_lang, path='../data/processed/'):
    """Write the translated dataset to ``<path>/<tgt_lang>_dev.json``.

    Args:
        translated_data: JSON-serialisable object to store.
        tgt_lang: Language tag used as the prefix of the output file name.
        path: Output directory, with or without a trailing path separator.
            The directory must already exist.

    Raises:
        TypeError: If ``translated_data`` is not JSON-serialisable.
        OSError: If the output file cannot be opened for writing.
    """
    # Local import so the function does not depend on which cells ran before.
    import os

    # Serialise before opening the file so a serialisation error does not
    # leave an empty or truncated file behind. ensure_ascii=False keeps
    # non-Latin translations (e.g. Devanagari) readable instead of \uXXXX.
    json_object = json.dumps(translated_data, indent=4, ensure_ascii=False)

    # os.path.join works whether or not `path` ends with a separator.
    out_file = os.path.join(path, tgt_lang + '_dev.json')
    with open(out_file, 'w', encoding='utf-8') as outfile:
        outfile.write(json_object)

    print('Saved data for ' + tgt_lang + '.')



# Try out the save helper on the loaded English data; with the helper defined
# in the cell above this writes '../data/processed/luku_dev.json'.
# NOTE(review): "luku" is not one of the language codes handled elsewhere in
# this notebook - presumably a placeholder tag; confirm.
save_translated_data(en, "luku", path = '../data/processed/')

Saved data for luku.


# 0 Define functions

In [None]:
def create_directory(tgt_lang, path='/content/drive/MyDrive/Data/X-CSR_datasets/'):
    """Create the sub-directory ``tgt_lang`` inside ``path`` if it is missing.

    Prints whether the directory was created or already existed.

    Args:
        tgt_lang: Name of the directory to create (the target language code).
        path: Existing parent directory, with or without a trailing separator.

    Raises:
        FileNotFoundError: If ``path`` itself does not exist.
    """
    # Local import so the function does not depend on which cells ran before.
    import os

    # Joining (rather than concatenating) keeps the new directory inside
    # `path` even when `path` has no trailing separator.
    target = os.path.join(path, tgt_lang)

    # Try to create and react to the failure instead of listing the parent
    # first; this avoids the check-then-create race.
    try:
        os.mkdir(target)
    except FileExistsError:
        print('Directory ' + tgt_lang + ' already exists in ' + path + '.')
    else:
        print('Created directory ' + tgt_lang + ' in ' + path + '.')

In [None]:
def translate_xcsr(tgt_lang, english_data, path='/content/drive/MyDrive/Data/X-CSR_datasets/', print_update=True, translator=None):
    """Translate English X-CSR entries into ``tgt_lang``.

    Each entry's question stem and every choice text are passed through the
    translator; ``id``, choice labels and (when present) ``answerKey`` are
    copied unchanged, and ``lang`` is set to ``tgt_lang``.

    When no ``translator`` is supplied, a ``transformers`` translation
    pipeline is built from the notebook-level names ``pipeline``, ``model``
    and ``tokenizer`` (e.g. facebook/nllb-200-distilled-600M or
    facebook/nllb-200-3.3B), so those must be defined before calling.
    The notebook-level ``time`` module is used for the progress timing.

    Args:
        tgt_lang: One of 'swh', 'kik', 'luo', 'hin', 'bho'.
        english_data: Iterable of dicts with the keys 'id' and 'question'
            (a dict with 'stem' and 'choices', the latter a list of dicts
            with 'label' and 'text'). 'answerKey' is optional, because the
            unlabeled test split does not carry it.
        path: Unused; kept so existing calls keep working.
        print_update: Print one progress line per translated entry.
        translator: Optional callable taking a string and returning
            ``[{'translation_text': ...}]``. When given, no pipeline is
            built and ``tgt_lang`` is only used as the 'lang' tag.

    Returns:
        A list of translated entry dicts, or ``None`` when no translator is
        supplied and ``tgt_lang`` is not supported.
    """
    # NLLB language codes for the supported targets.
    nllb_codes = {
        'swh': 'swh_Latn',  # Swahili
        'kik': 'kik_Latn',  # Kikuyu
        'luo': 'luo_Latn',  # Luo
        'hin': 'hin_Deva',  # Hindi
        'bho': 'bho_Deva',  # Bhojpuri
    }

    if translator is None:
        print('Setting up ' + tgt_lang + ' translator...')
        if tgt_lang not in nllb_codes:
            print('Translator has not set up, please add it to the function. And improve the function...')
            return None
        translator = pipeline(
            'translation',
            model=model,
            tokenizer=tokenizer,
            src_lang='eng_Latn',
            tgt_lang=nllb_codes[tgt_lang],
            max_length=200,
        )

    def translate_text(text):
        # The translator returns a list with one result dict per input.
        return translator(text)[0]['translation_text']

    translated_data = []

    for counter, english_dict in enumerate(english_data, start=1):
        start_time = time.time()

        # Start from the source keys (all None) so the key order is kept and
        # any key that is not handled below is still present in the output.
        translated_dict = dict.fromkeys(english_dict)
        translated_dict['id'] = english_dict['id']
        translated_dict['lang'] = tgt_lang

        question = english_dict['question']
        translated_dict['question'] = {
            'stem': translate_text(question['stem']),
            'choices': [
                {'label': choice['label'], 'text': translate_text(choice['text'])}
                for choice in question['choices']
            ],
        }

        # Only labeled splits carry the answer; copying it unconditionally
        # raised KeyError on test-split entries.
        if 'answerKey' in english_dict:
            translated_dict['answerKey'] = english_dict['answerKey']

        translated_data.append(translated_dict)

        elapsed = round(time.time() - start_time)

        if print_update:
            print('Successfully translated question number ' + str(counter) + ' from en to ' + tgt_lang + '. Time needed: ' + str(elapsed) + ' seconds')

    return translated_data

In [None]:
def save_translated_data(translated_data, tgt_lang, path='/content/drive/MyDrive/Data/X-CSR_datasets/'):
    """Write the translated dataset to ``<path>/<tgt_lang>/dev.json``.

    Args:
        translated_data: JSON-serialisable object to store.
        tgt_lang: Language code; also the name of the sub-directory of
            ``path`` the file is written to. That directory must exist.
        path: Dataset root directory, with or without a trailing separator.

    Raises:
        TypeError: If ``translated_data`` is not JSON-serialisable.
        OSError: If the output file cannot be opened for writing (for
            example because the language directory does not exist).
    """
    # Local import so the function does not depend on which cells ran before.
    import os

    # Serialise before opening the file so a serialisation error does not
    # leave an empty or truncated file behind. ensure_ascii=False keeps
    # non-Latin translations (e.g. Devanagari) readable instead of \uXXXX.
    json_object = json.dumps(translated_data, indent=4, ensure_ascii=False)

    # os.path.join works whether or not `path` ends with a separator.
    out_file = os.path.join(path, tgt_lang, 'dev.json')
    with open(out_file, 'w', encoding='utf-8') as outfile:
        outfile.write(json_object)

    print('Saved data for ' + tgt_lang + '.')

In [None]:
def create_test_dataset(data, sample_size, seed=10):
    """Return ``sample_size`` distinct entries of ``data``, chosen at random.

    The module-level random generator is re-seeded with ``seed`` on every
    call, so the same arguments always give the same sample.

    NOTE: the index range now covers the whole of ``data``. The previous
    ``range(0, len(data) - 1)`` could never pick the last entry, so samples
    drawn with a given seed differ from those produced by the old version.

    Args:
        data: Indexable sequence to sample from.
        sample_size: Number of entries to draw; must not exceed ``len(data)``.
        seed: Seed for the random generator.

    Returns:
        A list with the sampled entries, in the order they were drawn.

    Raises:
        ValueError: If ``sample_size`` is negative or larger than ``len(data)``.
    """
    # Ensure repeatability
    random.seed(seed)

    # Sample positions without replacement, then pick the entries.
    indices = random.sample(range(len(data)), sample_size)
    data_short = [data[index] for index in indices]

    print('Generated sample from data with length ' + str(len(data_short)) + '.00')
    return data_short

# 1 Set up NLLB

In [None]:
!pip install transformers



In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Exchange the model HERE: pick the NLLB checkpoint to load.
model_name = 'facebook/nllb-200-distilled-600M'
#model_name = 'facebook/nllb-200-3.3B' # from https://huggingface.co/facebook/nllb-200-3.3B

# The checkpoint id gets its own name so that `model` only ever refers to the
# loaded model object (it used to be rebound from the id string to the model).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModelForSeq2SeqLM needs PyTorch to be installed in the environment.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
tokenizer_config.json: 100%|██████████| 564/564 [00:00<00:00, 384kB/s]
sentencepiece.bpe.model: 100%|██████████| 4.85M/4.85M [00:00<00:00, 14.0MB/s]
tokenizer.json: 100%|██████████| 17.3M/17.3M [00:00<00:00, 18.3MB/s]
special_tokens_map.json: 100%|██████████| 3.55k/3.55k [00:00<00:00, 1.53MB/s]


ImportError: 
AutoModelForSeq2SeqLM requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


# 2 Load data

This would need to be adapted for use on the DFKI Cluster.

In [None]:
import os
from google.colab import drive
import json

In [None]:
# Mount Google Drive at /content/drive so the dataset folders are reachable,
# then show the current working directory.
drive.mount('/content/drive') # see here for details https://colab.research.google.com/notebooks/io.ipynb#scrollTo=RWSJpsyKqHjH
os.getcwd()

Mounted at /content/drive


'/content'

In [None]:
# Switch into the folder holding the English X-CSR files. The path is
# relative, so this only works while the working directory is still the
# one shown by os.getcwd() above ('/content').
os.chdir('drive/MyDrive/Data/X-CSR_datasets/en')

In [None]:
ls

dev.jsonl  test.jsonl  train.jsonl


In [None]:
# dev.jsonl is in JSON Lines format (one JSON object per line), so it is
# parsed line by line. The explicit encoding keeps decoding independent of
# the platform default, and blank lines are skipped because json.loads would
# raise on them.
with open('dev.jsonl', encoding='utf-8') as f:
    en = [json.loads(line) for line in f if line.strip()]

print("First entry in English dev dataset")
print(en[:2]) #list of dictionaries
print(en[0].keys())

print('en has length: ' + str(len(en)))

First entry in English dev dataset
[{'id': 'd3845adc08414fda', 'lang': 'en', 'question': {'stem': 'The dental office handled a lot of patients who experienced traumatic mouth injury, where were these patients coming from?', 'choices': [{'label': 'A', 'text': 'town'}, {'label': 'B', 'text': 'michigan'}, {'label': 'C', 'text': 'hospital'}, {'label': 'D', 'text': 'schools'}, {'label': 'E', 'text': 'office building'}]}, 'answerKey': 'C'}, {'id': '35677bbcf00c4e3a', 'lang': 'en', 'question': {'stem': 'Where can you find bald eagles and cheese in the midwest?', 'choices': [{'label': 'A', 'text': 'colorado'}, {'label': 'B', 'text': 'currency'}, {'label': 'C', 'text': 'iowa'}, {'label': 'D', 'text': 'arctic'}, {'label': 'E', 'text': 'wisconsin'}]}, 'answerKey': 'E'}]
dict_keys(['id', 'lang', 'question', 'answerKey'])
en has length: 1000


In [None]:
# Draw a small, seeded sample of 10 entries from the English data so the
# translation pipeline can be tried out without processing the full dataset.

import random

en_short = create_test_dataset(en, 10, seed = 10)

Generated sample from data with length 10.00


# 3 Translate data

In [None]:
#Import further packages
import time

In [None]:
# Define the target languages to translate into. Each code must be one that
# translate_xcsr handles ('swh', 'kik', 'luo', 'hin', 'bho').

tgt_langs = ['bho', 'luo']
# Alternative selections used for other runs:
#tgt_langs = ['kik', 'bho']
#tgt_langs = ['kik', 'bho', 'luo']

In [None]:
# Create one output directory per target language (existing ones are kept)

for tgt_lang in tgt_langs:
  create_directory(tgt_lang)

Directory bho already exists in /content/drive/MyDrive/Data/X-CSR_datasets/.
Directory luo already exists in /content/drive/MyDrive/Data/X-CSR_datasets/.


In [None]:
# Trial run: translate only the short sample into every target language and
# save it, to ensure the pipeline works before starting the full run.
# Note that this writes to the same <tgt_lang>/dev.json file as the full run.

for tgt_lang in tgt_langs:
  translated_data = translate_xcsr(tgt_lang, en_short, path = '/content/drive/MyDrive/Data/X-CSR_datasets/', print_update = True)

  save_translated_data(translated_data, tgt_lang, path = '/content/drive/MyDrive/Data/X-CSR_datasets/')

Setting up kik translator...
Successfully translated question number 1 from en to kik. Time needed: 9 seconds
Successfully translated question number 2 from en to kik. Time needed: 11 seconds
Successfully translated question number 3 from en to kik. Time needed: 11 seconds
Successfully translated question number 4 from en to kik. Time needed: 10 seconds
Successfully translated question number 5 from en to kik. Time needed: 8 seconds
Successfully translated question number 6 from en to kik. Time needed: 13 seconds
Successfully translated question number 7 from en to kik. Time needed: 8 seconds
Successfully translated question number 8 from en to kik. Time needed: 13 seconds
Successfully translated question number 9 from en to kik. Time needed: 14 seconds
Successfully translated question number 10 from en to kik. Time needed: 10 seconds
Setting up bho translator...
Successfully translated question number 1 from en to bho. Time needed: 8 seconds
Successfully translated question number 2 f

In [None]:
# Full run: translate the complete English dev set into every target language
# and save each result to <tgt_lang>/dev.json (overwriting the trial output).

for tgt_lang in tgt_langs:
  translated_data = translate_xcsr(tgt_lang, en, path = '/content/drive/MyDrive/Data/X-CSR_datasets/', print_update = True)

  save_translated_data(translated_data, tgt_lang, path = '/content/drive/MyDrive/Data/X-CSR_datasets/')

Setting up bho translator...
Successfully translated question number 1 from en to bho. Time needed: 13 seconds
Successfully translated question number 2 from en to bho. Time needed: 10 seconds
Successfully translated question number 3 from en to bho. Time needed: 9 seconds
Successfully translated question number 4 from en to bho. Time needed: 14 seconds
Successfully translated question number 5 from en to bho. Time needed: 10 seconds
Successfully translated question number 6 from en to bho. Time needed: 10 seconds
Successfully translated question number 7 from en to bho. Time needed: 11 seconds
Successfully translated question number 8 from en to bho. Time needed: 9 seconds
Successfully translated question number 9 from en to bho. Time needed: 9 seconds
Successfully translated question number 10 from en to bho. Time needed: 11 seconds
Successfully translated question number 11 from en to bho. Time needed: 8 seconds
Successfully translated question number 12 from en to bho. Time needed: