In [None]:
# !pip install google.cloud.translate

In [1]:
import os, csv
# public
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import pandas as pd
from tqdm import tqdm
from google.cloud import translate_v2 as translate

In [2]:
# a method that calls the cloud translator
def translate_text(source: str, target: str, text: str) -> dict:
    """
    Translates text into the target language.
    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """

    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(
        text
        , source_language=source
        , target_language=target
    )
    return result

def process_nan(s):
  if type(s)!=type(''):
    return ''
  return s

In [47]:
src_lan = 'en'
# target language
tgt_lan = 'es'
language = "Spanish"


RESOURCE = '../..'
DATA =  RESOURCE+'/data/'+language
GOLD_FILE = os.path.join(DATA, 'gold-sentences-'+language+'.tsv')
TRANS_FILE = os.path.join(DATA, 'trans-sentences-'+language+'.tsv')

# DATA = os.path.join(RESOURCE, 'data')
DATA =  RESOURCE+'/data/English'

KEY = os.path.join(RESOURCE, 'key')
RAW_SENSE_TSV = os.path.join(DATA, 'gold-tokens-English-wSenses.tsv')
RAW_TOKENS_TSV =  os.path.join(DATA, 'gold-tokens-English-wSenses.tsv')
RAW_SENTENCE_TSV = os.path.join(DATA, 'gold-sentences-English.tsv')
JSON_GOOGLE_APPLICATION_CREDENTIALS = os.path.join(KEY, 'tonal-works-420505-eda807c7cc52.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_GOOGLE_APPLICATION_CREDENTIALS

# DATA

In [48]:
sentence_df = pd.read_csv(RAW_SENTENCE_TSV, delimiter='\t', quoting=csv.QUOTE_NONE)
sentence_df.head()
sents = sentence_df["Sentence"].tolist()
# sents

In [49]:
sids = sentence_df["Sentence ID"].tolist()
# sids = [i.replace('d', 'i') for i in sids]
sids[:4]

['d001.s001', 'd001.s002', 'd001.s003', 'd001.s004']

In [50]:
# len(sids), len(sents)

# TRANSLATE

In [51]:
# get languages
client = translate.Client()
languages = client.get_languages()
languages

[{'language': 'af', 'name': 'Afrikaans'},
 {'language': 'sq', 'name': 'Albanian'},
 {'language': 'am', 'name': 'Amharic'},
 {'language': 'ar', 'name': 'Arabic'},
 {'language': 'hy', 'name': 'Armenian'},
 {'language': 'as', 'name': 'Assamese'},
 {'language': 'ay', 'name': 'Aymara'},
 {'language': 'az', 'name': 'Azerbaijani'},
 {'language': 'bm', 'name': 'Bambara'},
 {'language': 'eu', 'name': 'Basque'},
 {'language': 'be', 'name': 'Belarusian'},
 {'language': 'bn', 'name': 'Bengali'},
 {'language': 'bho', 'name': 'Bhojpuri'},
 {'language': 'bs', 'name': 'Bosnian'},
 {'language': 'bg', 'name': 'Bulgarian'},
 {'language': 'ca', 'name': 'Catalan'},
 {'language': 'ceb', 'name': 'Cebuano'},
 {'language': 'ny', 'name': 'Chichewa'},
 {'language': 'zh', 'name': 'Chinese (Simplified)'},
 {'language': 'zh-TW', 'name': 'Chinese (Traditional)'},
 {'language': 'co', 'name': 'Corsican'},
 {'language': 'hr', 'name': 'Croatian'},
 {'language': 'cs', 'name': 'Czech'},
 {'language': 'da', 'name': 'Danish

In [52]:
src_lan, tgt_lan

('en', 'es')

In [53]:
# sample test
sentence = sents[0]
test = translate_text(src_lan, tgt_lan, sentence)
test

{'translatedText': 'Este documento es un resumen del Informe Público Europeo de Evaluación (EPAR).',
 'input': 'This document is a summary of the European Public Assessment Report ( EPAR ) .'}

In [54]:
# translating all the sentences from English to target language
tgt_sents = []
for s in tqdm(sents):
    tgt_s = translate_text(src_lan, tgt_lan, s)
    tgt_sents.append(tgt_s)

100%|██████████| 131/131 [02:46<00:00,  1.27s/it]


In [65]:
print(tgt_sents)
tgt_sents = [s['translatedText'] for s in tgt_sents]

[{'translatedText': 'Este documento es un resumen del Informe Público Europeo de Evaluación (EPAR).', 'input': 'This document is a summary of the European Public Assessment Report ( EPAR ) .'}, {'translatedText': 'Explica cómo el Comité de Medicamentos de Uso Humano (CHMP) evaluó los estudios realizados para llegar a sus recomendaciones sobre cómo utilizar el medicamento.', 'input': 'It explains how the Committee for Medicinal Products for Human Use ( CHMP ) assessed the studies performed , to reach their recommendations on how to use the medicine .'}, {'translatedText': 'Si necesita más información sobre su condición médica o su tratamiento, lea el prospecto (también parte del EPAR) o comuníquese con su médico o farmacéutico.', 'input': 'If you need more information about your medical condition or your treatment , read the Package Leaflet ( also part of the EPAR ) or contact your doctor or pharmacist .'}, {'translatedText': 'Si desea obtener más información sobre la base de las recome

In [66]:
# output
trans_df = pd.DataFrame({
    'ID': sids
    , 'English': sents
    , language+ ' Translation': tgt_sents
    })
trans_df.head()

Unnamed: 0,ID,English,Spanish Translation
0,d001.s001,This document is a summary of the European Pub...,Este documento es un resumen del Informe Públi...
1,d001.s002,It explains how the Committee for Medicinal Pr...,Explica cómo el Comité de Medicamentos de Uso ...
2,d001.s003,If you need more information about your medica...,Si necesita más información sobre su condición...
3,d001.s004,If you want more information on the basis of t...,Si desea obtener más información sobre la base...
4,d001.s005,What is Alimta ?,¿Qué es Alimta?


In [67]:
# Save to TSV 
trans_df.to_csv(TRANS_FILE, sep='\t', index=False)

In [68]:
# load TSV 
trans_df = pd.read_csv(TRANS_FILE, delimiter='\t', quoting=csv.QUOTE_NONE)
trans_df.head()

Unnamed: 0,ID,English,Spanish Translation
0,d001.s001,This document is a summary of the European Pub...,Este documento es un resumen del Informe Públi...
1,d001.s002,It explains how the Committee for Medicinal Pr...,Explica cómo el Comité de Medicamentos de Uso ...
2,d001.s003,If you need more information about your medica...,Si necesita más información sobre su condición...
3,d001.s004,If you want more information on the basis of t...,Si desea obtener más información sobre la base...
4,d001.s005,What is Alimta ?,¿Qué es Alimta?


In [69]:
trans_sents = trans_df[language+ " Translation"].tolist()

# EVALUATE QUALITY

In [70]:
gold_df = pd.read_csv(GOLD_FILE, delimiter='\t', quoting=csv.QUOTE_NONE)
gold_df.head()


Unnamed: 0,Sentence ID,Sentence
0,d001.s001,En el presente documento se resume el Informe ...
1,d001.s002,En él se explica cómo el Comité de Medicamento...
2,d001.s003,Si desea más información sobre su enfermedad o...
3,d001.s004,Si desea más información sobre el fundamento e...
4,d001.s005,¿Qué es Alimta ?


In [71]:
gold_sents = gold_df["Sentence"].tolist()
len(gold_sents),len(trans_sents)

(131, 131)

In [72]:
def calculate_bleu(reference_sentences, candidate_sentences):
    """
    Calculate the BLEU score between two lists of sentences.

    :param reference_sentences: List of reference sentences
    :param candidate_sentences: List of candidate sentences
    :return: BLEU score
    """
    references = [[ref.split()] for ref in reference_sentences]
    candidates = [cand.split() for cand in candidate_sentences]
    
    # Calculate sentence-level BLEU scores
    sentence_bleu_scores = [sentence_bleu(ref, cand,  weights=(0.25,0.25,0.25,0.25)) for ref, cand in zip(references, candidates)]
    
    # Calculate corpus-level BLEU score
    corpus_bleu_score = corpus_bleu(references, candidates, weights=(0.25,0.25,0.25,0.25))
    
    return sentence_bleu_scores, corpus_bleu_score




In [73]:
sentence_scores, corpus_score = calculate_bleu(trans_sents, gold_sents)
print(f"Sentence-level BLEU scores: {sentence_scores}")
print(f"Corpus-level BLEU score: {corpus_score}")


Sentence-level BLEU scores: [0.23961829057131984, 0.27039777221233924, 0.13740950768136106, 3.0052113220051886e-78, 9.53091075863908e-155, 0.2852636439147137, 0.5081327481546147, 0.5081327481546147, 0.2649268590278449, 0.3806658939363621, 0.5370756670734657, 0.8633400213704505, 8.38826642100846e-155, 0.7828785637123031, 0.5839895781169013, 0.36161426197529906, 0.2135323752004724, 6.276154237151866e-155, 0.4257110866884422, 0.2082198320914845, 1.2882297539194154e-231, 0.27958160935889725, 0.6007307912522272, 1.0244914152188952e-231, 0.38899050791510964, 0.6098820960308446, 0.43471993230956535, 0.45592167592757143, 2.9154018748021785e-78, 0.37991784282579627, 0.6930977286178778, 0.2846824368437765, 0.46836246523694947, 0.34616316658919044, 0.4198850012185551, 0.2644073920994416, 0.2829559628326351, 0.6816650778781156, 0.28373869773074895, 4.539377974371219e-78, 0.7259795291154771, 6.961252661205922e-155, 0.4240125351805037, 9.53091075863908e-155, 0.14654357858721576, 0.41180376356915777,

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [74]:
from torchtext.data.metrics import bleu_score
references = [[ref.split()] for ref in gold_sents]
candidates = [cand.split() for cand in trans_sents]
print(bleu_score( candidates, references))


0.3072473201468685


In [192]:
import evaluate
bleu = evaluate.load('bleu')

ImportError: cannot import name 'LastCommitInfo' from 'huggingface_hub.hf_api' (/usr/local/lib/python3.11/site-packages/huggingface_hub/hf_api.py)

In [187]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.4.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (12 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0->evaluate)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting requests>=2.19.0 (from evaluate)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.62.1 (from evaluate)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.7.0 (from evaluate)
  Downloading huggingface_hub-

In [133]:
# Spanish BLEU 0.22304585226870685
# Italian BLEU 0.1595706884838815