In [None]:
# !pip install google.cloud.translate

In [2]:
import os, csv
# public
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import pandas as pd
from tqdm import tqdm
from google.cloud import translate_v2 as translate

In [28]:
# a method that calls the cloud translator
def translate_text(source: str, target: str, text: str) -> dict:
    """
    Translates text into the target language.
    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """

    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(
        text
        , source_language=source
        , target_language=target
    )
    return result

def process_nan(s):
  if type(s)!=type(''):
    return ''
  return s

In [177]:
src_lan = 'en'
# target language
tgt_lan = 'it'
language = "Italian"

RESOURCE = 'res'
RESULTS = os.path.join(RESOURCE, 'results')
TRANS_FILE = os.path.join(RESULTS, 'sentences-'+language+'-Translations.tsv')


DATA = os.path.join(RESOURCE, 'data')
KEY = os.path.join(RESOURCE, 'key')
RAW_SENTENCE_TSV = os.path.join(DATA, 'EnglishSentences.tsv')
GOLD_FILE = os.path.join(DATA, "gold-sentences-"+language+".tsv")
JSON_GOOGLE_APPLICATION_CREDENTIALS = os.path.join(KEY, 'tonal-works-420505-eda807c7cc52.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_GOOGLE_APPLICATION_CREDENTIALS

# DATA

In [30]:
sentence_df = pd.read_csv(RAW_SENTENCE_TSV, delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
sentence_df.head()
# sents = sentence_df[1][1:].tolist()
# sents

Unnamed: 0,0,1
0,ID,English
1,d001.s001,This document is a summary of the European Pub...
2,d001.s002,It explains how the Committee for Medicinal Pr...
3,d001.s003,If you need more information about your medica...
4,d001.s004,If you want more information on the basis of t...


In [31]:
sids = sentence_df[0][1:].tolist()
sids = [i.replace('d', 'i') for i in sids]
sids[:4]

['i001.s001', 'i001.s002', 'i001.s003', 'i001.s004']

In [None]:
# len(sids), len(sents)

# TRANSLATE

In [None]:
# get languages
client = translate.Client()
languages = client.get_languages()
languages

In [None]:
src_lan, tgt_lan

In [None]:
# sample test
sentence = sents[0]
test = translate_text(src_lan, tgt_lan, sentence)
test

In [None]:
# translating all the sentences from English to target language
tgt_sents = []
for s in tqdm(sents):
    tgt_s = translate_text(src_lan, tgt_lan, s)
    tgt_sents.append(tgt_s)

In [None]:
tgt_sents = [s['translatedText'] for s in tgt_sents]

In [None]:
# output
trans_df = pd.DataFrame({
    'ID': sids
    , 'English': sents
    , language+ ' Translation': tgt_sents
    })
trans_df.head()

In [None]:
# Save to TSV 
trans_df.to_csv(TRANS_FILE, sep='\t', index=False)

In [178]:
# load TSV 
trans_df = pd.read_csv(TRANS_FILE, delimiter='\t', quoting=csv.QUOTE_NONE)
trans_df.head()

Unnamed: 0,ID,English,Italian Translation
0,i001.s001,This document is a summary of the European Pub...,Il presente documento è una sintesi della rela...
1,i001.s002,It explains how the Committee for Medicinal Pr...,Spiega come il comitato per i medicinali per u...
2,i001.s003,If you need more information about your medica...,Se hai bisogno di maggiori informazioni sulla ...
3,i001.s004,If you want more information on the basis of t...,Se desideri maggiori informazioni sulla base d...
4,i001.s005,What is Alimta?,Cos&#39;è Alimta?


In [179]:
trans_sents = trans_df[language+ " Translation"].tolist()

# EVALUATE QUALITY

In [180]:
gold_df = pd.read_csv(GOLD_FILE, delimiter='\t', quoting=csv.QUOTE_NONE)
gold_df.head()


Unnamed: 0,ID,Sentence
0,i001.s001,Questo documento è la sintesi di una relazione...
1,i001.s002,L EPAR descrive il modo in cui il comitato per...
2,i001.s003,Per maggiori informazioni riguardanti le propr...
3,i001.s004,Per maggiori informazioni riguardo alle motiva...
4,i001.s005,Che cos è Alimta ?


In [181]:
gold_sents = gold_df["Sentence"].tolist()
len(gold_sents),len(trans_sents)

(139, 139)

In [182]:
def calculate_bleu(reference_sentences, candidate_sentences):
    """
    Calculate the BLEU score between two lists of sentences.

    :param reference_sentences: List of reference sentences
    :param candidate_sentences: List of candidate sentences
    :return: BLEU score
    """
    references = [[ref.split()] for ref in reference_sentences]
    candidates = [cand.split() for cand in candidate_sentences]
    
    # Calculate sentence-level BLEU scores
    sentence_bleu_scores = [sentence_bleu(ref, cand,  weights=(0.25,0.25,0.25,0.25)) for ref, cand in zip(references, candidates)]
    
    # Calculate corpus-level BLEU score
    corpus_bleu_score = corpus_bleu(references, candidates, weights=(0.25,0.25,0.25,0.25))
    
    return sentence_bleu_scores, corpus_bleu_score




In [183]:
sentence_scores, corpus_score = calculate_bleu(trans_sents, gold_sents)
print(f"Sentence-level BLEU scores: {sentence_scores}")
print(f"Corpus-level BLEU score: {corpus_score}")


Sentence-level BLEU scores: [7.806161490833773e-155, 0.27042049185058636, 2.517850593541856e-78, 3.597627147271579e-78, 0, 0.18951629567590741, 0.5081327481546147, 1.1200407237786664e-231, 2.850285529650453e-78, 0.2184256744855776, 0.42867218109674143, 0.7598356856515925, 6.86809206056511e-78, 0.4702773871923361, 0.37732439539854146, 7.601159375410181e-232, 0.11103047724581815, 4.036919020698081e-155, 2.6616427895302863e-155, 0, 0, 0, 1.1045881851389632e-231, 0, 7.659859945595747e-232, 6.130577389498242e-232, 0.09886263593840232, 1.8762356783661748e-78, 4.603336663472925e-232, 0, 1.1008876702055895e-231, 3.4980387115113817e-155, 1.0108847629878795e-231, 1.1739546629120287e-231, 3.88673520364074e-78, 5.389821251330404e-155, 0.16826922745352202, 1.0, 0.6520028108523781, 0.20522120509305022, 5.432864809012988e-78, 0.5169731539571706, 5.721523971118206e-155, 0.36409302398068727, 8.636168555094496e-78, 0.5302459604351236, 4.556426120374367e-78, 0.17115279944073908, 0.3388714363186176, 0.134

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [185]:
from torchtext.data.metrics import bleu_score
references = [[ref.split()] for ref in gold_sents]
candidates = [cand.split() for cand in trans_sents]
print(bleu_score( candidates, references))


0.15975379907255308


In [192]:
import evaluate
bleu = evaluate.load('bleu')

ImportError: cannot import name 'LastCommitInfo' from 'huggingface_hub.hf_api' (/usr/local/lib/python3.11/site-packages/huggingface_hub/hf_api.py)

In [187]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.4.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (12 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0->evaluate)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting requests>=2.19.0 (from evaluate)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.62.1 (from evaluate)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.7.0 (from evaluate)
  Downloading huggingface_hub-

In [133]:
# Spanish BLEU 0.22304585226870685
# Italian BLEU 0.1595706884838815