## Environment Preparation

In [1]:
%pip install python-dotenv > /dev/null 2>&1
%pip install datasets > /dev/null 2>&1
%pip install nltk > /dev/null 2>&1
%pip install openai > /dev/null 2>&1
%pip install tenacity > /dev/null 2>&1

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv

# Load the .env file so that its variables become available through os.environ.
# The Azure OpenAI endpoint and key used later in this notebook are read from there.
load_dotenv()

True

## Define Abstract Classes

This section defines the abstract base classes for translators and evaluators.

In [3]:
from abc import ABC, abstractmethod
from typing import Literal

# Abstract base class implemented by every translator in this notebook.
class Translator(ABC):
    """Interface for translating a piece of text between two languages."""

    @abstractmethod
    def translate(
        self, text: str, source_lang: str, target_lang: str, verbose: bool = False
    ) -> str:
        """Translate ``text`` from ``source_lang`` into ``target_lang``.

        Args:
            text: The text to translate.
            source_lang: Name of the language ``text`` is written in.
            target_lang: Name of the language to translate into.
            verbose: When True, implementations may print diagnostic output.
                Defaults to False so callers can omit it, matching the
                concrete translator defined in this notebook.

        Returns:
            The translated text.
        """


# Define the abstract evaluator class that will be implemented by the different evaluators
# The evaluator class will have a score method that will take a list of references and a list of candidates and return the score
class Evaluator(ABC):
    @abstractmethod
    def score(self, references: list[str], candidate: str) -> list[int] | Literal[0]:
        pass

## Translator

Define the translator implementations.

### Azure OpenAI

In [None]:
from openai import AzureOpenAI
from tenacity import retry, stop_after_attempt, wait_exponential


class InvalidSystemPromptError(Exception):
    """Raised when a system prompt lacks the <source_lang> or <target_lang> placeholder."""


class AzureOpenAITranslator(Translator):
    """Translator backed by an Azure OpenAI chat-completions deployment.

    The system prompt is a template containing the ``<source_lang>`` and
    ``<target_lang>`` placeholders; they are substituted on every call and the
    text to translate is sent as the user message.
    """

    def __init__(
        self,
        endpoint: str,
        subscription_key: str,
        deployment: str,
        api_version: str = "2024-02-01",
        system_prompt: str = "translate from <source_lang> to <target_lang>",
        **kwargs,
    ):
        """Create the Azure OpenAI client and store the prompt template.

        Args:
            endpoint: Azure OpenAI endpoint, passed as ``azure_endpoint``.
            subscription_key: API key, passed as ``api_key``.
            deployment: Deployment name, used as ``model`` on each request.
            api_version: API version passed to the client.
            system_prompt: Prompt template; must contain both placeholders.
            **kwargs: Extra keyword arguments forwarded to ``AzureOpenAI``.

        Raises:
            InvalidSystemPromptError: If either placeholder is missing.
        """
        if "<source_lang>" not in system_prompt or "<target_lang>" not in system_prompt:
            raise InvalidSystemPromptError(
                "system_prompt must contain <source_lang> and <target_lang> placeholders."
            )

        self._translator = AzureOpenAI(
            azure_endpoint=endpoint,
            api_key=subscription_key,
            api_version=api_version,
            **kwargs,
        )
        self._deployment = deployment
        self._system_prompt = system_prompt

    # The retry policy lives on this helper rather than on translate(): the
    # helper lets exceptions propagate, which is what triggers a retry. Wrapping
    # a method that catches everything itself would never retry.
    # reraise=True surfaces the last underlying error instead of a RetryError.
    @retry(
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=10, min=10, max=60),
        reraise=True,
    )
    def _request_completion(self, text: str, source_lang: str, target_lang: str):
        """Send one chat-completions request; retried with exponential backoff."""
        system_message = self._system_prompt.replace(
            "<source_lang>", source_lang
        ).replace("<target_lang>", target_lang)

        return self._translator.chat.completions.create(
            model=self._deployment,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": text},
            ],
        )

    def translate(
        self, text: str, source_lang: str, target_lang: str, verbose: bool = False
    ) -> str:
        """Translate ``text`` and return the model's reply.

        Args:
            text: The text to translate (sent as the user message).
            source_lang: Value substituted for ``<source_lang>`` in the prompt.
            target_lang: Value substituted for ``<target_lang>`` in the prompt.
            verbose: When True, print the error and the input if the request
                ultimately fails.

        Returns:
            The content of the first choice, or an empty string when the
            content is ``None`` or when the request still fails after all
            retry attempts.
        """
        try:
            response = self._request_completion(text, source_lang, target_lang)
            content = response.choices[0].message.content
        except Exception as e:
            # Best-effort: one failing row must not abort a whole evaluation run.
            if verbose:
                print(f"Error: {e}. Input: {text}")
            return ""

        return content if content is not None else ""

## Benchmark Evaluator

### BLEU

In [5]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from typing import Literal
import re
import string


class BLEUSentenceEvaluator(Evaluator):
    """Sentence-level BLEU evaluator built on ``nltk``'s ``sentence_bleu``.

    NOTE: references and candidate are handed to ``sentence_bleu`` as plain
    strings, so the n-grams are built over characters rather than words.
    """

    def __init__(
        self,
        smoothing_function=SmoothingFunction().method1,
        weights: tuple[float, float, float, float] = (0.25, 0.25, 0.25, 0.25),
    ):
        """Store the BLEU configuration.

        Args:
            smoothing_function: Smoothing function passed to ``sentence_bleu``.
                Smoothing is used to avoid the warning emitted when the
                candidate is empty and the BLEU score is 0, by smoothing the
                0 score to a very small positive value.
            weights: N-gram weights passed to ``sentence_bleu``.
        """
        self._smoothing_function = smoothing_function
        self._weights = weights

    def preprocess(self, text: str) -> str:
        """Normalize ``text`` before scoring.

        Steps, in order:

        1. Convert the text to lowercase.
        2. Remove ASCII punctuation (``string.punctuation``); other
           punctuation, such as full-width CJK marks, is left untouched.
        3. Collapse every run of whitespace (spaces, tabs, newlines,
           carriage returns) into a single space.
        4. Remove leading and trailing spaces.

        Whitespace is collapsed after punctuation removal so that text such
        as ``"a , b"`` does not end up with a double space.
        """
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = re.sub(r"\s+", " ", text)

        return text.strip()

    def score(self, references: list[str], candidate: str) -> list[int] | Literal[0]:
        """Return the BLEU score of ``candidate`` against ``references``.

        Both the references and the candidate go through ``preprocess`` first.

        Args:
            references: Reference texts for the candidate.
            candidate: The text to score.

        Returns:
            The value returned by ``sentence_bleu`` for the preprocessed input.
        """
        cleaned_references = [self.preprocess(ref) for ref in references]
        cleaned_candidate = self.preprocess(candidate)

        return sentence_bleu(
            cleaned_references,
            cleaned_candidate,
            smoothing_function=self._smoothing_function,
            weights=self._weights,
        )

## Test

### Load Dataset

Load dataset from local file.

In [9]:
from datasets import load_dataset
import pandas as pd

# Load the evaluation data from a local CSV file. The printed DatasetDict shows
# that the rows end up in the "train" split, which the evaluation loop reads.
data_file = "./test_example.csv" # Path to the data file, replace with the correct path
ds = load_dataset('csv', data_files=data_file)
print(ds)

# Define the function that will evaluate the translations
def evalutate_translations(
    translator: Translator,
    evaluator: Evaluator,
    source_lang: str,
    target_lang: str,
    verbose: bool = False,
    dataset=None,
    source_column: str = "target",
    reference_column: str = "reference",
):
    """Translate every row of a dataset and score it against its reference.

    Args:
        translator: Translator used to translate each source text.
        evaluator: Evaluator used to score each translation.
        source_lang: Source language passed to the translator.
        target_lang: Target language passed to the translator.
        verbose: When True, print the source, references, translation and
            score of every row (also forwarded to the translator).
        dataset: Iterable of row mappings to evaluate. Defaults to the
            notebook-level ``ds["train"]`` split.
        source_column: Name of the column holding the text to translate.
        reference_column: Name of the column holding the reference text.

    Returns:
        A DataFrame with the columns ``source_text``, ``reference``,
        ``translated_text`` and ``score``, one row per dataset row.
    """
    rows = ds["train"] if dataset is None else dataset

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame row by row re-allocates on every insertion.
    records = []

    for row in rows:
        source_text = row[source_column].strip()
        references = [row[reference_column].strip()]

        translated_text = translator.translate(
            source_text, source_lang, target_lang, verbose=verbose
        )
        score = evaluator.score(references, translated_text)

        if verbose:
            print(f"Source Text: {source_text}")
            print(f"References: {references}")
            print(f"Translated Text: {translated_text}")
            print(f"Score: {score}")

        records.append([source_text, references[0], translated_text, score])

    return pd.DataFrame(
        records, columns=["source_text", "reference", "translated_text", "score"]
    )

DatasetDict({
    train: Dataset({
        features: ['target', 'reference'],
        num_rows: 1
    })
})


### Execute Testing

In [None]:
import os

# Fail fast when the Azure credentials are missing. The translator returns an
# empty string on any request error and verbose is off below, so empty
# credentials would otherwise silently produce a result file full of empty
# translations.
required_env = ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY")
missing_env = [name for name in required_env if not os.environ.get(name)]
if missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env)
        + ". Set them in the environment or in the .env file."
    )

azure_translator = AzureOpenAITranslator(
    endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    subscription_key=os.environ["AZURE_OPENAI_KEY"],
    deployment="gpt-4o", # change your deployment here, and make sure it is available in your Azure OpenAI account
    system_prompt="You are a translation expert. Please translate the user-provided <source_lang> content into fluent <target_lang>.  Before translating, please fully understand the user’s content and ensure the translation is accurate and error-free.",
)

evaluator = BLEUSentenceEvaluator()

result = evalutate_translations(azure_translator, evaluator, source_lang="english", target_lang="traditional chinese", verbose=False)

# save the result to a csv file
result.to_csv("test_result.csv", index=False, encoding="utf-8")

result

Unnamed: 0,source_text,reference,translated_text,score
0,This is a test example.,這是一個測試例子。,這是一個測試範例。,0.631197
