In [28]:
def _extract_clean_predicted_text(predicted_text):
    """Extract the model's prediction from decoded texts that contain special tokens.

    Each decoded text is expected to look like
    ``[CLS] <source> [SEP] <prediction> [SEP] <trailing tokens>``:

    1. Each text starts with a CLS token.
    2. There is a SEP token after the source sequence.
    3. Everything after this SEP is the model's prediction.
    4. Only the text up to the first SEP inside the prediction part is kept.

    A doubled SEP directly after the source sequence is tolerated, and any
    CLS / PAD tokens left inside the kept part are removed.

    Args:
        predicted_text (list[str]): List of predicted texts with special tokens.

    Returns:
        list[str]: One clean prediction per input text, in the same order.
            A text that contains no SEP token has no identifiable prediction
            part and yields an empty string instead of raising ``IndexError``.

    """
    sep_token = "[SEP]"
    clean_texts = []

    for text in predicted_text:
        # Everything after the first SEP (end of the source sequence) is the prediction.
        _source, found_sep, prediction_part = text.partition(sep_token)
        if not found_sep:
            # No separator at all: there is no prediction part to extract.
            clean_texts.append("")
            continue

        prediction_part = prediction_part.strip()
        # Tolerate a doubled SEP right after the source sequence.
        if prediction_part.startswith(sep_token):
            prediction_part = prediction_part[len(sep_token):].strip()

        # Keep only the text up to the next SEP; anything after it is discarded.
        prediction_part = prediction_part.split(sep_token, 1)[0]

        # Remove any remaining special tokens (like CLS) and strip whitespace.
        clean_prediction = prediction_part.replace("[CLS]", "").replace("[PAD]", "").replace("[SEP]", "").strip()
        clean_texts.append(clean_prediction)

    return clean_texts

In [34]:
# Simple case: a single SEP after the source, then the prediction, then trailing text.
simple_example = ["[CLS] This is a test sentence. [SEP] This is the model's prediction. [SEP] bla bla bla"]
# Doubled SEP directly after the source sequence.
double_sep_example = ["[CLS] This is a test sentence. [SEP][SEP] This is the model's prediction. [SEP] bla bla bla"]
assert _extract_clean_predicted_text(simple_example) == ["This is the model's prediction."]
assert _extract_clean_predicted_text(double_sep_example) == ["This is the model's prediction."]

# Noisy decoded output: the prediction is followed by many stray special tokens.
predicted_text = [
    "[CLS] academic and educational advice : what can i do after completing bcom? [SEP] [CLS] what should i do after bcom? [SEP] [PAD] [PAD] is thrown in a vertically upward direction with a velocity of 50 km / s. what ' s the maximum height? [SEP] [PAD] [PAD] [PAD] [PAD] a [CLS] vertically [CLS] [SEP] [PAD]? [PAD] is [PAD]? [PAD] [PAD] [CLS]? i [PAD] [PAD] [PAD] upward [PAD] a? [CLS] is? what [CLS] [PAD] [PAD] [PAD] of [SEP] what? i what [PAD] [CLS] [PAD] what what a to after [CLS] [PAD] [CLS] [SEP] a [PAD] is [PAD] [CLS] what [SEP] [PAD] upward is [PAD] [PAD] [CLS] [PAD] [CLS] what [PAD] what is? [PAD] what a"
]
clean = _extract_clean_predicted_text(predicted_text)
print(clean)

['what should i do after bcom?']


In [35]:
# Pitfall check: str.strip("[SEP]") treats its argument as a set of characters
# ('[', 'S', 'E', 'P', ']') to remove from both ends, not as a literal token,
# so it is not a reliable way to drop a leading "[SEP]".
"[SEP] This is the model's prediction. [SEP] bla bla bla".strip("[SEP]")

" This is the model's prediction. [SEP] bla bla bla"

In [37]:
import evaluate

# Load the BLEU metric through the `evaluate` library.
bleu = evaluate.load("bleu")

# References are nested: presumably one list of reference strings per prediction.
ref = [["what should i do after bcom?"]]
# `clean` is produced by the earlier cell that calls _extract_clean_predicted_text,
# so that cell must be run before this one.
hyp = clean
bleu_score = bleu.compute(predictions=hyp, references=ref)
print(bleu_score)

{'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 7, 'reference_length': 7}
