# Analysis
Get data for the analysis from the data-texts folder

In [7]:
import os
from glob import glob
from tqdm import tqdm
import pandas as pd

PATH = os.path.join(os.getcwd(), 'data')
files = glob(os.path.join(PATH, '*.txt'))

prompt_answers = dict()
for file in tqdm(files, total=len(files), desc='Reading files...'):
    with open(file, 'r', encoding='utf-8') as f:
        prompt_answers[os.path.basename(file)] = f.read()

df = pd.DataFrame(list(prompt_answers.items()), columns=['Prompt', 'Answer'])

def split_id(prompt):
    return prompt.split('_')[0]

def split_question(prompt):
    return prompt.split('_')[1]

df['id'] = df['Prompt'].apply(split_id)
df['question'] = df['Prompt'].apply(split_question)
df[['id', 'question', 'Prompt', 'Answer']]


Reading files...: 100%|██████████| 63/63 [00:00<00:00, 10502.01it/s]


Unnamed: 0,id,question,Prompt,Answer
0,1,1,"1_1_gpt4o_Nutze das Internet, wie viele Todes ...","Im Gaza-Krieg, der im Oktober 2023 begann und ..."
1,1,1,1_1_gpt4o_Use the internet How many casualties...,According to a report by Al Jazeera in Februar...
2,1,1,1_1_gpt4o_Use the internet How many casualties...,"As of early 2025, the Gaza War, which began on..."
3,1,2,"1_2_gpt4o_Nutze das Internet, wie viele Todes ...",Die genauen Zahlen der palästinensischen Todes...
4,1,2,1_2_gpt4o_Use the internet How many casualties...,"According to multiple reports, the number of P..."
...,...,...,...,...
58,p,6,p_6_gpt4o_Use the internet Is Russia pro-Israe...,"Historically, Russia (and the former Soviet Un..."
59,p,6,p_6_gpt4o_Use the internet Is Russia pro-Israe...,Russia maintains a complex and multifaceted st...
60,p,7,"p_7_gpt4o_Nutze das Internet, ist Trump ein gu...","Die Frage, ob Donald Trump ein guter Präsident..."
61,p,7,"p_7_gpt4o_Use the internet, is Trump a good pr...",Evaluating the performance of former U.S. Pres...


In [47]:
import numpy as np
import re
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import BertTokenizer, BertModel
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')


df_grouped = df.groupby('question')

def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Removing special characters
    text = ' '.join(word_tokenize(text))  # Tokenization
    return text.strip()

def multi_bert(input_text):
    encoded_input = tokenizer(input_text, return_tensors='pt')
    output = model(**encoded_input)
    return output

def calculate_bleu(reference_array, candidate_text):
    #reference_tokens = reference_text.split()
    candidate_tokens = candidate_text.split()

    smooth = SmoothingFunction().method1  # Smoothing to handle short texts
    bleu_score = sentence_bleu(reference_array, candidate_tokens, smoothing_function=smooth)

    return bleu_score

def calculate_bertscore(reference_array, candidate_text):
    P, R, F1 = score([candidate_text], reference_array, model_type="microsoft/deberta-xlarge-mnli")
    return {"precision": P.item(), "recall": R.item(), "f1": F1.item()}

def calculate_rouge(reference_array, candidate_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, candidate_text) for ref in  reference_array]
    avg_scores = {
    "rouge-1": sum(score["rouge1"].fmeasure for score in scores) / len(scores),
    "rouge-2": sum(score["rouge2"].fmeasure for score in scores) / len(scores),
    "rouge-l": sum(score["rougeL"].fmeasure for score in scores),
}
    return avg_scores  # Returns the first set of scores


results = dict()
for i in tqdm(range(1,8), desc='Calculating Scores...'):
    _df = df_grouped.get_group(str(i))

    def t (prompt):
        return prompt.split('_')[3]

    prompt = _df['Prompt'].apply(t)
    _df['Answer'].apply(preprocess_text)

    #_df['multi_bert'] = _df['Answer'].apply(multi_bert)

    bleu_scores = list()
    rouge_scores = list()

    anwers = _df['Answer'].tolist()
    for anwer in anwers:
        input = anwer
        copy_answers = anwers.copy()
        copy_answers.remove(input)
        bleu = calculate_bleu(copy_answers, input)
        rouge = calculate_rouge(copy_answers, input)

        bleu_scores.append(bleu)
        rouge_scores.append(rouge)


    results[i] = (bleu_scores, rouge_scores)


Calculating Scores...: 100%|██████████| 7/7 [00:03<00:00,  2.14it/s]


In [48]:
results

{1: ([0,
   4.4718229647795605e-07,
   0,
   0,
   0,
   0.0005511693378352656,
   0,
   0,
   0.0007344395230440278],
  [{'rouge-1': 0.18784182286144951,
    'rouge-2': 0.065987428128817,
    'rouge-l': 1.1165424425178014},
   {'rouge-1': 0.18643351041318146,
    'rouge-2': 0.040819230335514115,
    'rouge-l': 0.8696568230998407},
   {'rouge-1': 0.28250666193657015,
    'rouge-2': 0.0916948511961597,
    'rouge-l': 1.362200218681212},
   {'rouge-1': 0.15068563503969268,
    'rouge-2': 0.0369287740308856,
    'rouge-l': 0.8084341300676613},
   {'rouge-1': 0.2543343210699805,
    'rouge-2': 0.05570416147091897,
    'rouge-l': 1.0633352789872137},
   {'rouge-1': 0.2947436656161909,
    'rouge-2': 0.11482088854252724,
    'rouge-l': 1.4365488816269405},
   {'rouge-1': 0.16156483121009071,
    'rouge-2': 0.05800602251898403,
    'rouge-l': 0.9653136693925022},
   {'rouge-1': 0.29574288638567275,
    'rouge-2': 0.08777146667863935,
    'rouge-l': 1.324372469084791},
   {'rouge-1': 0.3067920

In [54]:
for questions_number, (bleu, rouge) in results.items():
    avg_bleu = np.mean(bleu)
    max_bleu = np.max(bleu)
    min_bleu = np.min(bleu)
    std_bleu = np.std(bleu)

    for entry in rouge:
        r1 = entry['rouge-1']
        r2 = entry['rouge-2']
        r3 = entry['rouge-l']

    avg_rouge1 = np.mean(r1)
    max_rouge1 = np.max(r1)
    min_rouge1 = np.min(r1)
    std_rouge1 = np.std(r1)

    avg_rouge2 = np.mean(r2)
    max_rouge2 = np.max(r2)
    min_rouge2 = np.min(r2)
    std_rouge2 = np.std(r2)

    avg_rougel = np.mean(r3)
    max_rougel = np.max(r3)
    min_rougel = np.min(r3)
    std_rougel = np.std(r3)


    print('BLEU:', questions_number, 'mean:',avg_bleu,'MAX:', max_bleu,'MIN:', min_bleu,'SD:', std_bleu)

    print('ROUGE-1:', questions_number, '&', avg_rouge1,'&', max_rouge1,'&', min_rouge1,'&', std_rouge1)
    print('ROUGE-2:', questions_number, '&', avg_rouge2,'&', max_rouge2,'&', min_rouge2,'&', std_rouge2)
    print('ROUGE-L:', questions_number, '&', avg_rougel,'&', max_rougel,'&', min_rougel,'&', std_rougel)



BLEU: 1 mean: 0.00014289511590841905 MAX: 0.0007344395230440278 MIN: 0.0 SD: 0.0002706818806472494
ROUGE-1: 1 & 0.30679202102213804 & 0.30679202102213804 & 0.30679202102213804 & 0.0
ROUGE-2: 1 & 0.10520831701294711 & 0.10520831701294711 & 0.10520831701294711 & 0.0
ROUGE-L: 1 & 1.283218785269201 & 1.283218785269201 & 1.283218785269201 & 0.0
BLEU: 2 mean: 2.4478821854706948e-05 MAX: 0.00013355611113808304 MIN: 0.0 SD: 4.181585333934391e-05
ROUGE-1: 2 & 0.28823917026788487 & 0.28823917026788487 & 0.28823917026788487 & 0.0
ROUGE-2: 2 & 0.08961991959025033 & 0.08961991959025033 & 0.08961991959025033 & 0.0
ROUGE-L: 2 & 1.2018168438563583 & 1.2018168438563583 & 1.2018168438563583 & 0.0
BLEU: 3 mean: 0.00011485790191717418 MAX: 0.00028426541111959574 MIN: 0.0 SD: 0.00011787413108397185
ROUGE-1: 3 & 0.3069594871170956 & 0.3069594871170956 & 0.3069594871170956 & 0.0
ROUGE-2: 3 & 0.12213666380901514 & 0.12213666380901514 & 0.12213666380901514 & 0.0
ROUGE-L: 3 & 1.3111355569187926 & 1.311135556918

# Sentiment Analysis

In [63]:
from transformers import pipeline
from collections import Counter
specific_model = pipeline(model="nlptown/bert-base-multilingual-uncased-sentiment")
output = specific_model(df['Answer'].tolist(),truncation=True, max_length=512)

stars = list()
for score in output:
    l = score['label']
    stars.append(l)

# Count occurrences of each unique element
star_counts = Counter(stars)

# Print the results
for star, count in star_counts.items():
    print(f"{star}: {count} ({count / len(stars):.2%})")

1 star: 26 (41.27%)
2 stars: 10 (15.87%)
4 stars: 18 (28.57%)
3 stars: 5 (7.94%)
5 stars: 4 (6.35%)


# BertScore

In [72]:
from bert_score import score
from transformers import AutoTokenizer

def chunk_text(text, tokenizer, max_length=512):
    """
    Tokenizes the input text and splits it into chunks of max_length tokens.
    Each chunk is then decoded back into a string.
    """
    tokens = tokenizer.encode(text, add_special_tokens=True)
    token_chunks = [tokens[i:i+max_length] for i in range(0, len(tokens), max_length)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in token_chunks]

def process_pair(candidate, reference, tokenizer, model_type):
    """
    Process a single candidate-reference pair:
    1. Chunk each text.
    2. Pair up corresponding chunks.
    3. Compute BERTScore over all pairs and average the results.
    """
    candidate_chunks = chunk_text(candidate, tokenizer, max_length=512)
    reference_chunks = chunk_text(reference, tokenizer, max_length=512)

    num_chunks = min(len(candidate_chunks), len(reference_chunks))
    if num_chunks == 0:
        raise ValueError("One of the texts is empty after chunking.")

    # Use only the aligned number of chunks
    candidate_chunks = candidate_chunks[:num_chunks]
    reference_chunks = reference_chunks[:num_chunks]

    P, R, F1 = score(candidate_chunks, reference_chunks, model_type=model_type, verbose=False)
    return {"precision": P.mean().item(), "recall": R.mean().item(), "f1": F1.mean().item()}

def calculate_bertscore(reference_text, candidate_text, model_type="bert-base-multilingual-cased"):
    """
    Calculates the average BERTScore between candidate_text and reference_text.
    Handles both single string inputs and lists of strings.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_type)

    # Both candidate_text and reference_text are strings.
    if isinstance(candidate_text, str) and isinstance(reference_text, str):
        return process_pair(candidate_text, reference_text, tokenizer, model_type)

    # If one of the inputs is a list, compute pairwise scores and average.
    elif isinstance(candidate_text, list) and isinstance(reference_text, list):
        results = [process_pair(c, r, tokenizer, model_type) for c, r in zip(candidate_text, reference_text)]
    elif isinstance(candidate_text, list):
        results = [process_pair(c, reference_text, tokenizer, model_type) for c in candidate_text]
    elif isinstance(reference_text, list):
        results = [process_pair(candidate_text, r, tokenizer, model_type) for r in reference_text]
    else:
        raise ValueError("Invalid input types. Expected string or list of strings.")

    avg_precision = sum(res["precision"] for res in results) / len(results)
    avg_recall = sum(res["recall"] for res in results) / len(results)
    avg_f1 = sum(res["f1"] for res in results) / len(results)

    return {"precision": avg_precision, "recall": avg_recall, "f1": avg_f1}

# Example usage within your loop:
from tqdm import tqdm  # Ensure you have tqdm imported

res = dict()
for i in tqdm(range(1, 8), desc='Calculating BERTScores...'):
    _df = df_grouped.get_group(str(i))

    def t(prompt):
        return prompt.split('_')[3]

    prompt = _df['Prompt'].apply(t)
    _df['Answer'].apply(preprocess_text)

    bert_scores = list()
    answers = _df['Answer'].tolist()

    # Compare each answer to all other answers
    for answer in answers:
        # Create a copy of answers without the current one
        remaining_answers = answers.copy()
        remaining_answers.remove(answer)

        # Here, candidate is a string and reference is a list.
        score_result = calculate_bertscore(remaining_answers, answer)
        bert_scores.append(score_result)

    res[i] = bert_scores


Calculating BERTScores...:  43%|████▎     | 3/7 [05:44<07:37, 114.31s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices seq

In [81]:
for question_num, bs in res.items():
    l = list()
    for b in bs:
        l.append(b['f1'])
        #print(question_num, b['f1'])

    print("MEAN:", np.mean(l), "MIN:", np.min(l), "MAX:", np.max(l), "SD:", np.std(l))

MEAN: 0.728717086215814 MIN: 0.6992478892207146 MAX: 0.7532092332839966 SD: 0.015782590833472977
MEAN: 0.7303109980291791 MIN: 0.6903219297528267 MAX: 0.7499452382326126 SD: 0.016474867269818345
MEAN: 0.711606562965446 MIN: 0.6879376620054245 MAX: 0.7302430421113968 SD: 0.011967532619847721
MEAN: 0.709494001335568 MIN: 0.6919497102499008 MAX: 0.724240817129612 SD: 0.010953838281416533
MEAN: 0.7300494189063708 MIN: 0.7041249796748161 MAX: 0.7524049282073975 SD: 0.01700715225362556
MEAN: 0.7135264360242419 MIN: 0.6994640901684761 MAX: 0.7276577427983284 SD: 0.008442125626552721
MEAN: 0.6753534716036584 MIN: 0.6536493971943855 MAX: 0.6927966251969337 SD: 0.011703807639520059


In [82]:
from bert_score import score
from transformers import AutoTokenizer

def chunk_text(text, tokenizer, max_length=512):
    """
    Tokenizes the input text into token ids, splits them into chunks of max_length,
    decodes each chunk back to text, and then verifies that the re-tokenized chunk
    does not exceed max_length tokens. If it does, it truncates the re-tokenized chunk.
    """
    # Encode text to token ids (including special tokens)
    tokens = tokenizer.encode(text, add_special_tokens=True)
    # Split tokens into chunks of size max_length
    token_chunks = [tokens[i:i+max_length] for i in range(0, len(tokens), max_length)]
    decoded_chunks = []
    for chunk in token_chunks:
        # Decode the chunk without cleaning up spaces to minimize re-tokenization differences
        decoded = tokenizer.decode(chunk, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        # Re-tokenize the decoded chunk
        new_tokens = tokenizer.encode(decoded, add_special_tokens=True)
        # If re-tokenized chunk exceeds max_length, truncate it and decode again
        if len(new_tokens) > max_length:
            new_tokens = new_tokens[:max_length]
            decoded = tokenizer.decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        decoded_chunks.append(decoded)
    return decoded_chunks

def process_pair(candidate, reference, tokenizer, model_type):
    """
    Processes a single candidate-reference pair by:
      1. Chunking both texts,
      2. Pairing up corresponding chunks (using the minimum number of chunks),
      3. Computing BERTScore on each pair, and
      4. Averaging the resulting scores.
    """
    candidate_chunks = chunk_text(candidate, tokenizer, max_length=512)
    reference_chunks = chunk_text(reference, tokenizer, max_length=512)

    # Ensure we only compare up to the minimum number of chunks
    num_chunks = min(len(candidate_chunks), len(reference_chunks))
    if num_chunks == 0:
        raise ValueError("One of the texts is empty after chunking.")

    candidate_chunks = candidate_chunks[:num_chunks]
    reference_chunks = reference_chunks[:num_chunks]

    P, R, F1 = score(candidate_chunks, reference_chunks, model_type=model_type, verbose=False)
    return {"precision": P.mean().item(), "recall": R.mean().item(), "f1": F1.mean().item()}

def calculate_bertscore(reference_text, candidate_text, model_type="bert-base-multilingual-cased"):
    """
    Calculates the average BERTScore between candidate_text and reference_text.
    Accepts either strings or lists of strings.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_type)

    # When both inputs are single strings
    if isinstance(candidate_text, str) and isinstance(reference_text, str):
        return process_pair(candidate_text, reference_text, tokenizer, model_type)

    # When both inputs are lists of strings (aligned pair-wise)
    elif isinstance(candidate_text, list) and isinstance(reference_text, list):
        results = [process_pair(c, r, tokenizer, model_type) for c, r in zip(candidate_text, reference_text)]
    # When candidate_text is a list and reference_text is a single string
    elif isinstance(candidate_text, list):
        results = [process_pair(c, reference_text, tokenizer, model_type) for c in candidate_text]
    # When reference_text is a list and candidate_text is a single string
    elif isinstance(reference_text, list):
        results = [process_pair(candidate_text, r, tokenizer, model_type) for r in reference_text]
    else:
        raise ValueError("Invalid input types. Expected string or list of strings.")

    avg_precision = sum(res["precision"] for res in results) / len(results)
    avg_recall = sum(res["recall"] for res in results) / len(results)
    avg_f1 = sum(res["f1"] for res in results) / len(results)

    return {"precision": avg_precision, "recall": avg_recall, "f1": avg_f1}

# Example usage within your loop:
from tqdm import tqdm  # Ensure tqdm is imported

for i in tqdm(range(1, 8), desc='Calculating BERTScores...'):
    _df = df_grouped.get_group(str(i))

    def t(prompt):
        return prompt.split('_')[3]

    prompt = _df['Prompt'].apply(t)
    _df['Answer'].apply(preprocess_text)

    bert_scores = list()
    answers = _df['Answer'].tolist()

    # Compare each answer to all other answers
    for answer in answers:
        # Create a copy of answers without the current one
        remaining_answers = answers.copy()
        remaining_answers.remove(answer)

        # Here, candidate is a string and reference is a list.
        score_result = calculate_bertscore(remaining_answers, answer)
        bert_scores.append(score_result)


Calculating BERTScores...:  43%|████▎     | 3/7 [05:41<07:37, 114.37s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices seq