In [1]:
!pip install --upgrade transformers torch accelerate bitsandbytes unsloth --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.7/374.7 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.8/306.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m00:01

In [2]:
#import libraries
import os
import json
import torch
from transformers import pipeline
import pandas as pd

2025-08-11 02:56:17.962943: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754880978.131062      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754880978.179387      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Prompt sent to the judge model for every (source, translation) pair.
# It is a str.format template with two placeholders, {source_text} and
# {translation_text}. The braces of the JSON skeleton are doubled ({{ and }})
# so that str.format emits them as literal single braces instead of treating
# them as placeholders.
PROMPT_TEMPLATE = """
You are a meticulous, unbiased English-to-Filipino Translation Quality Judge.
Your analysis is based on a predefined, multi-dimensional rubric. Your task is to
evaluate a given Filipino translation based on a source English text from literature.

Evaluation Rubric:
1.  Accuracy (1-5): How faithfully is the core meaning of the source text preserved?
    - 1: Completely wrong or nonsensical.
    - 3: Core meaning is present, but with significant errors.
    - 5: Perfectly accurate.
2.  Tone & Style (1-5): How well does the translation capture the original's tone (e.g., formal, ironic, gothic, conversational)?
    - 1: Completely mismatched tone.
    - 3: Tone is recognizable but inconsistent or flawed.
    - 5: Perfectly captures the original style.
3.  Nuance & Metaphor (1-5): How well are metaphors, idioms, and subtle cultural contexts handled?
    - 1: All nuance is lost; metaphors are translated literally and nonsensically.
    - 3: Attempts to translate nuance but misses the mark or simplifies it.
    - 5: Nuances and metaphors are translated effectively, using cultural equivalents where necessary.
4.  Fluency (1-5): Does the Filipino translation read naturally and fluently on its own?
    - 1: Awkward, ungrammatical, and difficult to read.
    - 3: Grammatically correct but sounds stiff or unnatural ("translatorese").
    - 5: Perfectly fluent and natural-sounding.

Source Text (English):
"{source_text}"

Translation to Evaluate (Filipino):
"{translation_text}"

Your Task:
Provide a quantitative score for each of the four criteria. Calculate an Overall Score by averaging the four scores. Finally, write a detailed qualitative explanation for your assessment. Your explanation must be structured, objective, and reference specific words or phrases from the texts to justify your scores.

Output strictly in the following JSON format and nothing else:
```json
{{
  "scores": {{
    "accuracy": <score>,
    "tone_and_style": <score>,
    "nuance_and_metaphor": <score>,
    "fluency": <score>,
    "overall": <average_score>
  }},
  "explanation": "<Your detailed justification here.>"
}}```
"""


In [4]:
def initialize_model(model_id: str = "unsloth/llama-3-8b-Instruct-bnb-4bit"):
    """Create the text-generation pipeline used as the translation judge.

    Args:
        model_id: Model identifier forwarded to ``pipeline(model=...)``.
            Defaults to the 4-bit Llama 3 8B Instruct checkpoint, so existing
            zero-argument calls behave as before.

    Returns:
        The pipeline object on success, or ``None`` when loading fails. Any
        exception raised while building the pipeline is printed, not re-raised,
        so callers must check for ``None``.
    """
    try:
        llm_pipeline = pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": "auto"},
            device_map="auto",
        )
    except Exception as e:
        # Deliberately broad: download, CUDA and quantization errors all end up
        # here and are reported the same way.
        print(f"Error initializing model: {e}")
        # The default checkpoint is a bnb-4bit model, so bitsandbytes belongs in
        # the install hint alongside the other packages.
        print("Please ensure you have run 'pip install transformers torch accelerate bitsandbytes' and have a stable internet connection.")
        return None

    print("--- Model initialized successfully. ---")
    return llm_pipeline

In [5]:
def evaluate_translation(llm_pipeline, source_text: str, translation_text: str) -> dict | None:
    print("\n--- Preparing evaluation for Baseline Judge ---")
    
    prompt = PROMPT_TEMPLATE.format(
        source_text=source_text,
        translation_text=translation_text
    )
    messages = [{"role": "user", "content": prompt}]

    print("Sending prompt to local Llama 3 model...")
    try:
        outputs = llm_pipeline(
            messages,
            max_new_tokens=1024,
            eos_token_id=llm_pipeline.tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.1,
            top_p=0.9
        )
        llm_output_text = outputs[0]['generated_text'][-1]['content']
    except Exception as e:
        print(f"An error occurred during model inference: {e}")
        return None

    print("Parsing LLM response...")
    try:
        # 1. Find the first opening curly brace.
        start_index = llm_output_text.find('{')
        # 2. Find the last closing curly brace.
        end_index = llm_output_text.rfind('}')

        if start_index == -1 or end_index == -1:
            print("Error: Could not find a complete JSON object in the LLM response.")
            print("Received output:", llm_output_text)
            return None
        
        # 3. Extract the substring between them (inclusive).
        json_string = llm_output_text[start_index : end_index + 1]
        
        # 4. Parse the extracted string.
        evaluation_result = json.loads(json_string)
        return evaluation_result
        # -----------------------------------------------

    except json.JSONDecodeError:
        print("Error: Failed to decode JSON from the extracted substring.")
        # Print both the original output and the part we tried to parse for debugging.
        print("Original received output:", llm_output_text)
        print("Extracted substring for parsing:", json_string)
        return None

In [6]:
def main(
    input_csv: str = '/kaggle/input/valset/Datasets - Human-Labeled Validation Set.csv',
    output_json: str = 'baseline_variance_subset_results.json',
    subset_limit: int = 5,
    num_runs: int = 3,
):
    """Run the judge several times per entry to measure score variance.

    Loads the validation CSV, keeps the first ``subset_limit`` rows, evaluates
    each row ``num_runs`` times with ``evaluate_translation`` and writes all
    runs to ``output_json``.

    Args:
        input_csv: Path of the human-labelled validation CSV.
        output_json: Path of the JSON file to write.
        subset_limit: Number of leading rows to evaluate.
        num_runs: Number of evaluations per row.

    Returns:
        None. Returns early, without writing a file, if the model cannot be
        initialized or the CSV is not found.
    """
    source_col = 'Source Text (English)'
    target_col = 'Target Text (Filipino)'
    score_col = "Final Score                          (1 - lowest, 5 - highest)"

    # Initialize the model once.
    llm = initialize_model()
    if not llm:
        print("Aborting due to model initialization failure.")
        return

    # Load the validation dataset.
    try:
        df = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_csv}'.")
        return

    # Limit the dataframe to the specified subset
    df = df.head(subset_limit)

    # A renamed or missing text column would otherwise make every row look
    # "empty" and be skipped without any hint as to why.
    missing_cols = [col for col in (source_col, target_col) if col not in df.columns]
    if missing_cols:
        print(f"Warning: expected column(s) {missing_cols} not found. CSV columns: {df.columns.tolist()}")

    print(f"\n--- Running Baseline Judge Variance Test ---")
    print(f"--- Testing {num_runs} runs on the first {len(df)} entries from {input_csv} ---")

    all_results = []
    # Outer loop: iterate through each item in the subset
    for index, row in df.iterrows():
        source = row.get(source_col)
        translation = row.get(target_col)

        if pd.isna(source) or pd.isna(translation):
            print(f"Skipping row {index + 1} due to missing data.")
            continue

        print(f"\n{'='*20} Processing Entry {index + 1}/{len(df)} {'='*20}")

        variance_runs_for_this_entry = []
        # Inner loop: run the evaluation num_runs times for this single item
        for i in range(num_runs):
            print(f"--> Run {i + 1}/{num_runs}...")
            result = evaluate_translation(llm, source, translation)

            variance_runs_for_this_entry.append({
                "run_number": i + 1,
                "evaluation_result": result
            })

        # json.dump would write a missing score as a bare NaN token, which is
        # not valid JSON; store null instead.
        human_score = row.get(score_col)
        if pd.isna(human_score):
            human_score = None

        # Append the collected variance runs for this entry to the main results list
        all_results.append({
            "entry_index": index,
            "source_text": source,
            "target_text": translation,
            "human_score": human_score,
            "variance_runs": variance_runs_for_this_entry
        })

    # Save all collected results to a single JSON file.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    print(f"\n--- Variance test complete. Results saved to {output_json} ---")

In [6]:
# def main():
    
#     # --- Configuration ---
#     INPUT_CSV = '/kaggle/input/valset/Datasets - Human-Labeled Validation Set.csv'
#     OUTPUT_JSON = 'baseline_validation_results.json' # Renamed for clarity

#     llm = initialize_model()
#     if not llm:
#         print("Aborting due to model initialization failure.")
#         return

#     try:
#         df = pd.read_csv(INPUT_CSV)
#         print("CSV Columns Found:", df.columns.tolist())
#     except FileNotFoundError:
#         print(f"Error: Input file not found at '{INPUT_CSV}'. Please ensure it is in the same directory.")
#         return

#     print(f"\n--- Running Baseline Judge on {len(df)} entries from {INPUT_CSV} ---")
    
#     all_results = []
#     for index, row in df.iterrows():
#         source = row.get('Source Text (English)')
#         translation = row.get('Target Text (Filipino)')
        
#         if pd.isna(source) or pd.isna(translation):
#             print(f"Skipping row {index + 1} due to missing data.")
#             continue

#         print(f"--> Processing entry {index + 1}/{len(df)}...")
#         result = evaluate_translation(llm, source, translation)

#         # Add all relevant data for easy benchmarking later
#         all_results.append({
#             "entry_index": index,
#             "source_text": source,
#             "target_text": translation,
#             "evaluation_result": result,
#             "human_score": row.get("Final Score                          (1 - lowest, 5 - highest)"),
#             "human_explanation": row.get("Rater 1 Explanation")
#         })

#     # Save all collected results to a single JSON file.
#     with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
#         json.dump(all_results, f, ensure_ascii=False, indent=2)
        
#     print(f"\n--- Evaluation complete. Results saved to {OUTPUT_JSON} ---")

In [12]:
# def main():
#     # --- Configuration ---
#     INPUT_CSV = '/kaggle/input/valset/Datasets - Human-Labeled Validation Set.csv'
#     OUTPUT_JSON = 'baseline_test_results.json'
    
#     # Initialize the model once.
#     llm = initialize_model()
#     if not llm:
#         print("Aborting due to model initialization failure.")
#         return
    
#     # Load the test dataset.
#     try:
#         df = pd.read_csv(INPUT_CSV)
#         df.rename(columns={df.columns[0]: "English"}, inplace=True) # Clean up potential BOM character in first column name
#     except FileNotFoundError:
#         print(f"Error: Input file not found at '{INPUT_CSV}'. Please ensure it is in the same directory.")
#         return
    
#     print(f"\n--- Running Baseline Judge on {len(df)} entries from {INPUT_CSV} ---")
    
#     all_results = []
#     for index, row in df.iterrows():
#         source = row.get('English')
#         translation = row.get('Filipino')
    
#         if pd.isna(source) or pd.isna(translation):
#             continue
    
#         print(f"--> Processing entry {index + 1}/{len(df)}...")
#         result = evaluate_translation(llm, source, translation)
    
#         all_results.append({
#             "entry_index": index,
#             "source_text": source,
#             "target_text": translation,
#             "evaluation_result": result
#         })
    
#     # Save all collected results to a single JSON file.
#     with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
#         json.dump(all_results, f, ensure_ascii=False, indent=2)
        
#     print(f"\n--- Evaluation complete. Results saved to {OUTPUT_JSON} ---")

In [None]:
# def main():
#     llm = initialize_model()
#     if not llm:
#         return # Exit if model fails to initialize

#     print("\n--- Running Baseline LLM-as-a-Judge ---")

#     # source = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."
#     # translation = "Totoo at alam ng lahat na kapag ang isang lalaking single ay maraming pera, kailangan niya talaga ng asawa."

#     # source = "It's raining cats and dogs."
#     # translation = "Umuulan ng pusa at aso."

#     source = "Fear is the mind-killer. Fear is the little-death that brings total obliteration."
#     translation =  "Ang takot ay nakakamatay ng isip. Ang takot ay parang maliit na kamatayan na nagdadala ng pagkasira."
    
#     result = evaluate_translation(llm, source, translation)

#     if result:
#         print("\n--- Evaluation Complete ---")
#         print(f"Source Text: {source}")
#         print(f"Translation: {translation}")
#         print("\n--- Scores ---")
#         for criterion, score in result.get("scores", {}).items():
#             print(f"- {criterion.replace('_', ' ').title()}: {score}/5")
        
#         print("\n--- Justification ---")
#         print(result.get("explanation", "No explanation provided."))
#         print("\n-------------------------\n")

In [7]:
# Entry point: run the variance test only when this code is executed as the
# top-level module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Device set to use cuda:0


--- Model initialized successfully. ---

--- Running Baseline Judge Variance Test ---
--- Testing 3 runs on the first 5 entries from /kaggle/input/valset/Datasets - Human-Labeled Validation Set.csv ---

--> Run 1/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...
--> Run 2/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...
--> Run 3/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...

--> Run 1/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...
--> Run 2/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...
--> Run 3/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...

--> Run 1/3...

--- P

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Parsing LLM response...
--> Run 2/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...
--> Run 3/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...

--> Run 1/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...
--> Run 2/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...
--> Run 3/3...

--- Preparing evaluation for Baseline Judge ---
Sending prompt to local Llama 3 model...
Parsing LLM response...

--- Variance test complete. Results saved to baseline_variance_subset_results.json ---
