In [3]:
#SET UNIVERSAL ENVIRONMENT VARIABLES
MODEL_NAME = "facebook/nllb-200-distilled-600M"

# According to the NLLB docs, English is "eng_Latn" and Georgian is "kat_Geor".
SRC_LANG = "eng_Latn"
TGT_LANG = "kat_Geor"

DATA_FILE = "train_conf.json"  # Your parallel data


In [4]:
### Load the data
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


In [28]:
### Load the parallel data from a JSON file
# The JSON file should contain a list of dictionaries with "source" and "target" keys.
import json
from datasets import Dataset

# Read the entire JSON into a list of dicts
with open(DATA_FILE, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Convert to a huggingface Dataset
dataset = Dataset.from_list(raw_data)

# Let's do a train/test split: 90% train, 10% test
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]


In [29]:
#### Preprocess the data
# We need to tokenize the input and target texts.
# The tokenizer needs to know the source and target languages.
# The tokenizer will automatically add the language codes to the input text.
# The tokenizer will also add the language codes to the target text.
def preprocess_function(examples):
    # We’ll store the “en” text in source, “ka” text in target
    src_texts = []
    tgt_texts = []

    for item in examples["translation"]:
        en_text = item["en"]
        ka_text = item["ge"]

        # For NLLB, set the correct source/target language code:
        # e.g. we want to translate from English -> Georgian:
        #   src_lang = "eng_Latn"
        #   tgt_lang = "kat_Geor"
        src_texts.append(en_text)
        tgt_texts.append(ka_text)

    # The tokenizer must know we are dealing with these language codes
    # for input and output
    tokenizer.src_lang = SRC_LANG
    tokenizer.tgt_lang = TGT_LANG

    model_inputs = tokenizer(
        src_texts,
        max_length=256,
        truncation=True,
    )

    # For seq2seq, we also prepare labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            tgt_texts,
            max_length=256,
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [30]:
## Tokenize the dataset
# We can use the map function to apply the preprocessing function to the dataset.

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["translation"]  # remove original text to keep dataset clean
)

test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["translation"]
)


Map:   0%|          | 0/640 [00:00<?, ? examples/s]



Map:   0%|          | 0/72 [00:00<?, ? examples/s]

In [31]:
###Separation and Algignemtn of talks by newline character
### This script will take the original JSON file and split the text by newlines, aligning them properly. 
### This is chosen instead of by sentence since there is not perfect algiangment between en and ge sentences

import json

def split_by_newline(text):
    """
    Splits the given text by newlines.
    Strips each line and discards empty lines.
    """
    lines = text.split("\n")
    # Clean up any trailing spaces, remove empty lines
    lines = [l.strip() for l in lines if l.strip()]
    return lines

def main():
    input_json_path = "train_conf.json"        # The input JSON array file
    output_json_path = "train_conf_line.json"  # Output

    # 1) Load the JSON array
    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        # data is a list of objects, each something like:
        # {
        #   "translation": { "ge": "...", "en": "..." }
        # }

    all_pairs = []

    # 2) For each item in the array, split by newlines
    for item_idx, item in enumerate(data):
        translation = item["translation"]
        ge_text = translation["ge"]
        en_text = translation["en"]

        # 3) Split each text into lines
        ge_lines = split_by_newline(ge_text)
        en_lines = split_by_newline(en_text)

        # 4) Pair them up
        min_len = min(len(ge_lines), len(en_lines))
        for i in range(min_len):
            all_pairs.append({
                "translation": {
                    "ge": ge_lines[i],
                    "en": en_lines[i]
                }
            })

    # 5) Save the new array of line-based pairs
    with open(output_json_path, "w", encoding="utf-8") as f_out:
        json.dump(all_pairs, f_out, ensure_ascii=False, indent=2)

    print(f"Done! Created {len(all_pairs)} line-based pairs in '{output_json_path}'.")

if __name__ == "__main__":
    main()



Done! Created 21208 line-based pairs in 'train_conf_line.json'.


In [32]:
###Separation and Algignemtn of talks by newline character
### This script will take the original JSON file and split the text by newlines, aligning them properly. 
### This is chosen instead of by sentence since there is not perfect algiangment between en and ge sentences

import json

def split_by_newline(text):
    """
    Splits the given text by newlines.
    Strips each line and discards empty lines.
    """
    lines = text.split("\n")
    # Clean up any trailing spaces, remove empty lines
    lines = [l.strip() for l in lines if l.strip()]
    return lines

def main():
    input_json_path = "test_conf.json"        # The input JSON array file
    output_json_path = "test_conf_line.json"  # Output

    # 1) Load the JSON array
    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        # data is a list of objects, each something like:
        # {
        #   "translation": { "ge": "...", "en": "..." }
        # }

    all_pairs = []

    # 2) For each item in the array, split by newlines
    for item_idx, item in enumerate(data):
        translation = item["translation"]
        ge_text = translation["ge"]
        en_text = translation["en"]

        # 3) Split each text into lines
        ge_lines = split_by_newline(ge_text)
        en_lines = split_by_newline(en_text)

        # 4) Pair them up
        min_len = min(len(ge_lines), len(en_lines))
        for i in range(min_len):
            all_pairs.append({
                "translation": {
                    "ge": ge_lines[i],
                    "en": en_lines[i]
                }
            })

    # 5) Save the new array of line-based pairs
    with open(output_json_path, "w", encoding="utf-8") as f_out:
        json.dump(all_pairs, f_out, ensure_ascii=False, indent=2)

    print(f"Done! Created {len(all_pairs)} line-based pairs in '{output_json_path}'.")

if __name__ == "__main__":
    main()

Done! Created 2497 line-based pairs in 'test_conf_line.json'.


In [24]:
# import json
# import stanza
# import subprocess
# import os

# def load_single_json(json_path):
#     """
#     If the JSON is a list with at least one object like:
#       [
#         {
#           "translation": {
#             "ge": "...",
#             "en": "..."
#           }
#         }
#       ]
#     Returns ge_text, en_text from the first item.
#     If multiple items exist, adapt as needed.
#     """
#     with open(json_path, "r", encoding="utf-8") as f:
#         data = json.load(f)  # data is a list
#     # pick first element
#     item = data[0]
#     translation = item["translation"]
#     ge_text = translation["ge"]
#     en_text = translation["en"]
#     return ge_text, en_text

# def stanza_sentence_split(text, lang="ka"):
#     """
#     Use Stanza to do sentence segmentation in the given language (lang).
#     Returns a list of sentence strings.
#     Make sure you've run (in Python) e.g.:
#       stanza.download('ka')
#       stanza.download('en')
#     before calling this function for the first time.
#     """
#     nlp = stanza.Pipeline(lang=lang, processors='tokenize', use_gpu=False)
#     doc = nlp(text)
#     sentences = []
#     for sent in doc.sentences:
#         sentences.append(sent.text.strip())
#     return sentences

# def main():
#     input_json = "train_conf.json"       # The input JSON array with "ge"/"en" text
#     output_json = "train_conf_sentence.json"

#     # 1) Load your big text from the first item in the JSON
#     ge_text, en_text = load_single_json(input_json)

#     # 2) Sentence-split with Stanza
#     ge_sentences = stanza_sentence_split(ge_text, lang="ka")
#     en_sentences = stanza_sentence_split(en_text, lang="en")

#     # 3) Write the splitted lines to temp files for HunAlign
#     ge_tempfile = "temp_ge.txt"
#     en_tempfile = "temp_en.txt"
#     with open(ge_tempfile, "w", encoding="utf-8") as gf:
#         for s in ge_sentences:
#             gf.write(s + "\n")

#     with open(en_tempfile, "w", encoding="utf-8") as ef:
#         for s in en_sentences:
#             ef.write(s + "\n")
#     new_array = []  # This will hold the final aligned sentence pairs

#     # 4) Call HunAlign
#     hunalign_bin = "/path/to/hunalign"  # or just "hunalign.exe" if in PATH
#     dictionary = "empty.dic"           # or a real dictionary file
#     cmd = [hunalign_bin, dictionary, ge_tempfile, en_tempfile, "-text"]

#     result = subprocess.run(cmd, capture_output=True, text=True)
#     aligned_output = result.stdout

#     # 5) Parse hunalign output into a final JSON array
#     # Format: ge_sentence \t en_sentence \t alignment_score
#     new_array = []
#     for line in aligned_output.split("\n"):
#         line = line.strip()
#         if not line:
#             continue
#         parts = line.split("\t")
#         if len(parts) < 2:
#             continue
#         ge_sent = parts[0].strip()
#         en_sent = parts[1].strip()
#         new_array.append({
#             "translation": {
#                 "ge": ge_sent,
#                 "en": en_sent
#             }
#         })

#     # 6) Save the aligned sentences
#     with open(output_json, "w", encoding="utf-8") as out_f:
#         json.dump(new_array, out_f, ensure_ascii=False, indent=2)

#     # optional cleanup
#     try:
#         os.remove(ge_tempfile)
#         os.remove(en_tempfile)
#     except:
#         pass

#     print(f"Created {len(new_array)} aligned sentence pairs in {output_json}")

# if __name__ == "__main__":
#     main()


In [33]:
#### Training the model
# We will use the Trainer API from Hugging Face to train the model.
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# A DataCollator will pad dynamically at batch time
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="nllb_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-5,
    warmup_steps=100,
    logging_steps=50,
    save_total_limit=1,  # keep only the last checkpoint
    fp16=True,  # if your GPU supports it
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.7372,1.416623
2,1.5428,1.332442
3,1.4488,1.307248




TrainOutput(global_step=960, training_loss=1.6152222047249476, metrics={'train_runtime': 15676.1389, 'train_samples_per_second': 0.122, 'train_steps_per_second': 0.061, 'total_flos': 1040210209013760.0, 'train_loss': 1.6152222047249476, 'epoch': 3.0})

In [34]:
#### Evaluate the model
metrics = trainer.evaluate()
print(metrics)

trainer.save_model("nllb_finetuned")
tokenizer.save_pretrained("nllb_finetuned")


{'eval_loss': 1.3072477579116821, 'eval_runtime': 67.4299, 'eval_samples_per_second': 1.068, 'eval_steps_per_second': 0.534, 'epoch': 3.0}


('nllb_finetuned\\tokenizer_config.json',
 'nllb_finetuned\\special_tokens_map.json',
 'nllb_finetuned\\sentencepiece.bpe.model',
 'nllb_finetuned\\added_tokens.json',
 'nllb_finetuned\\tokenizer.json')

In [5]:
#### Inference
# Now we can use the fine-tuned model for inference.
from transformers import pipeline

# Reload your model
model_ft = AutoModelForSeq2SeqLM.from_pretrained("nllb_finetuned")
tokenizer_ft = AutoTokenizer.from_pretrained("nllb_finetuned")

# Create a pipeline for translation
translator = pipeline(
    "translation",
    model=model_ft,
    tokenizer=tokenizer_ft,
    src_lang="eng_Latn",
    tgt_lang="kat_Geor",
)

# If needed, set the correct language codes again:
tokenizer_ft.src_lang = "en"
tokenizer_ft.tgt_lang = "ka"

test_en = "We have been promised, “Because of our covenant with God, He will never tire in His efforts to help us, and we will never exhaust His merciful patience with us.”"
result = translator(test_en, max_length=2500)
print(result[0]["translation_text"])



Device set to use cpu


და გვპირდება: "ჩვენ აღვთანხმდით ღმერთთან, რომ ის არ დაიღალებს ჩვენთან და ჩვენ არ დავმსრულებთ მის დიდებას ჩვენთან ერთად".


In [6]:
## If you want to use the original model for inference, you can do so as well.
# Load the original model again (not fine-tuned)
model_ft = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
tokenizer_ft = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")



translator = pipeline(
    "translation",
    model=model_ft,
    tokenizer=tokenizer_ft,
    src_lang="eng_Latn",
    tgt_lang="kat_Geor",
)

# If needed, set the correct language codes again:
tokenizer_ft.src_lang = "en"
tokenizer_ft.tgt_lang = "ka"

test_en = "We have been promised, “Because of our covenant with God, He will never tire in His efforts to help us, and we will never exhaust His merciful patience with us.”"
result = translator(test_en, max_length=2500)
print(result[0]["translation_text"])


Device set to use cpu


და გვპირდებოდა: "ჩვენთვის აღთქმა იყო, რომ ღმერთი არ დაიღალება თავის ძალისხმევაში და ჩვენთან ერთად ვერც კი ამოწურავთ მისი მოთმინებას".


Loaded 85 test lines from test_conf_line_copy.json
Loading model/tokenizer from nllb_finetuned...


KeyboardInterrupt: 

In [49]:
def main():
    """
    Example single entry point. 
    You can pick which main function to run or parse arguments in a real scenario.
    """
    run_translation_main_baseline()
    #run_translation_main_finetuned()   

if __name__ == "__main__":
    main()



Loaded 200 test lines from test_conf_line_copy.json
Loading model/tokenizer from facebook/nllb-200-distilled-600M...
Saved baseline outputs to baseline_outputs.txt
Saved fine-tuned outputs to finetuned_outputs.txt


In [None]:
import json
import sacrebleu
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import time
import os

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_num_threads(os.cpu_count())
print(f"Using device: {device}")
print(f"CUDA available: {torch.cuda.is_available()}")
if device == "cuda":
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
print(f"Number of CPU cores: {os.cpu_count()}")

# Define a custom Dataset class to handle source texts for translation
class TranslationDataset(Dataset):
    # Initialize the dataset with a list of source texts
    def __init__(self, src_texts):
        self.src_texts = src_texts  # Store the source texts (e.g., English sentences)
    
    # Return the total number of texts in the dataset
    def __len__(self):
        return len(self.src_texts)
    
    # Retrieve a specific text by its index
    def __getitem__(self, idx):
        return self.src_texts[idx]  # Return the text at the given index

# Function to load source and reference texts from a JSON file
def load_json_data(json_path, src_key="en", tgt_key="ge"):
    # Notify the user that JSON loading has started
    print(f"Loading JSON from {json_path}...")
    # Record the start time for performance measurement
    start_time = time.time()
    try:
        # Open and read the JSON file with UTF-8 encoding
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)  # Parse JSON into a Python object (list of dictionaries)
        # Extract source texts (e.g., English) from the "translation" field, stripping whitespace
        src_texts = [item["translation"][src_key].strip() for item in data if "translation" in item and src_key in item["translation"]]
        # Extract reference texts (e.g., Georgian) from the "translation" field, stripping whitespace
        ref_texts = [item["translation"][tgt_key].strip() for item in data if "translation" in item and tgt_key in item["translation"]]
        # Report the number of loaded texts and the time taken
        print(f"Loaded {len(src_texts)} source texts and {len(ref_texts)} reference texts in {time.time() - start_time:.2f} seconds")
        # Return the lists of source and reference texts
        return src_texts, ref_texts
    except Exception as e:
        # If an error occurs (e.g., file not found, invalid JSON), print it and return empty lists
        print(f"Error loading JSON: {e}")
        return [], []

# Function to generate translations using a specified model
def generate_translations(model_name_or_path, src_texts, batch_size=8, max_length=512, num_beams=1, src_lang="eng_Latn", tgt_lang="kat_Geor"):
    # Announce the start of the translation process with model and language details
    print(f"\nStarting translations with {model_name_or_path} from {src_lang} to {tgt_lang}...")
    # Record the start time for the entire translation process
    start_time = time.time()

    # Attempt to load the tokenizer for the specified model
    print(f"Loading tokenizer from {model_name_or_path}...")
    try:
        # Load the tokenizer with the source language specified (e.g., English as "eng_Latn")
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, src_lang=src_lang)
        print("Tokenizer loaded")  # Confirm successful loading
    except Exception as e:
        # If loading fails (e.g., model not found), print the error and return an empty list
        print(f"Error loading tokenizer: {e}")
        return []

    # Attempt to load the sequence-to-sequence model
    print(f"Loading model from {model_name_or_path}...")
    try:
        # Load the model and move it to the selected device (CPU or GPU)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path).to(device)
        # Set the model to evaluation mode (disables training-specific operations like dropout)
        model.eval()
        # Report successful loading with the time taken
        print(f"Model loaded on {device} in {time.time() - start_time:.2f} seconds")
    except Exception as e:
        # If loading fails, print the error and return an empty list
        print(f"Error loading model: {e}")
        return []

    # Get the token ID for the target language (e.g., "kat_Geor" for Georgian)
    try:
        # Use the tokenizer’s language code mapping to get the ID for the target language
        tgt_lang_id = tokenizer.lang_code_to_id[tgt_lang]
        # Display the target language and its corresponding ID
        print(f"Target language '{tgt_lang}' ID: {tgt_lang_id}")
    except AttributeError:
        # If the tokenizer lacks lang_code_to_id (older versions), try a workaround
        print("Error: Tokenizer does not support lang_code_to_id. Attempting workaround...")
        # Attempt to get the ID using convert_tokens_to_ids
        tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
        # Validate the ID (NLLB language IDs are typically high, e.g., >256000)
        if tgt_lang_id is None or tgt_lang_id < 256000:
            print(f"Error: Could not find valid ID for '{tgt_lang}' (got {tgt_lang_id}). Aborting.")
            return []
        print(f"Workaround target language '{tgt_lang}' ID: {tgt_lang_id}")
    except KeyError:
        # If the target language isn’t in the tokenizer’s vocabulary, abort
        print(f"Error: '{tgt_lang}' not found in tokenizer's language codes. Aborting.")
        return []

    # Create a dataset object from the source texts
    dataset = TranslationDataset(src_texts)
    # Report the number of entries in the dataset
    print(f"Dataset size: {len(dataset)} entries")
    # Create a DataLoader to batch the dataset for processing
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    # Report the batch size and total number of batches
    print(f"DataLoader created with batch_size={batch_size}, num_batches={len(dataloader)}")

    # Initialize an empty list to store translated outputs
    outputs = []
    # Disable gradient computation for faster inference
    with torch.no_grad():
        # Iterate over batches with a progress bar
        for i, batch in enumerate(tqdm(dataloader, desc=f"Translating with {model_name_or_path}")):
            # Record the start time for this batch
            batch_start = time.time()
            # Report the current batch number and size
            print(f"\nProcessing batch {i+1}/{len(dataloader)} with {len(batch)} texts")
            
            # Tokenize the batch of texts into tensors, padding/truncating as needed
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            # Move the tokenized inputs to the selected device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            # Print the shape of the tokenized input (batch_size x sequence_length)
            print(f"Batch tokenized, input shape: {inputs['input_ids'].shape}")

            try:
                # Generate translations using the model
                gen_tokens = model.generate(
                    **inputs,  # Pass tokenized inputs
                    max_new_tokens=300,  # Limit output length to 300 new tokens
                    num_beams=num_beams,  # Use greedy decoding (num_beams=1) for speed
                    forced_bos_token_id=tgt_lang_id  # Force the output to start with the Georgian language token
                )
                # Decode the generated tokens into human-readable text, skipping special tokens
                translated = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
                # Add the translated texts to the output list
                outputs.extend(translated)
                # Report the time taken for this batch
                print(f"Batch {i+1} translated in {time.time() - batch_start:.2f} seconds")
            except Exception as e:
                # If generation fails (e.g., memory error), print the error and return partial results
                print(f"Error during generation: {e}")
                return outputs

    # Report the total time taken for this model’s translations
    print(f"Finished {model_name_or_path} in {time.time() - start_time:.2f} seconds")
    # Return the list of all translated texts
    return outputs

# Main function to orchestrate the translation comparison
def main():
    # Announce the start of the comparison process
    print("Starting translation comparison...")
    # Record the overall start time
    overall_start = time.time()

    # Define configuration variables
    test_json = "test_conf_line_copy.json"  # Path to the JSON file with test data
    baseline_model = "facebook/nllb-200-distilled-600M"  # Pretrained NLLB model
    finetuned_model = "nllb_finetuned"  # Path to a finetuned model (needs replacement if valid)
    output_file = "translation_comparison.tsv"  # Output file for results
    batch_size = 8  # Number of texts to process per batch

    # Load English source texts and Georgian reference texts from JSON
    en_texts, ge_references = load_json_data(test_json)
    # If loading fails or no texts are found, exit
    if not en_texts or not ge_references:
        print("Failed to load data. Exiting.")
        return
    # Show a preview of the first source and reference texts
    print(f"Sample source text: {en_texts[0][:50]}...")
    print(f"Sample reference text: {ge_references[0][:50]}...")

    # Run translations with the baseline model
    print("\nRunning baseline model...")
    baseline_outputs = generate_translations(baseline_model, en_texts, batch_size)
    # Check if the number of outputs matches the input count
    if len(baseline_outputs) != len(en_texts):
        print(f"Warning: Baseline outputs ({len(baseline_outputs)}) don’t match inputs ({len(en_texts)})")
    else:
        # Show a preview of the first baseline translation
        print(f"Baseline sample output: {baseline_outputs[0][:50]}...")

    # Run translations with the finetuned model
    print("\nRunning finetuned model...")
    finetuned_outputs = generate_translations(finetuned_model, en_texts, batch_size)
    # Check if the number of outputs matches the input count
    if len(finetuned_outputs) != len(en_texts):
        print(f"Warning: Finetuned outputs ({len(finetuned_outputs)}) don’t match inputs ({len(en_texts)})")
    else:
        # Show a preview of the first finetuned translation
        print(f"Finetuned sample output: {finetuned_outputs[0][:50]}...")

    # Write the results to a TSV file
    print(f"\nWriting results to {output_file}...")
    try:
        # Open the output file in write mode with UTF-8 encoding
        with open(output_file, "w", encoding="utf-8") as f:
            # Write the header row
            f.write("Source (EN)\tBaseline (GE)\tFinetuned (GE)\tReference (GE)\n")
            # Write each set of source, baseline, finetuned, and reference texts as a tab-separated row
            for src, base, fine, ref in zip(en_texts, baseline_outputs, finetuned_outputs, ge_references):
                f.write(f"{src}\t{base}\t{fine}\t{ref}\n")
        print("Results saved successfully")  # Confirm successful write
    except Exception as e:
        # If writing fails (e.g., permission error), print the error
        print(f"Error writing file: {e}")

    # Calculate and display BLEU scores for evaluation
    print("\nCalculating BLEU scores...")
    try:
        # Compute BLEU score for baseline translations against reference texts
        baseline_bleu = sacrebleu.corpus_bleu(baseline_outputs, [ge_references])
        # Compute BLEU score for finetuned translations against reference texts
        finetuned_bleu = sacrebleu.corpus_bleu(finetuned_outputs, [ge_references])
        # Print the scores rounded to 2 decimal places
        print(f"Baseline BLEU: {baseline_bleu.score:.2f}")
        print(f"Finetuned BLEU: {finetuned_bleu.score:.2f}")
    except Exception as e:
        # If BLEU calculation fails (e.g., empty outputs), print the error
        print(f"Error calculating BLEU: {e}")

    # Report the total runtime for the entire process
    print(f"\nTotal runtime: {time.time() - overall_start:.2f} seconds")

# Entry point: run the main function if this script is executed directly
if __name__ == "__main__":
    main()

Using device: cpu
CUDA available: False
Number of CPU cores: 22
Starting translation comparison...
Loading JSON from test_conf_line_copy.json...
Loaded 85 source texts and 85 reference texts in 0.00 seconds
Sample source text: Stop it!...
Sample reference text: შეჩერდით!...

Running baseline model...

Starting translations with facebook/nllb-200-distilled-600M from eng_Latn to kat_Geor...
Loading tokenizer from facebook/nllb-200-distilled-600M...
Tokenizer loaded
Loading model from facebook/nllb-200-distilled-600M...
Model loaded on cpu in 3.99 seconds
Error: Tokenizer does not support lang_code_to_id. Attempting workaround...
Workaround target language 'kat_Geor' ID: 256086
Dataset size: 85 entries
DataLoader created with batch_size=8, num_batches=11


Translating with facebook/nllb-200-distilled-600M:   0%|          | 0/11 [00:00<?, ?it/s]


Processing batch 1/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 174])


Translating with facebook/nllb-200-distilled-600M:   9%|▉         | 1/11 [01:04<10:48, 64.84s/it]

Batch 1 translated in 64.84 seconds

Processing batch 2/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 100])


Translating with facebook/nllb-200-distilled-600M:  18%|█▊        | 2/11 [01:35<06:43, 44.79s/it]

Batch 2 translated in 30.76 seconds

Processing batch 3/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 63])


Translating with facebook/nllb-200-distilled-600M:  27%|██▋       | 3/11 [01:45<03:49, 28.69s/it]

Batch 3 translated in 9.52 seconds

Processing batch 4/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 58])


Translating with facebook/nllb-200-distilled-600M:  36%|███▋      | 4/11 [01:53<02:23, 20.53s/it]

Batch 4 translated in 8.01 seconds

Processing batch 5/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 42])


Translating with facebook/nllb-200-distilled-600M:  45%|████▌     | 5/11 [03:03<03:51, 38.62s/it]

Batch 5 translated in 70.70 seconds

Processing batch 6/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 142])


Translating with facebook/nllb-200-distilled-600M:  55%|█████▍    | 6/11 [04:20<04:17, 51.43s/it]

Batch 6 translated in 76.30 seconds

Processing batch 7/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 138])


Translating with facebook/nllb-200-distilled-600M:  64%|██████▎   | 7/11 [05:10<03:24, 51.14s/it]

Batch 7 translated in 50.53 seconds

Processing batch 8/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 172])


Translating with facebook/nllb-200-distilled-600M:  73%|███████▎  | 8/11 [05:44<02:17, 45.71s/it]

Batch 8 translated in 34.07 seconds

Processing batch 9/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 21])


Translating with facebook/nllb-200-distilled-600M:  82%|████████▏ | 9/11 [05:56<01:10, 35.24s/it]

Batch 9 translated in 12.24 seconds

Processing batch 10/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 84])


Translating with facebook/nllb-200-distilled-600M:  91%|█████████ | 10/11 [06:19<00:31, 31.31s/it]

Batch 10 translated in 22.51 seconds

Processing batch 11/11 with 5 texts
Batch tokenized, input shape: torch.Size([5, 118])


Translating with facebook/nllb-200-distilled-600M: 100%|██████████| 11/11 [07:00<00:00, 38.20s/it]

Batch 11 translated in 40.64 seconds
Finished facebook/nllb-200-distilled-600M in 424.15 seconds
Baseline sample output: ჟრწ!...

Running finetuned model...

Starting translations with nllb_finetuned from eng_Latn to kat_Geor...
Loading tokenizer from nllb_finetuned...





Tokenizer loaded
Loading model from nllb_finetuned...
Model loaded on cpu in 3.34 seconds
Error: Tokenizer does not support lang_code_to_id. Attempting workaround...
Workaround target language 'kat_Geor' ID: 256086
Dataset size: 85 entries
DataLoader created with batch_size=8, num_batches=11


Translating with nllb_finetuned:   0%|          | 0/11 [00:00<?, ?it/s]


Processing batch 1/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 174])


Translating with nllb_finetuned:   9%|▉         | 1/11 [00:44<07:27, 44.73s/it]

Batch 1 translated in 44.73 seconds

Processing batch 2/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 100])


Translating with nllb_finetuned:  18%|█▊        | 2/11 [01:07<04:47, 31.97s/it]

Batch 2 translated in 23.04 seconds

Processing batch 3/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 63])


Translating with nllb_finetuned:  27%|██▋       | 3/11 [01:24<03:20, 25.09s/it]

Batch 3 translated in 16.91 seconds

Processing batch 4/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 58])


Translating with nllb_finetuned:  36%|███▋      | 4/11 [01:41<02:31, 21.68s/it]

Batch 4 translated in 16.45 seconds

Processing batch 5/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 42])


Translating with nllb_finetuned:  45%|████▌     | 5/11 [01:55<01:54, 19.12s/it]

Batch 5 translated in 14.56 seconds

Processing batch 6/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 142])


Translating with nllb_finetuned:  55%|█████▍    | 6/11 [02:33<02:06, 25.34s/it]

Batch 6 translated in 37.43 seconds

Processing batch 7/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 138])


Translating with nllb_finetuned:  64%|██████▎   | 7/11 [03:20<02:10, 32.61s/it]

Batch 7 translated in 47.57 seconds

Processing batch 8/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 172])


Translating with nllb_finetuned:  73%|███████▎  | 8/11 [04:19<02:02, 40.84s/it]

Batch 8 translated in 58.45 seconds

Processing batch 9/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 21])


Translating with nllb_finetuned:  82%|████████▏ | 9/11 [04:24<00:59, 29.65s/it]

Batch 9 translated in 5.05 seconds

Processing batch 10/11 with 8 texts
Batch tokenized, input shape: torch.Size([8, 84])


Translating with nllb_finetuned:  91%|█████████ | 10/11 [04:55<00:30, 30.11s/it]

Batch 10 translated in 31.12 seconds

Processing batch 11/11 with 5 texts
Batch tokenized, input shape: torch.Size([5, 118])


Translating with nllb_finetuned: 100%|██████████| 11/11 [05:32<00:00, 30.23s/it]

Batch 11 translated in 37.19 seconds
Finished nllb_finetuned in 335.86 seconds





Finetuned sample output: ჟრწ!...

Writing results to translation_comparison.tsv...
Results saved successfully

Calculating BLEU scores...
Baseline BLEU: 8.88
Finetuned BLEU: 11.87

Total runtime: 760.55 seconds


In [4]:
import pandas as pd
from comet import download_model, load_from_checkpoint
import time
import os
import numpy as np
from scipy.stats import ttest_rel

def load_tsv_data(tsv_path):
    print(f"Loading TSV from {tsv_path}...")
    start_time = time.time()
    try:
        df = pd.read_csv(tsv_path, sep="\t", encoding="utf-8")
        src_texts = df["Source (EN)"].tolist()
        baseline_texts = df["Baseline (GE)"].tolist()
        finetuned_texts = df["Finetuned (GE)"].tolist()
        ref_texts = df["Reference (GE)"].tolist()
        if not (len(src_texts) == len(baseline_texts) == len(finetuned_texts) == len(ref_texts)):
            raise ValueError("Mismatch in number of entries across columns")
        print(f"Loaded {len(src_texts)} entries in {time.time() - start_time:.2f} seconds")
        return src_texts, baseline_texts, finetuned_texts, ref_texts
    except Exception as e:
        print(f"Error loading TSV: {e}")
        return [], [], [], []

def compute_xcomet_scores(src_texts, hyp_texts, ref_texts, model_path="Unbabel/wmt22-comet-da"):
    print(f"\nComputing XCOMET scores with {model_path}...")
    start_time = time.time()
    try:
        print("Downloading model...")
        checkpoint_path = download_model(model_path)
        print("Loading model from checkpoint...")
        model = load_from_checkpoint(checkpoint_path)
        print("XCOMET model loaded")
    except Exception as e:
        print(f"Error loading XCOMET model: {e}")
        return None
    data = [{"src": src, "mt": hyp, "ref": ref} for src, hyp, ref in zip(src_texts, hyp_texts, ref_texts)]
    try:
        print("Starting prediction...")
        scores = model.predict(data, batch_size=8, gpus=0, progress_bar=True)
        print(f"XCOMET scores computed in {time.time() - start_time:.2f} seconds")
        return scores
    except Exception as e:
        print(f"Error computing XCOMET scores: {e}")
        return None

def write_scores_to_file(scores, model_name, output_file):
    try:
        with open(output_file, "a", encoding="utf-8") as f:
            f.write(f"\n{model_name} XCOMET Scores:\n")
            f.write(f"System-level score: {scores.system_score:.6f}\n")
            avg_score = sum(scores.scores) / len(scores.scores)
            f.write(f"Average segment-level score: {avg_score:.6f}\n")
            f.write("Individual segment scores:\n")
            for i, score in enumerate(scores.scores):
                f.write(f"Sentence {i+1}: {score:.6f}\n")
    except Exception as e:
        print(f"Error writing to file: {e}")

def save_texts_to_files(src_texts, baseline_texts, finetuned_texts, ref_texts):
    with open("src.en.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(src_texts))
    with open("baseline.ge.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(baseline_texts))
    with open("finetuned.ge.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(finetuned_texts))
    with open("ref.ge.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(ref_texts))
    print("Text files saved: src.en.txt, baseline.ge.txt, finetuned.ge.txt, ref.ge.txt")

def compute_statistical_significance(baseline_scores, finetuned_scores, output_file, num_splits=300, sample_ratio=0.4):
    """
    Replaces comet-compare: computes paired t-test and bootstrap resampling on XCOMET scores.
    Args:
        baseline_scores: XCOMET scores object for baseline system
        finetuned_scores: XCOMET scores object for finetuned system
        output_file: File to write results
        num_splits: Number of bootstrap resamples (matches comet-compare default)
        sample_ratio: Fraction of data to sample per split (matches comet-compare default)
    """
    print("\nComputing statistical significance...")
    start_time = time.time()
    
    # Extract segment-level scores
    baseline_seg_scores = np.array(baseline_scores.scores)
    finetuned_seg_scores = np.array(finetuned_scores.scores)
    
    # System-level scores
    baseline_sys_score = baseline_scores.system_score
    finetuned_sys_score = finetuned_scores.system_score
    
    # Paired t-test
    t_stat, p_value_ttest = ttest_rel(baseline_seg_scores, finetuned_seg_scores)
    
    # Bootstrap resampling
    differences = finetuned_seg_scores - baseline_seg_scores
    observed_mean_diff = np.mean(differences)
    n_samples = int(len(differences) * sample_ratio)
    resample_means = []
    np.random.seed(1)  # Match comet-compare’s default seed
    for _ in range(num_splits):
        resample = np.random.choice(differences, size=n_samples, replace=True)
        resample_means.append(np.mean(resample))
    p_value_bootstrap = np.mean(np.array(resample_means) <= 0)  # One-sided: Finetuned better
    
    # Write results
    with open(output_file, "a", encoding="utf-8") as f:
        f.write("\nStatistical Significance Results:\n")
        f.write(f"System 1 (Baseline): {baseline_sys_score:.4f}\n")
        f.write(f"System 2 (Finetuned): {finetuned_sys_score:.4f}\n")
        f.write(f"Paired T-Test: t-statistic = {t_stat:.2f}, p-value = {p_value_ttest:.3f}\n")
        f.write(f"Bootstrap Resampling (n={num_splits}, ratio={sample_ratio}): p-value = {p_value_bootstrap:.3f}\n")
    
    # Print results
    print(f"System 1 (Baseline): {baseline_sys_score:.4f}")
    print(f"System 2 (Finetuned): {finetuned_sys_score:.4f}")
    print(f"Paired T-Test: t-statistic = {t_stat:.2f}, p-value = {p_value_ttest:.3f}")
    print(f"Bootstrap Resampling (n={num_splits}, ratio={sample_ratio}): p-value = {p_value_bootstrap:.3f}")
    print(f"Statistical significance computed in {time.time() - start_time:.2f} seconds")

def main():
    print("Starting XCOMET scoring and comparison...")
    overall_start = time.time()
    tsv_path = "translation_comparison.tsv"
    output_file = "xcomet_scores.txt"
    
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("XCOMET Scoring Results\n")
        f.write(f"Run started at: {time.ctime()}\n")

    src_texts, baseline_texts, finetuned_texts, ref_texts = load_tsv_data(tsv_path)
    if not src_texts:
        print("Failed to load data. Exiting.")
        return

    baseline_scores = compute_xcomet_scores(src_texts, baseline_texts, ref_texts)
    if baseline_scores is not None:
        write_scores_to_file(baseline_scores, "Baseline", output_file)
        print(f"Baseline XCOMET scores written to {output_file}")

    finetuned_scores = compute_xcomet_scores(src_texts, finetuned_texts, ref_texts)
    if finetuned_scores is not None:
        write_scores_to_file(finetuned_scores, "Finetuned", output_file)
        print(f"Finetuned XCOMET scores written to {output_file}")

    save_texts_to_files(src_texts, baseline_texts, finetuned_texts, ref_texts)
    
    # Replace run_comet_compare with our custom function
    if baseline_scores is not None and finetuned_scores is not None:
        compute_statistical_significance(baseline_scores, finetuned_scores, output_file)

    total_time = time.time() - overall_start
    print(f"\nTotal runtime: {total_time:.2f} seconds")
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(f"\nTotal runtime: {total_time:.2f} seconds\n")

if __name__ == "__main__":
    main()

Starting XCOMET scoring and comparison...
Loading TSV from translation_comparison.tsv...
Loaded 85 entries in 0.02 seconds

Computing XCOMET scores with Unbabel/wmt22-comet-da...
Downloading model...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading model from checkpoint...


Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbren\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
Encoder model frozen.
c:\Users\jbren\AppData\Local\R-MINI~1\envs\stat486\lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


XCOMET model loaded
Starting prediction...


Predicting DataLoader 0: 100%|██████████| 11/11 [00:30<00:00,  2.76s/it]

XCOMET scores computed in 40.68 seconds
Baseline XCOMET scores written to xcomet_scores.txt

Computing XCOMET scores with Unbabel/wmt22-comet-da...
Downloading model...





Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading model from checkpoint...


Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbren\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
Encoder model frozen.
c:\Users\jbren\AppData\Local\R-MINI~1\envs\stat486\lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


XCOMET model loaded
Starting prediction...


Predicting DataLoader 0: 100%|██████████| 11/11 [00:32<00:00,  3.00s/it]

XCOMET scores computed in 41.51 seconds
Finetuned XCOMET scores written to xcomet_scores.txt
Text files saved: src.en.txt, baseline.ge.txt, finetuned.ge.txt, ref.ge.txt

Computing statistical significance...
System 1 (Baseline): 0.7350
System 2 (Finetuned): 0.7655
Paired T-Test: t-statistic = -1.76, p-value = 0.082
Bootstrap Resampling (n=300, ratio=0.4): p-value = 0.093
Statistical significance computed in 0.03 seconds

Total runtime: 82.25 seconds





In [11]:
##SAVE MODEL##
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model

model_to_save.save_pretrained(
    "./my_nllb_finetuned_split_conf",
    safe_serialization=True,     # will save model.safetensors
    max_shard_size="2GB"        # optional: shard to smaller files
)

tokenizer.save_pretrained("./my_nllb_finetuned")

('./my_nllb_finetuned\\tokenizer_config.json',
 './my_nllb_finetuned\\special_tokens_map.json',
 './my_nllb_finetuned\\sentencepiece.bpe.model',
 './my_nllb_finetuned\\added_tokens.json',
 './my_nllb_finetuned\\tokenizer.json')

In [12]:
##LOAD PREVIOUSLY SAVED MODEL##
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("./my_nllb_finetuned")
tokenizer = AutoTokenizer.from_pretrained("./my_nllb_finetuned")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]