In [1]:
import pandas as pd
import re
import numpy as np

def parse_test_input(input_path):
    """
    Read an interlinear test file and collect its transcription/translation pairs.

    Lines whose stripped text begins with ``\\t`` carry a transcription and lines
    beginning with ``\\l`` carry a translation; every other line is ignored.
    A pair is emitted when a ``\\l`` line follows a non-empty ``\\t`` line.
    A ``\\l`` line with no (or an empty) preceding transcription is dropped.

    :param input_path: Path to the UTF-8 encoded input file
    :return: List of ``(transcription, translation)`` tuples in file order
    """
    pairs = []
    pending = None  # latest transcription that has not been paired yet

    with open(input_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()
            # Both markers are exactly two characters: a backslash plus a letter.
            marker, payload = text[:2], text[2:].strip()
            if marker == "\\t":
                pending = payload
            elif marker == "\\l" and pending:
                pairs.append((pending, payload))
                pending = None
    return pairs


def merge_with_outputs(input_txt, output_csv, merged_csv):
    """
    Append the gold transcription/translation columns to a predictions CSV.

    Rows are matched purely by position. When the predictions file has exactly
    twice as many rows as there are parsed pairs, each pair is repeated twice in
    a row (pair 0, pair 0, pair 1, pair 1, ...) before joining. Any other size
    mismatch only prints a warning and the join still happens by row index.

    :param input_txt: Path to the text file parsed by ``parse_test_input``
    :param output_csv: Path to the predictions CSV
    :param merged_csv: Path the combined CSV is written to (UTF-8, no index column)
    """
    gold = pd.DataFrame(
        parse_test_input(input_txt),
        columns=["transcription", "translation"],
    )
    predictions = pd.read_csv(output_csv)

    n_pred, n_gold = len(predictions), len(gold)
    if n_pred != n_gold and n_pred == 2 * n_gold:
        print(f"⚠️ Warning: double the outputs. Assuming segmentation and gloss")
        # Duplicate every gold row so consecutive prediction rows share one pair.
        gold = gold.loc[np.repeat(gold.index, 2)].reset_index(drop=True)
    elif n_pred != n_gold:
        print(f"⚠️ Warning: Mismatch in rows — output={n_pred}, inputs={n_gold}. Will align by index.")

    # Side-by-side join on a fresh 0..n-1 index for both frames.
    combined = pd.concat(
        [predictions.reset_index(drop=True), gold.reset_index(drop=True)],
        axis=1,
    )

    combined.to_csv(merged_csv, index=False, encoding="utf-8")
    print(f"✅ Merged CSV saved to: {merged_csv}")


# Example usage: attach the gold transcription/translation pairs from the
# nort2938 test file to the predictions of the polygloss_peft experiment.
# NOTE(review): these are absolute, machine-specific paths — confirm they exist
# on the host running this notebook.
input_txt = "/projects/enri8153/polygloss/data/nort2938/nort2938-test.txt"      # input file with \t and \l lines
output_csv = "/projects/enri8153/polygloss/experiments/polygloss_peft/nort2938/predictions.csv"    # file with predicted, reference, etc.
merged_csv = "/projects/enri8153/polygloss/experiments/polygloss_peft/nort2938/predictions_with_gt.csv"  # output path

# Writes merged_csv and prints a confirmation (plus a warning if row counts differ).
merge_with_outputs(input_txt, output_csv, merged_csv)


✅ Merged CSV saved to: /projects/enri8153/polygloss/experiments/polygloss_peft/nort2938/predictions_with_gt.csv


In [1]:
import json
import pandas as pd
# Table JSON file stored under a wandb run's files/media/table directory.
path = "/projects/enri8153/polygloss/wandb/run-20250918_163458-z71wr0x6/files/media/table/predictions_14847_bc257628e1e97d66196c.table.json"
# Load JSON
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert to DataFrame and save as CSV.
# Only the "data" and "columns" keys of the JSON are read; presumably "data" is
# a list of rows matching "columns" — verify against the file.
# The CSV is written without the index column to "output.csv", a relative path.
df = pd.DataFrame(data["data"], columns=data["columns"])
df.to_csv("output.csv", index=False)

In [2]:
import pandas as pd
from glossing import eval
# Score a saved predictions file. The CSV must provide `predicted` and
# `reference` columns, since both are read below.
df = pd.read_csv('/projects/enri8153/polygloss/experiments/igt_unsegmented_no_pretrain/dido1241/None/predictions.csv')
# `eval` here is the module imported from `glossing` (it shadows the builtin of
# the same name). As the last expression of the cell, the returned metrics are
# displayed as the cell output.
eval.evaluate_glosses(df.predicted.tolist(), df.reference.tolist()) 

[93.31455677438483, 87.31800766283524, 81.686230964467, 76.61515699490383]


{'word_level': {'average_accuracy': 0.8062821967573969,
  'accuracy': 0.7941986025831039},
 'morpheme_accuracy': {'average_accuracy': 0.7880896507878158,
  'accuracy': 0.7544560872852976},
 'classes': {'stem': {'prec': 0.7506444808999297,
   'rec': 0.7436730903180868,
   'f1': 0.7471425239094939},
  'gram': {'prec': 0.8094978165938864,
   'rec': 0.8032715848770448,
   'f1': 0.8063726822902505}},
 'bleu': 84.21675369266296}

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType
from peft import get_peft_model

# LoRA configuration for a seq2seq LM task, with inference_mode=False.
# r, lora_alpha and lora_dropout are the LoRA hyperparameters handed to peft.
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

# Tokenizer and model are both loaded from the same pretrained checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("lecslab/polygloss_byt5_2025_09_18")
model = AutoModelForSeq2SeqLM.from_pretrained("lecslab/polygloss_byt5_2025_09_18")
# Rebinds `model` to the PEFT-wrapped model returned by get_peft_model.
model = get_peft_model(model, peft_config)
# Prints the trainable / total parameter counts (see the cell output).
model.print_trainable_parameters()




  from .autonotebook import tqdm as notebook_tqdm


trainable params: 1,105,920 || all params: 582,759,168 || trainable%: 0.1898


In [1]:
import csv

def deduplicate_csv(input_file, output_file, key_columns=None):
    """
    Deduplicate rows in a CSV file, keeping the first occurrence of each key.

    All rows are read into memory before the output file is opened, so
    ``input_file`` and ``output_file`` may be the same path.

    :param input_file: Path to input CSV file (UTF-8; the first row is the header)
    :param output_file: Path to save deduplicated CSV file
    :param key_columns: List of column names and/or zero-based integer column
                        indices to use as deduplication keys.
                        If None (or empty), the entire row is used.
    :raises KeyError: if a named key column is missing from the rows
    :raises IndexError: if an integer key column is outside the header
    """
    seen = set()
    deduped_rows = []

    with open(input_file, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames  # None when the input is completely empty

        # DictReader rows are keyed by header name, so integer positions are
        # translated to names once, up front. Header names are always strings,
        # which means an int can never be a valid name on its own.
        key_names = None
        if key_columns and fieldnames is not None:
            key_names = [
                fieldnames[col] if isinstance(col, int) else col
                for col in key_columns
            ]

        for row in reader:
            # Create a unique identifier based on selected key columns or the whole row
            if key_names:
                key = tuple(row[name] for name in key_names)
            else:
                key = tuple(row.values())

            if key not in seen:
                seen.add(key)
                deduped_rows.append(row)

    # Write deduplicated rows to output
    with open(output_file, mode="w", newline="", encoding="utf-8") as outfile:
        # An empty input has no header; leave the output empty rather than
        # failing inside writeheader() with fieldnames=None.
        if fieldnames is not None:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(deduped_rows)

    print(f"Deduplicated CSV written to {output_file}")


# Example usage
# NOTE: the same path is passed as input and output, so the file is overwritten
# in place with its deduplicated contents. deduplicate_csv finishes reading
# every row before it opens the output for writing.
if __name__ == "__main__":
    deduplicate_csv("/projects/enri8153/polygloss/tsez_byt5_test.csv", "/projects/enri8153/polygloss/tsez_byt5_test.csv")  # deduplicate based on full row
    # deduplicate_csv("input.csv", "output.csv", key_columns=["id", "email"])  # deduplicate based on specific columns


Deduplicated CSV written to /projects/enri8153/polygloss/tsez_byt5_test.csv


In [7]:
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel, LoraConfig, TaskType, get_peft_model
import torch

base_model_name = "lecslab/polygloss_byt5_2025_09_18"
# NOTE(review): adapter_dir is never used below — presumably intended for
# PeftModel.from_pretrained(base_model, adapter_dir); confirm or remove.
adapter_dir = "/scratch/alpine/enri8153/polygloss-peft-dido1241/chunk_0_10/dido1241/10"
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# 1️⃣ Load the base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)

# 2️⃣ Wrap the base model with LoRA layers built from lora_config.
# get_peft_model creates new adapter layers; it does not read adapter weights
# from disk, so the trained weights have to come from the checkpoint below.
lora_model = get_peft_model(base_model, lora_config)

ckpt_path = "/scratch/alpine/enri8153/polygloss-peft-dido1241/chunk_0_10/egxgqorw.checkpoint.pt"
# 3️⃣ Load the checkpoint's state dict into the wrapped model.
# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only host;
# load_state_dict then copies the tensors into the model's existing parameters.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
# NOTE(review): the recorded "failed finding central directory" error suggests
# this file is truncated or is not a torch.save archive; assumes the file holds
# a plain state dict whose keys match lora_model — TODO confirm.
lora_model.load_state_dict(torch.load(ckpt_path, map_location="cpu"))



RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

In [None]:
# AutoModel is not imported by any cell visible in this notebook, so import it
# here. torch, get_peft_model and lora_config are expected to come from the
# previous cell.
from transformers import AutoModel

# Load the base model
model = AutoModel.from_pretrained('BAAI/bge-multilingual-gemma2')
# map_location="cpu" keeps the load working on hosts without the device the
# checkpoint was saved from.
model.load_state_dict(torch.load('base_model.pt', map_location="cpu"))

# Add the adapter layers (PEFT part), then load their saved weights.
# NOTE(review): if adapter_model.pt contains only the adapter tensors, the
# default strict load will report missing keys — confirm the file's contents.
lora_model = get_peft_model(model, lora_config)
lora_model.load_state_dict(torch.load('adapter_model.pt', map_location="cpu"))
