In [1]:
#cell 0
!pip install unbabel-comet

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple


In [2]:
#cell 1
import torch

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("✅ GPU is available!")
    print("GPU Name:", torch.cuda.get_device_name(0))

    # Perform a small tensor operation to confirm it's using GPU
    x = torch.rand(10000, 10000).to(device)
    y = torch.rand(10000, 10000).to(device)
    z = torch.matmul(x, y)  # Matrix multiplication on GPU
    print("GPU test operation successful! 🚀")
else:
    print("❌ No GPU detected. Running on CPU.")


✅ GPU is available!
GPU Name: NVIDIA A40
GPU test operation successful! 🚀


In [3]:
#cell 2
import pandas as pd
import os
import time
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from comet import download_model, load_from_checkpoint
from scipy.stats import linregress
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import logging
from scipy.stats import linregress

In [4]:
#cell 3 Setup logging
logging.basicConfig(
    filename="/content/drive/MyDrive/24/comet_outputs/comet_evaluation.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def log_and_print(message):
    print(message)
    logging.info(message)

In [5]:

#cell 5 Setup output directory in Google Drive
output_dir = "afra_outputs"
os.makedirs(output_dir, exist_ok=True)

# Function to log and print messages
def log_and_print(message):
    print(message)

# Handle missing hparams file
def handle_missing_hparams(model_path, model_name):
    hparams_path = os.path.join(os.path.dirname(model_path), "hparams.yaml")
    if not os.path.exists(hparams_path):
        with open(hparams_path, "w") as f:
            f.write("dummy_hparams: true\n")
        log_and_print(f"Dummy hparams.yaml created for {model_name}")



In [6]:
# cell 6 Create necessary directories
def ensure_directory_exists(path):
    os.makedirs(path, exist_ok=True)

# Load COMET Models
def load_comet_models():
    model_names = ["wmt20-comet-da", "wmt21-comet-da", "Unbabel/wmt22-comet-da"]
    comet_models = {}
    for model_name in model_names:
        try:
            model_ckpt_path = download_model(model_name)
            model = load_from_checkpoint(model_ckpt_path)
            model.eval()
            if torch.cuda.is_available():
                model = model.cuda()
            comet_models[model_name] = model
            log_and_print(f"{model_name} loaded successfully.")
        except Exception as e:
            log_and_print(f"Skipping {model_name}: {str(e)}")
    return comet_models

comet_models = load_comet_models()

wmt20-comet-da is already in cache.
Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../u/nzawad/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`
Encoder model frozen.
/u/nzawad/.local/lib/python3.9/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
wmt21-comet-da is already in cache.


wmt20-comet-da loaded successfully.


Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../u/nzawad/.cache/torch/unbabel_comet/wmt21-comet-da/checkpoints/model.ckpt`
Encoder model frozen.


wmt21-comet-da loaded successfully.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../u/nzawad/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/f49d328952c3470eff6bb6f545d62bfdb6e66304/checkpoints/model.ckpt`
Encoder model frozen.


Unbabel/wmt22-comet-da loaded successfully.


In [7]:
# cell 7 calculate_statistics
def calculate_statistics(results):
    """ Compute aggregated COMET statistics and save the results. """
    metrics = results.groupby(["Language Pair", "Model", "Tool"]).agg(
        Mean_COMET=("COMET Score", "mean"),
        Std_Dev=("COMET Score", "std"),
        Min_COMET=("COMET Score", "min"),
        Max_COMET=("COMET Score", "max"),
    ).reset_index()

    # Save statistics
    metrics.to_csv(f"{output_dir_sample}/Aggregated_COMET_Statistics_Arabic.csv", index=False)
    print("✅ Aggregated COMET statistics saved.")


In [8]:
# cell 8: visualize_comet_scores
import matplotlib.pyplot as plt
import seaborn as sns
import os

def visualize_comet_scores(results):
    """ Generate boxplots and heatmaps for COMET scores. """
    vis_dir = f"{output_dir_sample}/Model_Performance_Visualizations"
    os.makedirs(vis_dir, exist_ok=True)

    # Boxplot: COMET Scores by Model and Language Pair
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=results, x="Model", y="COMET Score", hue="Language Pair")
    plt.title("COMET Score Distributions by Model and Language Pair (Arabic)")
    plt.xlabel("COMET Model")
    plt.ylabel("COMET Score")
    plt.xticks(rotation=45)
    plt.legend(title="Language Pair")
    plt.tight_layout()
    plt.savefig(f"{vis_dir}/Model_COMET_Boxplot_Arabic.png")
    plt.close()

    # Heatmap: COMET Score Averages by Tool and Model
    heatmap_data = results.pivot_table(index="Tool", columns="Model", values="COMET Score", aggfunc="mean")
    plt.figure(figsize=(10, 8))
    sns.heatmap(heatmap_data, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
    plt.title("Average COMET Scores Across Models and Tools (Arabic)")
    plt.xlabel("COMET Model")
    plt.ylabel("Translation Tool")
    plt.tight_layout()
    plt.savefig(f"{vis_dir}/Model_COMET_Heatmap_Arabic.png")
    plt.close()

    print("✅ Visualization files saved.")


**For all**

In [9]:

#  cell 9: Define Paths Dynamically for One Language Pair
file_paths = {
    "English→Arabic": {
        "source": "afra_data/en_annotations_binary_complete.tsv",
        "translation": "afra_data/EN_AR_GOOGLE_binary_complete.tsv",
        "reference": "afra_data/ar_annotations_binary_complete.tsv"
    }
}

def count_rows(file_path):
    """Counts the number of rows in a TSV file, handling potential errors."""
    try:
        # Using pandas to read the file and count rows
        df = pd.read_csv(file_path, delimiter="\t", header=None, encoding="utf-8",
                         dtype=str, keep_default_na=False, on_bad_lines="warn")  # Fix: Updated deprecated argument
        num_rows = len(df)
        print(f"✅ File: {file_path}, Row Count: {num_rows}")  # Print the row count
        return df  # Return the DataFrame instead of just row count
    except Exception as e:
        print(f"⚠️ Error reading {file_path}: {type(e).__name__}, {e}")
        return None

# Loop through file paths and print row counts
for lang_pair, files in file_paths.items():
    print(f"\n📊 File Row Counts for {lang_pair}:")
    for file_type, file_path in files.items():
        df = count_rows(file_path)  # Call count_rows to print and get the DataFrame
        if df is not None:
            file_paths[lang_pair][file_type] = df  # Store DataFrame instead of path

# Assign DataFrames to variables for further use
lang_pair = "English→Arabic"
data_info = file_paths[lang_pair]

source_data = data_info["source"]  # Already a DataFrame
translation_data = data_info["translation"]  # Already a DataFrame
reference_data = data_info["reference"]  # Already a DataFrame






📊 File Row Counts for English→Arabic:
✅ File: afra_data/en_annotations_binary_complete.tsv, Row Count: 11494
✅ File: afra_data/EN_AR_GOOGLE_binary_complete.tsv, Row Count: 11494
✅ File: afra_data/ar_annotations_binary_complete.tsv, Row Count: 11494


In [10]:

# cell 10: Evaluate COMET Scores
# Ensure all three datasets have the same length
min_len = min(len(source_data), len(translation_data), len(reference_data))
source_data = source_data.iloc[:min_len]
translation_data = translation_data.iloc[:min_len]
reference_data = reference_data.iloc[:min_len]


#  Evaluate COMET Scores
def evaluate_comet(source, translation, reference, model, model_name, lang_pair):
    """Computes COMET scores and ensures correct alignment of input data."""
    results = []

    for i in range(len(source)):
        inputs = {"src": source[i], "mt": translation[i], "ref": reference[i]}
        predictions = model.predict([inputs], batch_size=1)  # Run prediction

        # ✅ Fix: Handle different COMET output formats
        if isinstance(predictions[0], dict):
            score = predictions[0]["score"]  # Expected format: dictionary
        elif isinstance(predictions[0], list):
            score = predictions[0][0]  # Alternative format: list
        else:
            raise ValueError(f"Unexpected COMET output format: {predictions}")

        results.append({
            "Source": source[i],
            "Reference": reference[i],
            "Translation": translation[i],
            "COMET Score": score,
            "Language Pair": lang_pair,
            "Model": model_name
        })

        # Save partial results every 1000 sentences
        if len(results) % 1000 == 0:
            temp_df = pd.DataFrame(results)
            temp_df.to_csv(f"{output_dir}/partial_results_{lang_pair}_{model_name}.csv", index=False)
            print(f"✅ Saved partial results for {lang_pair} with {model_name}.")

    return pd.DataFrame(results)



In [11]:
import os

os.environ["SLURM_NTASKS_PER_NODE"] = "16"  # Set to desired value

In [None]:
 #Cell 11: Run Evaluation for One Language Pair
start_time = time.time()
all_results = pd.DataFrame()

for model_name, model in comet_models.items():
    log_and_print(f"Evaluating {lang_pair} with {model_name}...")
    results = evaluate_comet(source_data[0].tolist(), translation_data[0].tolist(), reference_data[0].tolist(), model, model_name, lang_pair)
    all_results = pd.concat([all_results, results], ignore_index=True)

# Save Full Results
all_results.to_csv(f"{output_dir}/COMET_Evaluation_Results_{lang_pair}.tsv", index=False, sep="\t")
log_and_print(f"✅ All results saved to {output_dir}")

# Statistical Analysis for One Language Pair
metrics = all_results.groupby(["Model"]).agg(
    Mean_COMET=("COMET Score", "mean"),
    Std_Dev=("COMET Score", "std"),
    Min_COMET=("COMET Score", "min"),
    Max_COMET=("COMET Score", "max")
).reset_index()

metrics.to_csv(f"{output_dir}/Aggregated_COMET_Statistics_{lang_pair}.tsv", index=False, sep="\t")
log_and_print("✅ Aggregated COMET statistics saved.")

# Visualization and Execution Time Tracking
# Boxplot for Model Performance
plt.figure(figsize=(12, 6))
sns.boxplot(data=all_results, x="Model", y="COMET Score")
plt.title(f"COMET Score Distributions for {lang_pair}")
plt.xlabel("COMET Model")
plt.ylabel("COMET Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(f"{output_dir}/Model_COMET_Boxplot_{lang_pair}.png")
plt.close()

# Execution Time
end_time = time.time()
execution_time = end_time - start_time
hours, rem = divmod(execution_time, 3600)
minutes, seconds = divmod(rem, 60)

with open(f"{output_dir}/Execution_Time_{lang_pair}.txt", "w") as f:
    f.write(f"Total Execution Time for {lang_pair}: {int(hours)}h {int(minutes)}m {int(seconds)}s\n")

log_and_print(f"✅ Execution Time for {lang_pair}: {int(hours)}h {int(minutes)}m {int(seconds)}s")

log_and_print(f"🚀 Processing complete for {lang_pair}.")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Evaluating English→Arabic with wmt20-comet-da...


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.85it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.29it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.39it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU 

✅ Saved partial results for English→Arabic with wmt20-comet-da.


SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 15.51it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.14it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.53it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.39it/s]
GPU available: Tru

✅ Saved partial results for English→Arabic with wmt20-comet-da.


SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.29it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.12it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.34it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.28it/s]
GPU available: Tru

✅ Saved partial results for English→Arabic with wmt20-comet-da.


SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.79it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.42it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.87it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.98it/s]
GPU available: Tru

✅ Saved partial results for English→Arabic with wmt20-comet-da.


SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 20.01it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.40it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.59it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.16it/s]
GPU available: Tru

✅ Saved partial results for English→Arabic with wmt20-comet-da.


SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.50it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.53it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 13.53it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.34it/s]
GPU available: Tru

✅ Saved partial results for English→Arabic with wmt20-comet-da.


SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.74it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.44it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.08it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.87it/s]
GPU available: Tru