In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every CSV path used in this notebook lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


Augment the full dataset with per-sentence Arabic/English token percentages and their binned categories


In [None]:
import pandas as pd
import re

# 1) prepare regexes
# arabic_re: any code point in U+0600–U+06FF; english_re: any ASCII letter.
arabic_re = re.compile(r'[\u0600-\u06FF]')
english_re = re.compile(r'[A-Za-z]')

def token_language(tok):
    """Classify a single token by the script(s) it contains.

    Returns 'arabic' if the token has a character matched by ``arabic_re``
    and none matched by ``english_re``, 'english' in the mirrored case, and
    'other' when it contains both scripts or neither (digits, punctuation,
    the empty string, ...).
    """
    contains_arabic = arabic_re.search(tok) is not None
    contains_latin = english_re.search(tok) is not None
    if contains_arabic == contains_latin:
        # Mixed-script token, or no letters from either script.
        return 'other'
    if contains_arabic:
        return 'arabic'
    return 'english'

def pct_lang(text, lang):
    """
    Fraction of the whitespace-separated tokens in `text` that
    `token_language` labels as `lang`.

    `lang` is compared against the labels `token_language` returns; the
    callers in this notebook pass 'arabic' or 'english'.  Any `text` that is
    not a non-blank string (for example a float NaN) yields 0.0.
    """
    is_usable_text = isinstance(text, str) and text.strip()
    if not is_usable_text:
        return 0.0
    tokens = text.split()
    hits = sum(1 for token in tokens if token_language(token) == lang)
    return hits / len(tokens)

def bin_pct(p):
    """Map a fraction ``p`` to one of three percentage-range labels.

    ``p <= 0.33`` gives '0–33%', ``p <= 0.66`` gives '34–66%', anything else
    (including NaN, for which both comparisons are False) gives '67–100%'.
    Note that the cut-offs are 0.33 / 0.66 rather than exact thirds, so a
    value such as 1/3 is labelled '34–66%'.
    """
    for upper_bound, label in ((0.33, '0–33%'), (0.66, '34–66%')):
        if p <= upper_bound:
            return label
    return '67–100%'

# 2) load
df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset.csv')

# 3) compute raw percentages
# Share of Arabic tokens in the Arabic-matrix sentence and of English tokens
# in the English-matrix sentence.
df['pr_ar'] = [pct_lang(text, 'arabic') for text in df['CodeMixed_Arabic']]
df['pr_en'] = [pct_lang(text, 'english') for text in df['CodeMixed_English']]

# 4) compute discrete categories
# Bucket label prefixed with the language name, e.g. "Arabic 0–33%".
df['cat_ar'] = ['Arabic ' + bin_pct(share) for share in df['pr_ar']]
df['cat_en'] = ['English ' + bin_pct(share) for share in df['pr_en']]

arabic_side = ['CodeMixed_Arabic', 'pr_ar', 'cat_ar']
english_side = ['CodeMixed_English', 'pr_en', 'cat_en']
out_cols = arabic_side + english_side + ['GroundTruth_Arabic', 'GroundTruth_English']

# Persist only the selected columns; every later cell starts from this file.
df.to_csv(
    '/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv',
    columns=out_cols,
    index=False
)

print("Done.  Added pr_ar, pr_en (floats) plus cat_ar, cat_en (6 discrete categories).")

Done.  Added pr_ar, pr_en (floats) plus cat_ar, cat_en (6 discrete categories).


BLEU scores

In [None]:
#mbart50

import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# 1) helper for binning
def bin_pct(p):
    """Return the percentage-range label for a fraction ``p``.

    '0–33%' when ``p <= 0.33``, '34–66%' when ``p <= 0.66``, otherwise
    '67–100%'.  The thresholds are 0.33 / 0.66, so e.g. 1/3 maps to '34–66%'.
    This cell reads the precomputed ``cat_*`` columns from CSV and does not
    call this helper itself.
    """
    return '0–33%' if p <= 0.33 else ('34–66%' if p <= 0.66 else '67–100%')

# NLTK smoothing method 1, passed to every sentence_bleu call in this cell.
smoother = SmoothingFunction().method1

full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
# Despite the variable name, this file holds the mBART-50 outputs.
gpt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/mbart-50_outputs.csv')

# Rows are paired purely by position, so the two files must be the same length.
assert len(full_df) == len(gpt_df)

mbart_predictions = gpt_df[['Full_English_Translation', 'Full_Arabic_Translation']]
df = pd.concat([full_df, mbart_predictions], axis=1)

def compute_bleus(row):
    """Compute smoothed sentence-level BLEU for one row, in both directions.

    Parameters
    ----------
    row : a row of ``df`` exposing ``Full_English_Translation``,
        ``GroundTruth_English``, ``Full_Arabic_Translation`` and
        ``GroundTruth_Arabic``.

    Returns
    -------
    pd.Series with the entries 'English_BLEU' and 'Arabic_BLEU'.

    Texts are tokenized on whitespace.  Missing values are scored as empty
    sentences.
    """
    def _tokens(value):
        # pd.read_csv yields float NaN for empty cells.  NaN is truthy, so the
        # previous `value or ""` left it in place and `.split()` raised
        # AttributeError; pd.isna covers both NaN and None.
        return [] if pd.isna(value) else str(value).split()

    pred_en = _tokens(row.Full_English_Translation)
    gt_en   = _tokens(row.GroundTruth_English)
    pred_ar = _tokens(row.Full_Arabic_Translation)
    gt_ar   = _tokens(row.GroundTruth_Arabic)

    bleu_en = sentence_bleu([gt_en], pred_en, smoothing_function=smoother)
    bleu_ar = sentence_bleu([gt_ar], pred_ar, smoothing_function=smoother)
    return pd.Series({
        'English_BLEU': bleu_en,
        'Arabic_BLEU':   bleu_ar
    })

# Score every row, then append the two BLEU columns to the frame.
bleu_scores = df.apply(compute_bleus, axis=1)
df = pd.concat([df, bleu_scores], axis=1)

# Per-example export: English-side columns first, then the Arabic side.
english_cols = [
    'CodeMixed_English', 'pr_en', 'cat_en',
    'GroundTruth_English', 'Full_English_Translation', 'English_BLEU',
]
arabic_cols = [
    'CodeMixed_Arabic', 'pr_ar', 'cat_ar',
    'GroundTruth_Arabic', 'Full_Arabic_Translation', 'Arabic_BLEU',
]
out_cols = english_cols + arabic_cols
df.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/bleu_per_example.csv',
          columns=out_cols, index=False)

# count / mean / std of BLEU within each code-mixing bucket.
summary_stats = {'count': 'count', 'mean': 'mean', 'std': 'std'}

en_summary = (
    df.groupby('cat_en')['English_BLEU']
      .agg(**summary_stats)
      .reset_index()
      .sort_values('cat_en')
)

ar_summary = (
    df.groupby('cat_ar')['Arabic_BLEU']
      .agg(**summary_stats)
      .reset_index()
      .sort_values('cat_ar')
)

en_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/bleu_summary_en.csv', index=False)
ar_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/bleu_summary_ar.csv', index=False)

print("Done.  • per-example BLEUs -> bleu_per_example.csv  • summaries -> bleu_summary_*.csv")

Done.  • per-example BLEUs -> bleu_per_example.csv  • summaries -> bleu_summary_*.csv


In [None]:
# google
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# NLTK smoothing method 1, passed to every sentence_bleu call in this cell.
smoother = SmoothingFunction().method1

full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
gt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/google_translate_outputs.csv')

# Drop stray whitespace around the header names before selecting columns.
gt_df.columns = gt_df.columns.str.strip()

# Rows are paired by position, so both files must have the same length.
assert len(full_df) == len(gt_df), "make sure row counts match!"

prediction_cols = ['CodeMixed_Arabic_Full_English_Translation',
                   'CodeMixed_English_Full_Arabic_Translation']
df = pd.concat(
    [full_df.reset_index(drop=True), gt_df[prediction_cols].reset_index(drop=True)],
    axis=1,
)

# Normalise every text column: missing -> '', force str, decode the HTML
# entity for an apostrophe.
text_cols = prediction_cols + ['GroundTruth_English', 'GroundTruth_Arabic']
for col in text_cols:
    if col not in df:
        continue
    as_text = df[col].fillna('').astype(str)
    df[col] = as_text.str.replace('&#39;', "'", regex=False)

def compute_bleus(row):
    """Smoothed sentence-level BLEU for one row, in both translation directions.

    Expects the four text columns to already hold strings (the cell cleans
    them beforehand); each text is tokenized on whitespace.  Returns a
    pd.Series with 'English_BLEU' and 'Arabic_BLEU'.
    """
    scores = {}
    for metric, reference_col, hypothesis_col in (
        ('English_BLEU', 'GroundTruth_English', 'CodeMixed_Arabic_Full_English_Translation'),
        ('Arabic_BLEU', 'GroundTruth_Arabic', 'CodeMixed_English_Full_Arabic_Translation'),
    ):
        reference = row[reference_col].split()
        hypothesis = row[hypothesis_col].split()
        scores[metric] = sentence_bleu([reference], hypothesis, smoothing_function=smoother)
    return pd.Series(scores)

# Score every row and append the two BLEU columns.
bleu_scores = df.apply(compute_bleus, axis=1)
df = pd.concat([df, bleu_scores], axis=1)

# Per-example export: English side first, then Arabic side.
out_cols = [
    'CodeMixed_English','pr_en','cat_en',
    'GroundTruth_English','CodeMixed_Arabic_Full_English_Translation','English_BLEU',
    'CodeMixed_Arabic','pr_ar','cat_ar',
    'GroundTruth_Arabic','CodeMixed_English_Full_Arabic_Translation','Arabic_BLEU'
]
df.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/google_bleu.csv',
          columns=out_cols, index=False)

# count / mean / std of BLEU within each code-mixing bucket.
en_summary = df.groupby('cat_en').English_BLEU.agg(
    count='count', mean='mean', std='std'
).reset_index().sort_values('cat_en')

ar_summary = df.groupby('cat_ar').Arabic_BLEU.agg(
    count='count', mean='mean', std='std'
).reset_index().sort_values('cat_ar')

en_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/google_bleu_summary_en.csv', index=False)
ar_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/google_bleu_summary_ar.csv', index=False)

# The message now names the files actually written above (it previously
# pointed at bleu_per_example_google.csv / bleu_summary_*.csv).
print("✅ Done. • Per‐example → google_bleu.csv • Summaries → google_bleu_summary_*.csv")

✅ Done. • Per‐example → bleu_per_example_google.csv • Summaries → bleu_summary_*.csv


In [None]:
# GPT 4.1
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# NLTK smoothing method 1, passed to every sentence_bleu call in this cell.
smoother = SmoothingFunction().method1

full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
gt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/gpt_4.1_outputs.csv')
# Drop stray whitespace around the header names before selecting columns.
gt_df.columns = gt_df.columns.str.strip()

# Rows are paired by position, so both files must have the same length.
assert len(full_df) == len(gt_df), "make sure row counts match!"

prediction_cols = ['CodeMixed_Arabic_Full_English_Translation',
                   'CodeMixed_English_Full_Arabic_Translation']
df = pd.concat(
    [full_df.reset_index(drop=True), gt_df[prediction_cols].reset_index(drop=True)],
    axis=1,
)

# Normalise every text column: missing -> '', force str, decode '&#39;'.
text_cols = prediction_cols + ['GroundTruth_English', 'GroundTruth_Arabic']
for col in text_cols:
    if col not in df:
        continue
    as_text = df[col].fillna('').astype(str)
    df[col] = as_text.str.replace('&#39;', "'", regex=False)

def compute_bleus(row):
    """Smoothed sentence-level BLEU for one row, in both translation directions.

    The four text columns must already be strings (cleaned earlier in this
    cell); tokenization is a plain whitespace split.  Returns a pd.Series
    holding 'English_BLEU' and 'Arabic_BLEU'.
    """
    def _split(column):
        # tokenize on whitespace
        return row[column].split()

    hyp_en, ref_en = _split('CodeMixed_Arabic_Full_English_Translation'), _split('GroundTruth_English')
    hyp_ar, ref_ar = _split('CodeMixed_English_Full_Arabic_Translation'), _split('GroundTruth_Arabic')

    return pd.Series({
        'English_BLEU': sentence_bleu([ref_en], hyp_en, smoothing_function=smoother),
        'Arabic_BLEU':  sentence_bleu([ref_ar], hyp_ar, smoothing_function=smoother),
    })

# Score every row and append the two BLEU columns.
bleu_scores = df.apply(compute_bleus, axis=1)
df = pd.concat([df, bleu_scores], axis=1)

# Per-example export: English side first, then Arabic side.
out_cols = [
    'CodeMixed_English','pr_en','cat_en',
    'GroundTruth_English','CodeMixed_Arabic_Full_English_Translation','English_BLEU',
    'CodeMixed_Arabic','pr_ar','cat_ar',
    'GroundTruth_Arabic','CodeMixed_English_Full_Arabic_Translation','Arabic_BLEU'
]
# note: adjust these names if your column names differ
df.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_bleu.csv',
          columns=out_cols, index=False)

# count / mean / std of BLEU within each code-mixing bucket.
en_summary = df.groupby('cat_en').English_BLEU.agg(
    count='count', mean='mean', std='std'
).reset_index().sort_values('cat_en')

ar_summary = df.groupby('cat_ar').Arabic_BLEU.agg(
    count='count', mean='mean', std='std'
).reset_index().sort_values('cat_ar')

en_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_bleu_summary_en.csv', index=False)
ar_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_bleu_summary_ar.csv', index=False)
# The message now names the GPT-4.1 files written above (it previously
# reported the Google file names).
print("✅ Done. • Per‐example → gpt41_bleu.csv • Summaries → gpt41_bleu_summary_*.csv")

✅ Done. • Per‐example → bleu_per_example_google.csv • Summaries → bleu_summary_*.csv


In [None]:
# phi
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# NLTK smoothing method 1, passed to every sentence_bleu call in this cell.
smoother = SmoothingFunction().method1

full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
# Both names are bound to the same Phi-3.5-mini output frame.
gpt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/microsoft_phi_3.5_mini_outputs.csv')
gt_df = gpt_df

# Drop stray whitespace around the header names before selecting columns.
gt_df.columns = gt_df.columns.str.strip()

# Rows are paired by position, so both files must have the same length.
assert len(full_df) == len(gt_df), "make sure row counts match!"

prediction_cols = ['CodeMixed_Arabic_Full_English_Translation',
                   'CodeMixed_English_Full_Arabic_Translation']
df = pd.concat(
    [full_df.reset_index(drop=True), gt_df[prediction_cols].reset_index(drop=True)],
    axis=1,
)

# Normalise every text column: missing -> '', force str, decode '&#39;'.
text_cols = prediction_cols + ['GroundTruth_English', 'GroundTruth_Arabic']
for col in text_cols:
    if col not in df:
        continue
    as_text = df[col].fillna('').astype(str)
    df[col] = as_text.str.replace('&#39;', "'", regex=False)

def compute_bleus(row):
    """Smoothed sentence-level BLEU for one row, in both translation directions.

    Assumes the four text columns are strings (they are cleaned earlier in
    this cell) and tokenizes them on whitespace.  Returns a pd.Series with
    'English_BLEU' and 'Arabic_BLEU'.
    """
    scores = {}
    for metric, reference_col, hypothesis_col in (
        ('English_BLEU', 'GroundTruth_English', 'CodeMixed_Arabic_Full_English_Translation'),
        ('Arabic_BLEU', 'GroundTruth_Arabic', 'CodeMixed_English_Full_Arabic_Translation'),
    ):
        reference = row[reference_col].split()
        hypothesis = row[hypothesis_col].split()
        scores[metric] = sentence_bleu([reference], hypothesis, smoothing_function=smoother)
    return pd.Series(scores)

# Score every row and append the two BLEU columns.
bleu_scores = df.apply(compute_bleus, axis=1)
df = pd.concat([df, bleu_scores], axis=1)

# Per-example export: English side first, then Arabic side.
out_cols = [
    'CodeMixed_English','pr_en','cat_en',
    'GroundTruth_English','CodeMixed_Arabic_Full_English_Translation','English_BLEU',
    'CodeMixed_Arabic','pr_ar','cat_ar',
    'GroundTruth_Arabic','CodeMixed_English_Full_Arabic_Translation','Arabic_BLEU'
]
# note: adjust these names if your column names differ
df.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/phi_bleu.csv',
          columns=out_cols, index=False)

# count / mean / std of BLEU within each code-mixing bucket.
en_summary = df.groupby('cat_en').English_BLEU.agg(
    count='count', mean='mean', std='std'
).reset_index().sort_values('cat_en')

ar_summary = df.groupby('cat_ar').Arabic_BLEU.agg(
    count='count', mean='mean', std='std'
).reset_index().sort_values('cat_ar')

en_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/phi_bleu_summary_en.csv', index=False)
ar_summary.to_csv('/content/drive/MyDrive/Project/Benchmark_data_results/phi_bleu_summary_ar.csv', index=False)
# The message now names the Phi files written above (it previously reported
# the Google file names).
print("✅ Done. • Per‐example → phi_bleu.csv • Summaries → phi_bleu_summary_*.csv")

✅ Done. • Per‐example → bleu_per_example_google.csv • Summaries → bleu_summary_*.csv


-------------------------------------------------

**BERTScore**

In [None]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [None]:
from bert_score import score as bert_score

import torch

# Encoder checkpoints handed to bert_score via `model_type`.
EN_MODEL = "roberta-large"               # used for the English direction
AR_MODEL = "asafaya/bert-base-arabic"    # used for the Arabic direction
MM_MODEL = "xlm-roberta-large"           # not referenced in the visible cells

# Prefer the GPU, but fall back to CPU so the scoring cells still run on a
# runtime without CUDA instead of failing inside bert_score.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
import pandas as pd


full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
gt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/gpt_4.1_outputs.csv')

# Drop stray whitespace around the header names before selecting columns.
gt_df.columns = gt_df.columns.str.strip()

# Rows are paired by position, so both files must have the same length.
assert len(full_df) == len(gt_df), "make sure row counts match!"

prediction_cols = ['CodeMixed_Arabic_Full_English_Translation',
                   'CodeMixed_English_Full_Arabic_Translation']
df = pd.concat(
    [full_df.reset_index(drop=True), gt_df[prediction_cols].reset_index(drop=True)],
    axis=1,
)


In [None]:
# Replace missing cells with empty strings before handing the texts to bert_score.
text_inputs = df[[
    "CodeMixed_Arabic_Full_English_Translation", "GroundTruth_English",
    "CodeMixed_English_Full_Arabic_Translation", "GroundTruth_Arabic",
]].fillna("")

# English direction: system output vs. reference.
en_preds = text_inputs["CodeMixed_Arabic_Full_English_Translation"].tolist()
en_refs = text_inputs["GroundTruth_English"].tolist()

# Arabic direction: system output vs. reference.
ar_preds = text_inputs["CodeMixed_English_Full_Arabic_Translation"].tolist()
ar_refs = text_inputs["GroundTruth_Arabic"].tolist()

In [None]:
# English BERT-Score
# Scores en_preds against en_refs with EN_MODEL on DEVICE, 32 pairs per batch.
# The three results are unpacked as P_en, R_en, F_en (presumably precision /
# recall / F1 per sentence pair — confirm against the bert_score docs).
P_en, R_en, F_en = bert_score(
    en_preds, en_refs,
    lang="en",
    model_type=EN_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/320 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]

done in 19.46 seconds, 268.55 sentences/sec


In [None]:
# Arabic BERT-Score
# Scores ar_preds against ar_refs with AR_MODEL on DEVICE, 32 pairs per batch.
# num_layers is given explicitly here, unlike the English call — presumably
# because bert_score has no default layer for this checkpoint (TODO confirm).
P_ar, R_ar, F_ar = bert_score(
    ar_preds, ar_refs,
    lang="ar",
    model_type=AR_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True,
    num_layers=12
)

tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/287 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]



done in 10.55 seconds, 495.41 sentences/sec


In [None]:
# bert_score returned torch tensors; copy each one to host memory as a NumPy
# array and store it as a DataFrame column.
score_tensors = {
    "BERT_P_en": P_en, "BERT_R_en": R_en, "BERT_F_en": F_en,
    "BERT_P_ar": P_ar, "BERT_R_ar": R_ar, "BERT_F_ar": F_ar,
}
for column_name, tensor in score_tensors.items():
    df[column_name] = tensor.cpu().numpy()

In [None]:
# Per-example export: English-side columns first, then the Arabic side.
english_cols = [
  "CodeMixed_English", "pr_en", "cat_en",
  "GroundTruth_English", "CodeMixed_Arabic_Full_English_Translation",
  "BERT_P_en", "BERT_R_en", "BERT_F_en",
]
arabic_cols = [
  "CodeMixed_Arabic", "pr_ar", "cat_ar",
  "GroundTruth_Arabic", "CodeMixed_English_Full_Arabic_Translation",
  "BERT_P_ar", "BERT_R_ar", "BERT_F_ar",
]
out_cols = english_cols + arabic_cols

df.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/bertscore_gpt41.csv",
  columns=out_cols,
  index=False
)

In [None]:
# count / mean / std of the BERTScore F value within each code-mixing bucket.
en_summary = df.groupby("cat_en").BERT_F_en.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_en")

ar_summary = df.groupby("cat_ar").BERT_F_ar.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_ar")

en_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_bertscore_summary_en.csv",
  index=False
)
ar_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_bertscore_summary_ar.csv",
  index=False
)

# The per-example file is written by the preceding cell as bertscore_gpt41.csv;
# the message previously called it gpt41_bertscore.csv.
print("✅ Done. Per-example → bertscore_gpt41.csv • Summaries → gpt41_bertscore_summary_*.csv")

✅ Done. Per-example → gpt41_bertscore.csv • Summaries → gpt41_bertscore_summary_*.csv


phi

In [None]:
import pandas as pd

full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
# Both names are bound to the same Phi-3.5-mini output frame.
gpt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/microsoft_phi_3.5_mini_outputs.csv')
gt_df = gpt_df

# Drop stray whitespace around the header names before selecting columns.
gt_df.columns = gt_df.columns.str.strip()

# Rows are paired by position, so both files must have the same length.
assert len(full_df) == len(gt_df), "make sure row counts match!"

prediction_cols = ['CodeMixed_Arabic_Full_English_Translation',
                   'CodeMixed_English_Full_Arabic_Translation']
df = pd.concat(
    [full_df.reset_index(drop=True), gt_df[prediction_cols].reset_index(drop=True)],
    axis=1,
)

In [None]:
# Replace missing cells with empty strings before handing the texts to bert_score.
text_inputs = df[[
    "CodeMixed_Arabic_Full_English_Translation", "GroundTruth_English",
    "CodeMixed_English_Full_Arabic_Translation", "GroundTruth_Arabic",
]].fillna("")

# English direction: system output vs. reference.
en_preds = text_inputs["CodeMixed_Arabic_Full_English_Translation"].tolist()
en_refs = text_inputs["GroundTruth_English"].tolist()

# Arabic direction: system output vs. reference.
ar_preds = text_inputs["CodeMixed_English_Full_Arabic_Translation"].tolist()
ar_refs = text_inputs["GroundTruth_Arabic"].tolist()

In [None]:
# English BERT-Score
# Scores en_preds against en_refs with EN_MODEL on DEVICE, 32 pairs per batch.
# The three results are unpacked as P_en, R_en, F_en (presumably precision /
# recall / F1 per sentence pair — confirm against the bert_score docs).
P_en, R_en, F_en = bert_score(
    en_preds, en_refs,
    lang="en",
    model_type=EN_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/322 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]

done in 19.12 seconds, 273.34 sentences/sec


In [None]:
# Arabic BERT-Score
# Scores ar_preds against ar_refs with AR_MODEL on DEVICE, 32 pairs per batch.
# num_layers is given explicitly here, unlike the English call — presumably
# because bert_score has no default layer for this checkpoint (TODO confirm).
P_ar, R_ar, F_ar = bert_score(
    ar_preds, ar_refs,
    lang="ar",
    model_type=AR_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True,
    num_layers=12
)

calculating scores...
computing bert embedding.


  0%|          | 0/286 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]



done in 8.76 seconds, 596.54 sentences/sec




In [None]:
# these are torch tensors; copy them to host memory as NumPy arrays so they
# can be stored as DataFrame columns
df["BERT_P_en"] = P_en.cpu().numpy()
df["BERT_R_en"] = R_en.cpu().numpy()
df["BERT_F_en"] = F_en.cpu().numpy()

df["BERT_P_ar"] = P_ar.cpu().numpy()
df["BERT_R_ar"] = R_ar.cpu().numpy()
df["BERT_F_ar"] = F_ar.cpu().numpy()

# Per-example export.
out_cols = [
  # English side
  "CodeMixed_English","pr_en","cat_en",
  "GroundTruth_English","CodeMixed_Arabic_Full_English_Translation",
  "BERT_P_en","BERT_R_en","BERT_F_en",
  # Arabic side
  "CodeMixed_Arabic","pr_ar","cat_ar",
  "GroundTruth_Arabic","CodeMixed_English_Full_Arabic_Translation",
  "BERT_P_ar","BERT_R_ar","BERT_F_ar"
]
df.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/phi_bertscore.csv",
  columns=out_cols,
  index=False
)

# count / mean / std of the BERTScore F value within each code-mixing bucket.
en_summary = df.groupby("cat_en").BERT_F_en.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_en")

ar_summary = df.groupby("cat_ar").BERT_F_ar.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_ar")

en_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/phi_bertscore_summary_en.csv",
  index=False
)
ar_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/phi_bertscore_summary_ar.csv",
  index=False
)

# The message now names the Phi files written above (it previously reported
# the GPT-4.1 file names).
print("✅ Done. Per-example → phi_bertscore.csv • Summaries → phi_bertscore_summary_*.csv")

✅ Done. Per-example → gpt41_bertscore.csv • Summaries → gpt41_bertscore_summary_*.csv


google

In [None]:
import pandas as pd

full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
gt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/google_translate_outputs.csv')

# Drop stray whitespace around the header names before selecting columns.
gt_df.columns = gt_df.columns.str.strip()

# Rows are paired by position, so both files must have the same length.
assert len(full_df) == len(gt_df), "make sure row counts match!"

df = pd.concat([
    full_df.reset_index(drop=True),
    gt_df[['CodeMixed_Arabic_Full_English_Translation',
           'CodeMixed_English_Full_Arabic_Translation']].reset_index(drop=True)
], axis=1)


def _as_clean_list(column):
    """Return df[column] as a list of str: missing -> '' and '&#39;' -> "'".

    The Google BLEU cell decodes the '&#39;' entity in these same columns
    before scoring; doing the same here keeps BERTScore and BLEU computed on
    identical text.  `df` itself is left untouched.
    """
    cleaned = df[column].fillna("").astype(str)
    return cleaned.str.replace('&#39;', "'", regex=False).tolist()


en_preds = _as_clean_list("CodeMixed_Arabic_Full_English_Translation")
en_refs  = _as_clean_list("GroundTruth_English")

ar_preds = _as_clean_list("CodeMixed_English_Full_Arabic_Translation")
ar_refs  = _as_clean_list("GroundTruth_Arabic")

In [None]:
# English BERT-Score
# Scores en_preds against en_refs with EN_MODEL on DEVICE, 32 pairs per batch.
# The three results are unpacked as P_en, R_en, F_en (presumably precision /
# recall / F1 per sentence pair — confirm against the bert_score docs).
P_en, R_en, F_en = bert_score(
    en_preds, en_refs,
    lang="en",
    model_type=EN_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/319 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]

done in 22.03 seconds, 237.27 sentences/sec


In [None]:
# Arabic BERT-Score
# Scores ar_preds against ar_refs with AR_MODEL on DEVICE, 32 pairs per batch.
# num_layers is given explicitly here, unlike the English call — presumably
# because bert_score has no default layer for this checkpoint (TODO confirm).
P_ar, R_ar, F_ar = bert_score(
    ar_preds, ar_refs,
    lang="ar",
    model_type=AR_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True,
    num_layers=12
)

calculating scores...
computing bert embedding.


  0%|          | 0/290 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]

done in 10.44 seconds, 500.64 sentences/sec


In [None]:
# these are torch tensors; copy them to host memory as NumPy arrays so they
# can be stored as DataFrame columns
df["BERT_P_en"] = P_en.cpu().numpy()
df["BERT_R_en"] = R_en.cpu().numpy()
df["BERT_F_en"] = F_en.cpu().numpy()

df["BERT_P_ar"] = P_ar.cpu().numpy()
df["BERT_R_ar"] = R_ar.cpu().numpy()
df["BERT_F_ar"] = F_ar.cpu().numpy()

# Per-example export.
out_cols = [
  # English side
  "CodeMixed_English","pr_en","cat_en",
  "GroundTruth_English","CodeMixed_Arabic_Full_English_Translation",
  "BERT_P_en","BERT_R_en","BERT_F_en",
  # Arabic side
  "CodeMixed_Arabic","pr_ar","cat_ar",
  "GroundTruth_Arabic","CodeMixed_English_Full_Arabic_Translation",
  "BERT_P_ar","BERT_R_ar","BERT_F_ar"
]
df.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/google_bertscore.csv",
  columns=out_cols,
  index=False
)

# count / mean / std of the BERTScore F value within each code-mixing bucket.
en_summary = df.groupby("cat_en").BERT_F_en.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_en")

ar_summary = df.groupby("cat_ar").BERT_F_ar.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_ar")

en_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/google_bertscore_summary_en.csv",
  index=False
)
ar_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/google_bertscore_summary_ar.csv",
  index=False
)

# The message now names the Google files written above (it previously
# reported the GPT-4.1 file names).
print("✅ Done. Per-example → google_bertscore.csv • Summaries → google_bertscore_summary_*.csv")

✅ Done. Per-example → gpt41_bertscore.csv • Summaries → gpt41_bertscore_summary_*.csv


mbart

In [None]:
import pandas as pd

full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
# Both names are bound to the same mBART-50 output frame.
gpt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/mbart-50_outputs.csv')
gt_df = gpt_df

# Drop stray whitespace around the header names before selecting columns.
gt_df.columns = gt_df.columns.str.strip()

# Rows are paired by position, so both files must have the same length.
assert len(full_df) == len(gt_df), "make sure row counts match!"

prediction_cols = ['Full_English_Translation', 'Full_Arabic_Translation']
df = pd.concat(
    [full_df.reset_index(drop=True), gt_df[prediction_cols].reset_index(drop=True)],
    axis=1,
)

# Replace missing cells with empty strings before handing the texts to bert_score.
text_inputs = df[[
    "Full_English_Translation", "GroundTruth_English",
    "Full_Arabic_Translation", "GroundTruth_Arabic",
]].fillna("")

en_preds = text_inputs["Full_English_Translation"].tolist()
en_refs = text_inputs["GroundTruth_English"].tolist()

ar_preds = text_inputs["Full_Arabic_Translation"].tolist()
ar_refs = text_inputs["GroundTruth_Arabic"].tolist()

In [None]:
# English BERT-Score
# Scores en_preds against en_refs with EN_MODEL on DEVICE, 32 pairs per batch.
# The three results are unpacked as P_en, R_en, F_en (presumably precision /
# recall / F1 per sentence pair — confirm against the bert_score docs).
P_en, R_en, F_en = bert_score(
    en_preds, en_refs,
    lang="en",
    model_type=EN_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/321 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]

done in 20.39 seconds, 256.40 sentences/sec


In [None]:
# Arabic BERT-Score
# Scores ar_preds against ar_refs with AR_MODEL on DEVICE, 32 pairs per batch.
# num_layers is given explicitly here, unlike the English call — presumably
# because bert_score has no default layer for this checkpoint (TODO confirm).
P_ar, R_ar, F_ar = bert_score(
    ar_preds, ar_refs,
    lang="ar",
    model_type=AR_MODEL,
    device=DEVICE,
    batch_size=32,
    verbose=True,
    num_layers=12
)

calculating scores...
computing bert embedding.


  0%|          | 0/290 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/164 [00:00<?, ?it/s]

done in 10.24 seconds, 510.30 sentences/sec


In [None]:
# these are torch tensors; copy them to host memory as NumPy arrays so they
# can be stored as DataFrame columns
df["BERT_P_en"] = P_en.cpu().numpy()
df["BERT_R_en"] = R_en.cpu().numpy()
df["BERT_F_en"] = F_en.cpu().numpy()

df["BERT_P_ar"] = P_ar.cpu().numpy()
df["BERT_R_ar"] = R_ar.cpu().numpy()
df["BERT_F_ar"] = F_ar.cpu().numpy()

# Per-example export.
out_cols = [
  # English side
  "CodeMixed_English","pr_en","cat_en",
  "GroundTruth_English","Full_English_Translation",
  "BERT_P_en","BERT_R_en","BERT_F_en",
  # Arabic side
  "CodeMixed_Arabic","pr_ar","cat_ar",
  "GroundTruth_Arabic","Full_Arabic_Translation",
  "BERT_P_ar","BERT_R_ar","BERT_F_ar"
]
df.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/mbart_bertscore.csv",
  columns=out_cols,
  index=False
)

# count / mean / std of the BERTScore F value within each code-mixing bucket.
en_summary = df.groupby("cat_en").BERT_F_en.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_en")

ar_summary = df.groupby("cat_ar").BERT_F_ar.agg(
    count="count", mean="mean", std="std"
).reset_index().sort_values("cat_ar")

en_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/mbart_bertscore_summary_en.csv",
  index=False
)
ar_summary.to_csv(
  "/content/drive/MyDrive/Project/Benchmark_data_results/mbart_bertscore_summary_ar.csv",
  index=False
)

# The message now names the mBART files written above (it previously
# reported the GPT-4.1 file names).
print("✅ Done. Per-example → mbart_bertscore.csv • Summaries → mbart_bertscore_summary_*.csv")

✅ Done. Per-example → gpt41_bertscore.csv • Summaries → gpt41_bertscore_summary_*.csv


COSINE SIMILARITY

In [None]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

mBART cosine similarity

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

def bin_pct(p):
    """Return the percentage-range label for a fraction ``p``.

    '0–33%' when ``p <= 0.33``, '34–66%' when ``p <= 0.66``, otherwise
    '67–100%'.  The thresholds are 0.33 / 0.66, so e.g. 1/3 maps to '34–66%'.
    This cell reads the precomputed ``cat_*`` columns from CSV and does not
    call this helper itself.
    """
    if p <= 0.33:
        return '0–33%'
    if p <= 0.66:
        return '34–66%'
    return '67–100%'

# Multilingual sentence encoder used to embed predictions and references.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# NOTE(review): these paths use a different Drive folder than the BLEU /
# BERTScore cells ('/content/drive/MyDrive/Project/...') — confirm which applies.
full_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Full_Dataset_with_pct_and_bins.csv')
# Despite the variable name, this file holds the mBART-50 outputs.
gpt_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/mbart-50_outputs.csv')

# Rows are paired purely by position, so the two files must be the same length.
assert len(full_df) == len(gpt_df)

mbart_predictions = gpt_df[['Full_English_Translation', 'Full_Arabic_Translation']]
df = pd.concat([full_df, mbart_predictions], axis=1)

#compute cosine similarity per row
def compute_cosines(row):
    """Cosine similarity between prediction and reference embeddings for one row.

    Parameters
    ----------
    row : a row of ``df`` exposing ``Full_English_Translation``,
        ``GroundTruth_English``, ``Full_Arabic_Translation`` and
        ``GroundTruth_Arabic``.

    Returns
    -------
    pd.Series with the entries 'English_Cosine' and 'Arabic_Cosine'.

    Missing values are embedded as the empty string.
    """
    def _text(value):
        # pd.read_csv yields float NaN for empty cells.  NaN is truthy, so the
        # previous `value or ""` passed the NaN straight to model.encode;
        # pd.isna covers both NaN and None.
        return "" if pd.isna(value) else str(value)

    pred_en = _text(row.Full_English_Translation)
    gt_en   = _text(row.GroundTruth_English)
    pred_ar = _text(row.Full_Arabic_Translation)
    gt_ar   = _text(row.GroundTruth_Arabic)

    # One encode call per row; the result is indexed in the order passed in.
    embeddings = model.encode([pred_en, gt_en, pred_ar, gt_ar])
    en_sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    ar_sim = cosine_similarity([embeddings[2]], [embeddings[3]])[0][0]
    return pd.Series({
        'English_Cosine': en_sim,
        'Arabic_Cosine':  ar_sim
    })

# Score every row, then append the two cosine columns to the frame.
cosine_scores = df.apply(compute_cosines, axis=1)
df = pd.concat([df, cosine_scores], axis=1)

# Per-example export: English-side columns first, then the Arabic side.
english_cols = [
    'CodeMixed_English', 'pr_en', 'cat_en',
    'GroundTruth_English', 'Full_English_Translation', 'English_Cosine',
]
arabic_cols = [
    'CodeMixed_Arabic', 'pr_ar', 'cat_ar',
    'GroundTruth_Arabic', 'Full_Arabic_Translation', 'Arabic_Cosine',
]
out_cols = english_cols + arabic_cols
df.to_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/cosine_per_example.csv',
          columns=out_cols, index=False)

# count / mean / std of the cosine similarity within each code-mixing bucket.
summary_stats = {'count': 'count', 'mean': 'mean', 'std': 'std'}

en_summary = (
    df.groupby('cat_en')['English_Cosine']
      .agg(**summary_stats)
      .reset_index()
      .sort_values('cat_en')
)

ar_summary = (
    df.groupby('cat_ar')['Arabic_Cosine']
      .agg(**summary_stats)
      .reset_index()
      .sort_values('cat_ar')
)

en_summary.to_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/mbart_cosine_summary_en.csv', index=False)
ar_summary.to_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/mbart_cosine_summary_ar.csv', index=False)

print("Done.  • per-example cosine similarities -> cosine_per_example.csv  • summaries -> mbart_cosine_summary_*.csv")


Done.  • per-example cosine similarities -> cosine_per_example.csv  • summaries -> mbart_cosine_summary_*.csv


Phi cosine similarity

Done.  • per-example cosine similarities -> phi_cosine_per_example.csv  • summaries -> phi_cosine_summary_*.csv


GPT cosine similarity

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

def bin_pct(p):
    """Return the bucket label ('0–33%', '34–66%' or '67–100%') for a fraction p."""
    # Walk the upper bounds in ascending order; the first one p fits under wins.
    for upper_bound, label in ((0.33, '0–33%'), (0.66, '34–66%')):
        if p <= upper_bound:
            return label
    return '67–100%'

# Sentence encoder used by compute_cosines to embed predictions and references.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Dataset with the pr_* / cat_* columns.
full_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Full_Dataset_with_pct_and_bins.csv')

# Translations read from the GPT-4.1 output file; header names are stripped of
# surrounding whitespace before the columns are selected by name.
gt_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/gpt_4.1_outputs.csv')
gt_df = gt_df.rename(columns=str.strip)

assert len(full_df) == len(gt_df), "make sure row counts match!"

# Rows are paired purely by position, so both frames get a fresh 0..n-1 index.
translation_renames = {
    'CodeMixed_Arabic_Full_English_Translation': 'Full_English_Translation',
    'CodeMixed_English_Full_Arabic_Translation': 'Full_Arabic_Translation',
}
translations = (
    gt_df[list(translation_renames)]
    .rename(columns=translation_renames)
    .reset_index(drop=True)
)
df = pd.concat([full_df.reset_index(drop=True), translations], axis=1)

def compute_cosines(row):
    """
    Embed the predicted and reference translations of one row and return the
    cosine similarity of each prediction/reference pair.

    `row` must expose `Full_English_Translation`, `GroundTruth_English`,
    `Full_Arabic_Translation` and `GroundTruth_Arabic` as attributes (as the
    rows passed by `df.apply(..., axis=1)` do). Missing values are encoded as
    the empty string.

    Returns a pd.Series with keys 'English_Cosine' and 'Arabic_Cosine'.
    """
    def _as_text(value):
        # A missing cell is a float NaN, which is truthy, so `value or ""`
        # would pass NaN straight to the encoder; test for missingness instead.
        return "" if pd.isna(value) else str(value)

    pred_en = _as_text(row.Full_English_Translation)
    gt_en   = _as_text(row.GroundTruth_English)
    pred_ar = _as_text(row.Full_Arabic_Translation)
    gt_ar   = _as_text(row.GroundTruth_Arabic)

    # One encode call for all four texts; order fixes the indices used below.
    embeddings = model.encode([pred_en, gt_en, pred_ar, gt_ar])
    en_sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    ar_sim = cosine_similarity([embeddings[2]], [embeddings[3]])[0][0]
    return pd.Series({
        'English_Cosine': en_sim,
        'Arabic_Cosine':  ar_sim
    })

# Score every row with compute_cosines and append the two similarity columns.
cosine_scores = df.apply(compute_cosines, axis='columns')
df = pd.concat([df, cosine_scores], axis='columns')

# Column order of the per-example export.
out_cols = [
    # English side
    'CodeMixed_English', 'pr_en', 'cat_en',
    'GroundTruth_English', 'Full_English_Translation', 'English_Cosine',
    # Arabic side
    'CodeMixed_Arabic', 'pr_ar', 'cat_ar',
    'GroundTruth_Arabic', 'Full_Arabic_Translation', 'Arabic_Cosine',
]
df.to_csv(
    '/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/gpt4.1_cosine_per_example.csv',
    columns=out_cols,
    index=False,
)

# count / mean / std of the English similarity within each cat_en bucket.
en_summary = (
    df.groupby('cat_en')['English_Cosine']
      .agg(count='count', mean='mean', std='std')
      .reset_index()
      .sort_values('cat_en')
)

# count / mean / std of the Arabic similarity within each cat_ar bucket.
ar_summary = (
    df.groupby('cat_ar')['Arabic_Cosine']
      .agg(count='count', mean='mean', std='std')
      .reset_index()
      .sort_values('cat_ar')
)

en_summary.to_csv(
    '/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/gpt4.1_cosine_summary_en.csv',
    index=False,
)
ar_summary.to_csv(
    '/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/gpt4.1_cosine_summary_ar.csv',
    index=False,
)

print("Done.  • per-example cosine similarities -> gpt4.1_cosine_per_example.csv  • summaries -> gpt4.1_cosine_summary_*.csv")


Done.  • per-example cosine similarities -> gpt4.1_cosine_per_example.csv  • summaries -> gpt4.1_cosine_summary_*.csv


Google cosine similarity

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

def bin_pct(p):
    """Return the bucket label for a fraction p: low, middle or high third."""
    # Guard clauses, lowest bucket first; anything not <= 0.66 falls through.
    if p <= 0.33:
        return '0–33%'
    return '34–66%' if p <= 0.66 else '67–100%'

# Sentence encoder used by compute_cosines to embed predictions and references.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Dataset with the pr_* / cat_* columns.
full_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Full_Dataset_with_pct_and_bins.csv')

# Translations read from the Google Translate output file; header names are
# stripped of surrounding whitespace before the columns are selected by name.
gt_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/google_translate_outputs.csv')
gt_df = gt_df.rename(columns=str.strip)

assert len(full_df) == len(gt_df), "make sure row counts match!"

# Rows are paired purely by position, so both frames get a fresh 0..n-1 index.
translation_renames = {
    'CodeMixed_Arabic_Full_English_Translation': 'Full_English_Translation',
    'CodeMixed_English_Full_Arabic_Translation': 'Full_Arabic_Translation',
}
translations = (
    gt_df[list(translation_renames)]
    .rename(columns=translation_renames)
    .reset_index(drop=True)
)
df = pd.concat([full_df.reset_index(drop=True), translations], axis=1)

def compute_cosines(row):
    """
    Embed the predicted and reference translations of one row and return the
    cosine similarity of each prediction/reference pair.

    `row` must expose `Full_English_Translation`, `GroundTruth_English`,
    `Full_Arabic_Translation` and `GroundTruth_Arabic` as attributes (as the
    rows passed by `df.apply(..., axis=1)` do). Missing values are encoded as
    the empty string.

    Returns a pd.Series with keys 'English_Cosine' and 'Arabic_Cosine'.
    """
    def _as_text(value):
        # A missing cell is a float NaN, which is truthy, so `value or ""`
        # would pass NaN straight to the encoder; test for missingness instead.
        return "" if pd.isna(value) else str(value)

    pred_en = _as_text(row.Full_English_Translation)
    gt_en   = _as_text(row.GroundTruth_English)
    pred_ar = _as_text(row.Full_Arabic_Translation)
    gt_ar   = _as_text(row.GroundTruth_Arabic)

    # One encode call for all four texts; order fixes the indices used below.
    embeddings = model.encode([pred_en, gt_en, pred_ar, gt_ar])
    en_sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    ar_sim = cosine_similarity([embeddings[2]], [embeddings[3]])[0][0]
    return pd.Series({
        'English_Cosine': en_sim,
        'Arabic_Cosine':  ar_sim
    })

# Score every row with compute_cosines and append the two similarity columns.
cosine_scores = df.apply(compute_cosines, axis=1)
df = pd.concat([df, cosine_scores], axis=1)

# Column order of the per-example export: English side, then Arabic side.
out_cols = [
    'CodeMixed_English','pr_en','cat_en',
    'GroundTruth_English','Full_English_Translation','English_Cosine',
    'CodeMixed_Arabic','pr_ar','cat_ar',
    'GroundTruth_Arabic','Full_Arabic_Translation','Arabic_Cosine',
]
df.to_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/google_translate_cosine_per_example.csv',
          columns=out_cols, index=False)

# count / mean / std of the English similarity within each cat_en bucket.
en_summary = df.groupby('cat_en').English_Cosine.agg(
    count='count',
    mean='mean',
    std='std'
).reset_index().sort_values('cat_en')

# count / mean / std of the Arabic similarity within each cat_ar bucket.
ar_summary = df.groupby('cat_ar').Arabic_Cosine.agg(
    count='count',
    mean='mean',
    std='std'
).reset_index().sort_values('cat_ar')

en_summary.to_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/google_translate_cosine_summary_en.csv', index=False)
ar_summary.to_csv('/content/drive/MyDrive/Senior Year/Spring Semester (masters 1st semester)/CSCI 5541/Project/Benchmark_data_results/cosine similarity/google_translate_cosine_summary_ar.csv', index=False)

# The message names the files exactly as written above (google_translate_*).
print("Done.  • per-example cosine similarities -> google_translate_cosine_per_example.csv  • summaries -> google_translate_cosine_summary_*.csv")


Done.  • per-example cosine similarities -> google_translate_cosine_per_example.csv  • summaries -> google_transalate_cosine_summary_*.csv


GPT

---------------------------------------------------------------------


ROUGE - this attempt failed because ROUGE is ASCII-dependent, which doesn't help with tokenizing Arabic text




In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=af584c367e84b37d7711f0fad0059251eee7f96250f613620079c87e866c5422
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from rouge_score import rouge_scorer
# Dataset with the pr_* / cat_* columns, and the GPT-4.1 output file.
full_df = pd.read_csv('/content/drive/MyDrive/Project/Full_Dataset_with_pct_and_bins.csv')
gt_df = pd.read_csv('/content/drive/MyDrive/Project/Benchmark_data_results/gpt_4.1_outputs.csv')

# Pair the two frames row-by-row on their index.
df = full_df.merge(gt_df, left_index=True, right_index=True)

# Remove "Unnamed:" columns, then keep only the first of any repeated column name.
unnamed_cols = [name for name in df.columns if name.startswith("Unnamed:")]
df = df.drop(columns=unnamed_cols)
is_repeat = df.columns.duplicated()
df = df.loc[:, ~is_repeat]

# Two scorers over the same ROUGE variants: stemming on for English, off for Arabic.
scorer_en = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
scorer_ar = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=False)

In [None]:
# Show the current column names of the merged frame as a plain list.
print(df.columns.to_list())

['CodeMixed_Arabic', 'pr_ar', 'cat_ar', 'CodeMixed_English', 'pr_en', 'cat_en', 'GroundTruth_Arabic', 'GroundTruth_English', 'CodeMixed_Arabic_Full_English_Translation', 'CodeMixed_English_Full_Arabic_Translation', 'English_ROUGE1', 'English_ROUGE2', 'English_ROUGE_L', 'Arabic_ROUGE1', 'Arabic_ROUGE2', 'Arabic_ROUGE_L', 'English_ROUGE1', 'English_ROUGE2', 'English_ROUGE_L', 'Arabic_ROUGE1', 'Arabic_ROUGE2', 'Arabic_ROUGE_L']


In [None]:
class WhitespaceTokenizer:
    """Tokenizer whose only rule is to split on runs of whitespace."""

    def tokenize(self, text):
        """Return the whitespace-separated pieces of `text` as a list."""
        pieces = text.split()
        return pieces

In [None]:

import pandas as pd
from rouge_score import rouge_scorer

class WhitespaceTokenizer:
    """Tokenizer that breaks text apart on whitespace and does nothing else."""

    def tokenize(self, text):
        """Split `text` on any run of whitespace and return the list of tokens."""
        tokens = text.split()
        return tokens

# Single scorer for both languages: no stemming, whitespace tokenization.
scorer = rouge_scorer.RougeScorer(
    ['rouge1','rouge2','rougeL'],
    tokenizer=WhitespaceTokenizer(),
    use_stemmer=False,
)

# Prediction / reference columns that get scored.
text_cols = [
    'CodeMixed_Arabic_Full_English_Translation',
    'GroundTruth_English',
    'CodeMixed_English_Full_Arabic_Translation',
    'GroundTruth_Arabic'
]

# Replace missing values with '' and force str so every scored cell is a string.
for c in text_cols:
    if c not in df.columns:
        continue
    df[c] = df[c].fillna('').astype(str)

def compute_rouges(row):
    """
    Score one row's English and Arabic translations against their references.

    Each `scorer.score` call receives the reference first and the prediction
    second; the F-measure of rouge1, rouge2 and rougeL is returned for each
    language as a six-entry pd.Series.
    """
    english = scorer.score(
        row['GroundTruth_English'],
        row['CodeMixed_Arabic_Full_English_Translation'],
    )
    arabic = scorer.score(
        row['GroundTruth_Arabic'],
        row['CodeMixed_English_Full_Arabic_Translation'],
    )

    variants = ('rouge1', 'rouge2', 'rougeL')
    values = [english[v].fmeasure for v in variants]
    values += [arabic[v].fmeasure for v in variants]
    labels = [
        'English_ROUGE1', 'English_ROUGE2', 'English_ROUGE_L',
        'Arabic_ROUGE1', 'Arabic_ROUGE2', 'Arabic_ROUGE_L',
    ]
    return pd.Series(values, index=labels)

# Score every row with compute_rouges.
rouge_scores = df.apply(compute_rouges, axis=1)

# Drop ROUGE columns already present in df (e.g. from an earlier run of this
# cell) before appending the fresh ones. Otherwise the de-duplication below,
# which keeps the first occurrence of each name, would retain the old scores
# and discard the ones just computed.
df = pd.concat(
    [df.drop(columns=rouge_scores.columns, errors='ignore'), rouge_scores],
    axis=1
)

# Keep only the first of any remaining repeated column name.
df = df.loc[:, ~df.columns.duplicated()]

# Column order of the per-example export.
out_cols = [
    'CodeMixed_Arabic','pr_ar','cat_ar',
    'CodeMixed_English','pr_en','cat_en',
    'GroundTruth_Arabic','GroundTruth_English',
    'CodeMixed_Arabic_Full_English_Translation',
    'CodeMixed_English_Full_Arabic_Translation',
    'English_ROUGE1','English_ROUGE2','English_ROUGE_L',
    'Arabic_ROUGE1','Arabic_ROUGE2','Arabic_ROUGE_L',
]
df.to_csv(
    '/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_rouge.csv',
    columns=out_cols,
    index=False
)

# Mean / std of each English ROUGE score within each cat_en bucket.
en_summary = df.groupby('cat_en', as_index=False).agg(
    r1_mean=('English_ROUGE1','mean'),
    r1_std =('English_ROUGE1','std'),
    r2_mean=('English_ROUGE2','mean'),
    r2_std =('English_ROUGE2','std'),
    rl_mean=('English_ROUGE_L','mean'),
    rl_std =('English_ROUGE_L','std'),
)
# Mean / std of each Arabic ROUGE score within each cat_ar bucket.
ar_summary = df.groupby('cat_ar', as_index=False).agg(
    r1_mean=('Arabic_ROUGE1','mean'),
    r1_std =('Arabic_ROUGE1','std'),
    r2_mean=('Arabic_ROUGE2','mean'),
    r2_std =('Arabic_ROUGE2','std'),
    rl_mean=('Arabic_ROUGE_L','mean'),
    rl_std =('Arabic_ROUGE_L','std'),
)

en_summary.to_csv(
    '/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_rouge_summary_en.csv',
    index=False
)
ar_summary.to_csv(
    '/content/drive/MyDrive/Project/Benchmark_data_results/gpt41_rouge_summary_ar.csv',
    index=False
)

print("✅ Done. Per-example → gpt41_rouge.csv • Summaries → gpt41_rouge_summary_*.csv")

✅ Done. Per-example → gpt41_rouge.csv • Summaries → gpt41_rouge_summary_*.csv


In [None]:
# Character-length summary of the Arabic reference, then of the Arabic prediction.
for length_col in ('GroundTruth_Arabic', 'CodeMixed_English_Full_Arabic_Translation'):
    print(df[length_col].str.len().describe())

count    5227.000000
mean       57.769084
std        26.446642
min        25.000000
25%        40.000000
50%        50.000000
75%        67.000000
max       268.000000
Name: GroundTruth_Arabic, dtype: float64
count    5227.000000
mean       57.347427
std        24.866775
min         0.000000
25%        41.000000
50%        51.000000
75%        67.000000
max       230.000000
Name: CodeMixed_English_Full_Arabic_Translation, dtype: float64


In [None]:
# Side-by-side look at the first rows of the Arabic reference and prediction.
preview_cols = ['GroundTruth_Arabic', 'CodeMixed_English_Full_Arabic_Translation']
display(df[preview_cols].head())

Unnamed: 0,GroundTruth_Arabic,CodeMixed_English_Full_Arabic_Translation
0,هل شيراز هي اللي طلعت كل الأشباح والمسوخ دي في...,هل شيراز هي اللي جابت كل الأشباح والمسوخ دي في...
1,هل ده معناه إن شيراز حتفضل ورانا واحد واحد؟,هل ده معناه إن شيراز هتفضل وراينا واحد واحد؟
2,الأهم من كل ده، إيه اللي هيوقفها؟,وأهم سؤال في الموضوع هو: إزاي ممكن نوقفها؟
3,بسبب توصية حضرتك البحث بتاعي اتقبل هنا في الكلية,بسبب توصيتك، الجامعة قبلت البحث بتاعي.
4,يا دكتور هو أنا يعني لاحق أذاكر الواقع لما كما...,يا دكتور أنا بالعافية عندي وقت أذاكر الواقع وك...
