In [1]:
import sys
import os
import pandas as pd
from tqdm import tqdm
import time
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from mta_metrics import bleu_sim, meteor_sim, chrf_sim, bertscore_sim, comet_sim, \
    bert_contained_in, chrp_contained_in
from mta_metrics.alignment import awesome_align, usw_nm
from translate import gpt, google, opus
# Register tqdm's pandas integration so that `.progress_apply` is available
# on the Series/DataFrame objects used in later cells.
tqdm.pandas()
import torch
# NOTE(review): the device object created here is not assigned or passed to
# anything, so this statement has no lasting effect — confirm whether it was
# meant to be stored and handed to the models.
torch.device('mps')
# NOTE(review): this clears the CUDA cache, while the line above names the
# 'mps' backend — confirm which accelerator is actually intended.
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the base table and show its column names as the cell output.
base_csv_path = '/Users/chany/research/mt-ambiguity/4_evaluation/data/base_1idiomPerRow.csv'
df = pd.read_csv(base_csv_path)
df.columns

Index(['idiom', 'meaning', 's_f_brkt', 's_l_brkt', 's_f', 's_l', 's_a'], dtype='object')

In [4]:
# Target languages: code -> English language name, in processing order.
# The codes are used in column names; the names are passed to gpt_wrapper.
langs = dict([
    ('de', 'German'),
    ('he', 'Hebrew'),
    ('hi', 'Hindi'),
    ('hu', 'Hungarian'),
    ('ko', 'Korean'),
    ('es', 'Spanish'),
    ('yo', 'Yoruba'),
    ('zh', 'Chinese'),
])

# Translate

### GPT-4

In [5]:
def gpt_wrapper(x, lang, time_passed=1):
    """Call ``gpt(x, lang)``, retrying with exponential backoff until it succeeds.

    Args:
        x: Text passed through to ``gpt``.
        lang: Target language passed through to ``gpt``.
        time_passed: Seconds to sleep before the first attempt; the delay
            doubles after every failed attempt.

    Returns:
        Whatever ``gpt(x, lang)`` returns on the first successful attempt.

    Only ``Exception`` subclasses trigger a retry, so ``KeyboardInterrupt`` and
    ``SystemExit`` propagate and a long-running cell can still be interrupted.
    The retry is a loop rather than recursion, so a long outage cannot hit the
    interpreter's recursion limit.
    """
    delay = time_passed
    while True:
        time.sleep(delay)
        try:
            return gpt(x, lang)
        except Exception:
            # Only report once the backoff has grown noticeably.
            if delay >= 4:
                print("backoff: ", delay)
            delay *= 2

In [7]:
# Single example call of gpt_wrapper; the returned translation is shown as the cell output.
gpt_wrapper("I love you", "Korean")

'나는 너를 사랑해.'

In [6]:
# Translate the s_a / s_f / s_l source columns into every target language with
# GPT, writing a checkpoint of the whole frame after each finished column.
ckpt_dir = "./ckpt/gpt"
# Create the checkpoint directory up front: otherwise the first to_csv fails
# only after a full column of API calls has already been made.
os.makedirs(ckpt_dir, exist_ok=True)

i = 0  # running checkpoint number, one per (language, source column) pair
for lang in langs:
    for afl in ['a', 'f', 'l']:
        df[f'p_{afl}_{lang}_gpt'] = df[f"s_{afl}"].progress_apply(
            lambda x: gpt_wrapper(x, langs[lang]))
        df.to_csv(f"{ckpt_dir}/ckpt_{i}.csv", index=False)
        i += 1

  0%|          | 0/512 [00:00<?, ?it/s]

  1%|          | 4/512 [00:04<08:50,  1.04s/it]


KeyboardInterrupt: 

In [None]:
# Write without the row index so that reading the file back does not produce a
# spurious "Unnamed: 0" column.
df.to_csv("translation_gpt.csv", index=False)

**Merge**

In [None]:
# Copy each GPT translation column from its per-column file under ckpt/ into df.
# Paths are joined explicitly instead of using os.chdir, so a failed read
# cannot leave the kernel in the wrong working directory.
for lang in langs:
    for label in ['a', 'f', 'l']:
        col_name = f'p_{label}_{lang}_gpt'
        ckpt_path = os.path.join("ckpt", f'{label}_{lang}.csv')
        df[col_name] = pd.read_csv(ckpt_path)[col_name]

In [None]:
# Save the frame to translation_gpt.csv, omitting the row index.
df.to_csv("translation_gpt.csv", index=False)

### Google

In [None]:
def google_wrapper(x, lang, retry_delay=5):
    """Call ``google(x, lang)``, retrying once after a pause if it fails.

    Args:
        x: Input passed through to ``google``.
        lang: Target language passed through to ``google``.
        retry_delay: Seconds to wait before the single retry (default 5).

    Returns:
        Whatever ``google(x, lang)`` returns.

    Raises:
        Any exception raised by the second attempt. ``KeyboardInterrupt`` and
        ``SystemExit`` are never retried, so the cell stays interruptible.
    """
    try:
        return google(x, lang)
    except Exception:
        time.sleep(retry_delay)
        return google(x, lang)

In [None]:
# For each target language, pass the s_f, s_l and s_a columns (in that order)
# through google_wrapper and store the results as p_<label>_<lang>_google.
for lang in langs:
    print(f"Begin Google Translation: {lang}")
    for label in ('f', 'l', 'a'):
        df[f'p_{label}_{lang}_google'] = google_wrapper(df[f's_{label}'], lang)
    print()

### Opus MT

In [None]:
# Translate the s_a / s_f / s_l columns with Opus MT for every target language.
for lang in langs:
    if lang == 'ko':
        # Korean gets empty placeholder columns and is skipped. Previously the
        # code fell through and attempted the translation anyway.
        # NOTE(review): presumably no Opus MT model is available for Korean — confirm.
        df['p_a_ko_opus'] = ''
        df['p_f_ko_opus'] = ''
        df['p_l_ko_opus'] = ''
        continue
    try:
        print(f"Begin Opus MT: {lang}")
        df[f'p_a_{lang}_opus'] = opus(df['s_a'].tolist(), lang)
        df[f'p_f_{lang}_opus'] = opus(df['s_f'].tolist(), lang)
        df[f'p_l_{lang}_opus'] = opus(df['s_l'].tolist(), lang)
        print()
    except Exception as exc:
        # Best effort: report the cause and move on to the next language.
        # KeyboardInterrupt is not caught, so the cell can still be stopped.
        print(f"Exception thrown at lang {lang}: {exc!r}")
        continue

In [None]:
# Save the frame to translation_opus.csv, omitting the row index.
df.to_csv("translation_opus.csv", index=False)

### NLLB

In [None]:
# Load the NLLB outputs. Each language file is read as three consecutive blocks
# of len(df) lines, taken in the order f, l, a, and stored as
# p_<label>_<lang>_nllb.
nllb_files = {
    'de': 'deu_Latn.txt',
    'he': 'heb_Hebr.txt',
    'hi': 'hin_Deva.txt',
    'hu': 'hun_Latn.txt',
    'ko': 'kor_Hang.txt',
    'es': 'spa_Latn.txt',
    'yo': 'yor_Latn.txt',
    'zh': 'zho_Hans.txt',
}
block_labels = ['f', 'l', 'a']
n_rows = len(df)  # one output line per row of df for each label block

# Read every file once, with explicit paths (no os.chdir, so an error cannot
# leave the kernel in the wrong directory) and an explicit UTF-8 encoding so
# the non-Latin text does not depend on the platform default.
nllb_lines = {}
for lang, filename in nllb_files.items():
    with open(os.path.join("nllb", filename), 'r', encoding='utf-8') as file:
        nllb_lines[lang] = [line.strip() for line in file]
    if len(nllb_lines[lang]) < n_rows * len(block_labels):
        raise ValueError(
            f"{filename} has {len(nllb_lines[lang])} lines, "
            f"expected at least {n_rows * len(block_labels)}")

for block_no, label in enumerate(block_labels):
    idx = block_no * n_rows
    for lang, lines in nllb_lines.items():
        df[f'p_{label}_{lang}_nllb'] = lines[idx : idx + n_rows]

In [None]:
# Save the frame to translation_nllb.csv, omitting the row index.
df.to_csv("translation_nllb.csv", index=False)

### LLaMA

In [None]:
# Load the LLaMA outputs. Each language file is read as three consecutive
# blocks of len(df) lines, taken in the order f, l, a, and stored as
# p_<label>_<lang>_llama.
llama_files = {
    'de': 'german.txt',
    'he': 'hebrew.txt',
    'hi': 'hindi.txt',
    'hu': 'hungarian.txt',
    'ko': 'korean.txt',
    'es': 'spanish.txt',
    'yo': 'yoruba.txt',
    'zh': 'chinese.txt',
}
block_labels = ['f', 'l', 'a']
n_rows = len(df)  # one output line per row of df for each label block

# Read every file once, with explicit paths (no os.chdir, so an error cannot
# leave the kernel in the wrong directory) and an explicit UTF-8 encoding so
# the non-Latin text does not depend on the platform default.
llama_lines = {}
for lang, filename in llama_files.items():
    with open(os.path.join("llama", filename), 'r', encoding='utf-8') as file:
        llama_lines[lang] = [line.strip() for line in file]
    if len(llama_lines[lang]) < n_rows * len(block_labels):
        raise ValueError(
            f"{filename} has {len(llama_lines[lang])} lines, "
            f"expected at least {n_rows * len(block_labels)}")

for block_no, label in enumerate(block_labels):
    idx = block_no * n_rows
    for lang, lines in llama_lines.items():
        df[f'p_{label}_{lang}_llama'] = lines[idx : idx + n_rows]

In [None]:
# Save the frame to translation_llama.csv, omitting the row index.
df.to_csv("translation_llama.csv", index=False)

### Combining

In [2]:
# Put the per-system translation tables side by side, keep only the first
# occurrence of any repeated column name, and save the combined table.
translation_files = [
    "../data/translation_gpt.csv",
    "../data/translation_palm.csv",
    "../data/translation_google.csv",
    "../data/translation_opus.csv",
    "../data/translation_nllb.csv",
]
frames = [pd.read_csv(path) for path in translation_files]
df = pd.concat(frames, axis=1)
first_occurrence = ~df.columns.duplicated(keep='first')
df = df.loc[:, first_occurrence]
df.to_csv("translation.csv", index=False)

# Evaluation

In [4]:
# Remove the leftover index column if one is present, then rewrite
# translation.csv. errors="ignore" keeps this cell from raising KeyError when
# the inputs carry no "Unnamed: 0" column.
df.drop(columns="Unnamed: 0", errors="ignore").to_csv("translation.csv", index=False)

In [14]:
# Reload the combined table. With keep_default_na=False and an empty na_values
# list, pandas parses no strings as NaN, so empty cells come back as '' — the
# value that the later `section == ''` filter looks for.
df = pd.read_csv("translation.csv", keep_default_na=False, na_values=[])

## RQ1

In [15]:
import spacy
# Pipelines already built by tok(), keyed by language code.
_TOK_PIPELINES = {}


def _build_tok_pipeline(lang):
    """Create the spaCy pipeline that tok() uses for language code ``lang``."""
    if lang == 'zh':
        from spacy.lang.zh import Chinese
        cfg = {"segmenter": "jieba"}
        return Chinese.from_config({"nlp": {"tokenizer": cfg}})
    if lang == 'ko':
        return spacy.load("ko_core_news_sm")
    if lang == 'de':
        return spacy.load("de_core_news_sm")
    if lang == 'es':
        return spacy.load("es_core_news_sm")
    if lang == 'en':
        return spacy.load("en_core_web_sm")
    # No dedicated model is configured for the remaining languages.
    return spacy.blank(lang)


def tok(x, lang):
    """Tokenise ``x`` with the spaCy pipeline for ``lang``; return space-joined tokens.

    Args:
        x: Text to tokenise.
        lang: Language code; 'zh', 'ko', 'de', 'es' and 'en' have dedicated
            pipelines, anything else uses ``spacy.blank(lang)``.

    Returns:
        The token texts joined with single spaces.

    The pipeline for each language is built on first use and then reused.
    tok() is applied row by row, so rebuilding it on every call would reload
    the model once per row.
    """
    nlp = _TOK_PIPELINES.get(lang)
    if nlp is None:
        nlp = _TOK_PIPELINES[lang] = _build_tok_pipeline(lang)
    return ' '.join([token.text for token in nlp(x)])

In [16]:
# Empty accumulators for the two RQ1 score lists, plus the systems to evaluate.
usw_result = []
nm_result = []
models = [
    'gpt',
    'palm',
    'google',
    'opus',
    'nllb',
]

In [17]:
from string import punctuation
def sstrip(x):
    """Trim ``x`` at both ends: first spaces/newlines/tabs, then ASCII punctuation.

    The two passes run in that fixed order, so whitespace uncovered by the
    punctuation pass is left in place.
    """
    return x.strip(" \n\t").strip(punctuation)

In [19]:
# Tokenise the s_a source column: strip surrounding whitespace/punctuation with
# sstrip, then tokenise the stripped text. Previously the second statement
# re-read df['s_a'], which discarded the sstrip result.
df['s_a_tok'] = df['s_a'].apply(sstrip).apply(lambda x: tok(x, 'en'))

# Tokenise every system's translation of s_a in its own target language.
for model in models:
    for lang in langs:
        print(model, lang)
        df[f'p_a_{lang}_{model}_tok'] = df[f'p_a_{lang}_{model}'].apply(lambda x: tok(x, lang))

gpt de


In [None]:
import os, sys

class HiddenPrints:
    """Context manager that sends ``sys.stdout`` to ``os.devnull`` inside the block.

    On exit the stream that was active on entry is restored and the devnull
    handle opened by this object is closed. Exceptions raised inside the block
    are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own handle so __exit__ closes exactly this file, even if
        # code inside the block replaced sys.stdout with something else.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self._original_stdout
        self._devnull.close()

In [None]:
# Compute the USW and NM scores for every (system, target language) pair from
# the tokenised s_a source and its tokenised translation.
usw_result, nm_result = [], []
models = ['gpt', 'palm', 'google', 'opus', 'nllb']

for model in models:
    for lang in langs:
        language_pair = f'en-{lang}'
        try:
            if model == 'opus' and lang == 'ko':
                # No Opus output for Korean: record a score of 0 for both metrics.
                usw_result.append([model, language_pair, 0])
                nm_result.append([model, language_pair, 0])
                continue

            # source and prediction
            src_col, pred_col = 's_a_tok', f'p_a_{lang}_{model}_tok'
            section = df[[src_col, pred_col]].copy()

            # remove rows with empty string
            section = section[~(section == '').any(axis=1)].reset_index(drop=True)
            print(model, lang, len(section))

            # Silence whatever the aligner and metric print while they run.
            with HiddenPrints():
                alignments = awesome_align(section[src_col], section[pred_col])
                usw, nm = usw_nm(section[src_col], alignments, section[pred_col])
            usw_result.append([model, language_pair, usw])
            nm_result.append([model, language_pair, nm])

        except Exception as exc:
            # Best effort: keep going with the next pair, but report the cause.
            # KeyboardInterrupt is not caught, so the cell can still be stopped.
            print("ERROR! ERROR!")
            print(model, lang, repr(exc))

In [None]:
# Write each RQ1 score list to its own CSV with model / language_pair / score columns.
result_columns = ["model", "language_pair", "score"]
for rows, out_path in ((usw_result, "usw_result.csv"), (nm_result, "nm_result.csv")):
    pd.DataFrame(rows, columns=result_columns).to_csv(out_path, index=False)

## Contained In

In [None]:
# For every (system, language) pair, score the p_a translations against the
# p_f and the p_l translations with chrp_contained_in, then save both tables.
score_columns = ["model", "language_pair", "score"]
uni_fig_chrp = []
uni_lit_chrp = []
for model in models:
    for lang in langs:
        language_pair = f'en-{lang}'
        p_a, p_f, p_l = (df[f'p_{kind}_{lang}_{model}'] for kind in ('a', 'f', 'l'))
        fig_row = [model, language_pair, chrp_contained_in(p_a, p_f)]
        uni_fig_chrp.append(fig_row)
        lit_row = [model, language_pair, chrp_contained_in(p_a, p_l)]
        uni_lit_chrp.append(lit_row)
pd.DataFrame(uni_fig_chrp, columns=score_columns).to_csv("uni_fig_chrp", index=False)
pd.DataFrame(uni_lit_chrp, columns=score_columns).to_csv("uni_lit_chrp", index=False)

In [None]:
# For every (system, language) pair, score the p_a translations against the
# p_f and the p_l translations with bert_contained_in, then save both tables.
score_columns = ["model", "language_pair", "score"]
uni_fig_bert = []
uni_lit_bert = []
for model in models:
    for lang in langs:
        language_pair = f'en-{lang}'
        p_a, p_f, p_l = (df[f'p_{kind}_{lang}_{model}'] for kind in ('a', 'f', 'l'))
        fig_row = [model, language_pair, bert_contained_in(p_a, p_f)]
        uni_fig_bert.append(fig_row)
        lit_row = [model, language_pair, bert_contained_in(p_a, p_l)]
        uni_lit_bert.append(lit_row)
pd.DataFrame(uni_fig_bert, columns=score_columns).to_csv("uni_fig_bert", index=False)
pd.DataFrame(uni_lit_bert, columns=score_columns).to_csv("uni_lit_bert", index=False)

In [None]:
# For every (system, language) pair, score the p_a translations against the
# p_f and the p_l translations with chrf_sim, then save both tables.
score_columns = ["model", "language_pair", "score"]
uni_fig_chrf = []
uni_lit_chrf = []
for model in models:
    for lang in langs:
        language_pair = f'en-{lang}'
        p_a, p_f, p_l = (df[f'p_{kind}_{lang}_{model}'] for kind in ('a', 'f', 'l'))
        fig_row = [model, language_pair, chrf_sim(p_a, p_f)]
        uni_fig_chrf.append(fig_row)
        lit_row = [model, language_pair, chrf_sim(p_a, p_l)]
        uni_lit_chrf.append(lit_row)

pd.DataFrame(uni_fig_chrf, columns=score_columns).to_csv("uni_fig_chrf", index=False)
pd.DataFrame(uni_lit_chrf, columns=score_columns).to_csv("uni_lit_chrf", index=False)