In [8]:
import sys
import os
import pandas as pd
from tqdm import tqdm
import time
# Add the parent of sys.path[0] to the import path before the project imports
# below (presumably where `mta_metrics` and `translate` live — confirm).
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from mta_metrics import bert_sim, \
    bert_contained_in, chrp_contained_in
from mta_metrics.alignment import awesome_align, usw_nm
from translate import gpt, google, opus
# Register tqdm with pandas; this is what makes `progress_apply` available.
tqdm.pandas()

In [9]:
# Load the base table and show its column names.
# NOTE(review): this is an absolute, machine-specific path; it will not
# resolve on another machine.
df = pd.read_csv(
    filepath_or_buffer='/Users/chany/research/mt-ambiguity/4_evaluation/data/base_1idiomPerRow.csv',
)
df.columns

Index(['idiom', 'meaning', 's_f_brkt', 's_l_brkt', 's_f', 's_l', 's_a'], dtype='object')

In [10]:
# Target languages: short code -> English language name.
# Loops over `langs` process the languages in the order listed here.
langs = dict(
    de='German',
    he='Hebrew',
    hi='Hindi',
    hu='Hungarian',
    ko='Korean',
    sp='Spanish',
    yo='Yoruba',
    zh='Chinese',
)

# Translate

### GPT-4

In [None]:
def gpt_wrapper(x, lang, time_passed=1, max_retries=None):
    """Call ``gpt(x, lang)``, retrying with exponential backoff on failure.

    Sleeps ``time_passed`` seconds before every attempt (including the first)
    and doubles the wait after each failed attempt.

    Args:
        x: Input forwarded to ``gpt`` unchanged.
        lang: Target language forwarded to ``gpt`` unchanged.
        time_passed: Seconds to sleep before the first attempt.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) keeps retrying indefinitely.

    Returns:
        Whatever ``gpt(x, lang)`` returns on the first successful attempt.

    Raises:
        Exception: The last error raised by ``gpt`` once ``max_retries`` is
            exhausted. ``KeyboardInterrupt`` / ``SystemExit`` always propagate
            immediately.
    """
    delay = time_passed
    retries = 0
    # Iterate instead of recursing so a long outage cannot hit the recursion limit.
    while True:
        time.sleep(delay)
        try:
            return gpt(x, lang)
        # Catch Exception only: a bare ``except`` would also swallow
        # KeyboardInterrupt, making the retry loop impossible to interrupt.
        except Exception:
            if max_retries is not None and retries >= max_retries:
                raise
            if delay >= 8:
                print("backoff: ", delay)
            retries += 1
            delay *= 2


# Slice boundaries: each consecutive pair delimits one chunk of 128 rows.
indices = [0, 128, 256, 384, 512]

In [None]:
# Settings for the GPT cells: `lang` is the key looked up in `langs`, and
# `afl` selects which source column `s_{afl}` gets translated.
lang, afl = 'hi', 'f'

In [None]:
# Translate chunk 0: rows indices[0]..indices[1] of the selected source column.
subset = 0
s_0 = (
    df[f"s_{afl}"][slice(*indices[subset:subset + 2])]
    .progress_apply(lambda sentence: gpt_wrapper(sentence, langs[lang]))
)
s_0

In [None]:
# Translate chunk 1: rows indices[1]..indices[2] of the selected source column.
subset = 1
s_1 = (
    df[f"s_{afl}"][slice(*indices[subset:subset + 2])]
    .progress_apply(lambda sentence: gpt_wrapper(sentence, langs[lang]))
)
s_1

In [None]:
# Translate chunk 2: rows indices[2]..indices[3] of the selected source column.
subset = 2
s_2 = (
    df[f"s_{afl}"][slice(*indices[subset:subset + 2])]
    .progress_apply(lambda sentence: gpt_wrapper(sentence, langs[lang]))
)
s_2

In [None]:
# Translate chunk 3: rows indices[3]..indices[4] of the selected source column.
subset = 3
s_3 = (
    df[f"s_{afl}"][slice(*indices[subset:subset + 2])]
    .progress_apply(lambda sentence: gpt_wrapper(sentence, langs[lang]))
)
s_3

In [None]:
# Stitch the four translated chunks back together with a fresh 0..n-1 index
# and store them as the GPT translation column for this source/language pair.
df[f'p_{afl}_{lang}_gpt'] = pd.concat(
    (s_0, s_1, s_2, s_3),
    ignore_index=True,
)
df.head()

In [None]:
# Checkpoint the frame after the GPT pass. Create `ckpt/` first so the write
# does not fail with a missing-directory error after the translations ran.
os.makedirs('ckpt', exist_ok=True)
df.to_csv(f'ckpt/{afl}_{lang}.csv', index=False)

### Google

In [None]:
def google_wrapper(x, lang, retries=1, delay=5):
    """Call ``google(x, lang)``, retrying after a pause if it raises.

    Args:
        x: Input forwarded to ``google`` unchanged.
        lang: Target language forwarded to ``google`` unchanged.
        retries: Number of additional attempts after the first failure.
        delay: Seconds to sleep before each retry.

    Returns:
        Whatever ``google(x, lang)`` returns on the first successful attempt.

    Raises:
        Exception: The error from the final attempt if every attempt fails.
            ``KeyboardInterrupt`` / ``SystemExit`` propagate immediately.
    """
    for attempt in range(retries + 1):
        try:
            return google(x, lang)
        # Catch Exception only, so Ctrl-C is not mistaken for a transient failure.
        except Exception:
            if attempt >= retries:
                raise
            time.sleep(delay)

In [None]:
# For every language code, translate the three source columns (f, l, a) with
# Google and store each result as `p_{variant}_{lang}_google`.
# Note: this loop rebinds the notebook-level `lang` variable.
for lang in langs:
    print(f"Begin Google Translation: {lang}")
    for variant in ('f', 'l', 'a'):
        df[f'p_{variant}_{lang}_google'] = google_wrapper(df[f's_{variant}'], lang)
    print()

### Opus MT

In [17]:
# Peek at the `s_l` sentence in the row labelled 0.
df.loc[0, 's_l']

'During the anatomy class, we studied his Achilles heel and its function'

In [24]:
# Smoke-test Opus MT on the `s_l` sentences of rows 0 and 1, targeting 'hu'.
opus(df.loc[[0, 1], 's_l'].tolist(), lang='hu')

/Users/chany/research/mt-ambiguity/4_evaluation/translate/opus/hu


['Az anatómia órán tanulmányoztuk az Achilles-sarkát és annak működését.',
 'A jelmezbálon megtalálták a Mr Right és Ms Right ruháikat a boltban.']

In [12]:
# For every language code except 'ko', translate the three source columns
# (a, f, l) with Opus MT and store each result as `p_{variant}_{lang}_opus`.
# Note: this loop rebinds the notebook-level `lang` variable.
for lang in langs:
    if lang != 'ko':
        print(f"Begin Opus MT: {lang}")
        for variant in ('a', 'f', 'l'):
            df[f'p_{variant}_{lang}_opus'] = opus(df[f's_{variant}'].tolist(), lang)
        print()

Begin Opus MT: de
/Users/chany/research/mt-ambiguity/4_evaluation/translate/opus/de


[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

In [None]:
# Persist the frame, including the Opus MT columns, without the index column.
df.to_csv(path_or_buf="translation_opus.csv", index=False)

## Sample Translation: Spanish

## USW, NM

In [None]:
# Run `awesome_align` on each source/translation column pair, then pass the
# source, alignments and translation to `usw_nm`.
# NOTE(review): no visible cell creates the `p_a`, `p_f`, `p_l` columns; the
# translation cells write `p_{variant}_{lang}_{system}` columns instead.
# Confirm which translations these are meant to be before a fresh-kernel run.
alignments_a = awesome_align(df['s_a'], df['p_a'])
alignments_f = awesome_align(df['s_f'], df['p_f'])
alignments_l = awesome_align(df['s_l'], df['p_l'])
# `usw_nm` returns four values: the first two are stored as columns, the last
# two are kept as standalone variables (presumably aggregate scores — confirm).
df['usw_a'], df['nm_a'], usw_a, nm_a = usw_nm(df['s_a'], alignments_a, df['p_a'])
df['usw_f'], df['nm_f'], usw_f, nm_f = usw_nm(df['s_f'], alignments_f, df['p_f'])
df['usw_l'], df['nm_l'], usw_l, nm_l = usw_nm(df['s_l'], alignments_l, df['p_l'])

In [None]:
# Print the standalone USW values on one line and the NM values on the next,
# each in a / f / l order.
for scores in ((usw_a, usw_f, usw_l), (nm_a, nm_f, nm_l)):
    print(*scores)

## Contained In

In [None]:
# contained_in scores of `p_a` against `p_f` and against `p_l`, once with the
# BERT-based metric and once with the chrP-based metric.
# The BERT metric is called as `bert_contained_in`, the name imported from
# `mta_metrics` at the top of the notebook.
df['contained_in(p_a,p_f)_BERT'] = bert_contained_in(df['p_a'], df['p_f'], 'sp')
df['contained_in(p_a,p_l)_BERT'] = bert_contained_in(df['p_a'], df['p_l'], 'sp')
df['contained_in(p_a,p_f)_chrP'] = chrp_contained_in(df['p_a'], df['p_f'])
df['contained_in(p_a,p_l)_chrP'] = chrp_contained_in(df['p_a'], df['p_l'])

# Sensitivity = absolute gap between the p_l-based and p_f-based scores.
df['sensitivity_BERT'] = (
    df['contained_in(p_a,p_l)_BERT'] - df['contained_in(p_a,p_f)_BERT']
).abs()
df['sensitivity_chrP'] = (
    df['contained_in(p_a,p_l)_chrP'] - df['contained_in(p_a,p_f)_chrP']
).abs()

In [None]:
# Preview the first rows rather than dumping the whole frame into the output.
df.head()