In [12]:
import sys
import os
import pandas as pd
from tqdm import tqdm
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from mta_metrics import bert_score_sim, \
    bert_score_contained_in, weighted_bert_score_contained_in, chrp_contained_in
from mta_metrics.alignment import awesome_align, usw_nm
from translate import gpt4_translate

In [9]:
# Load the per-idiom dataset, keep only rows flagged `well_formed`, and draw a
# reproducible 50-row sample (fixed seed) with a fresh 0..49 index.
# The CSV carries a saved index that shows up as the 'Unnamed: 0' column.
df = (
    pd.read_csv('per_idiom_dataset.csv')
    .loc[lambda frame: frame['well_formed']]
    .sample(n=50, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,idiom,meaning,intended_figurative,intended_literal,maxVote_figurative,maxVote_literal,gold_figurative,gold_literal,well_formed,intended_ambiguous
0,up a creek without a paddle,being in trouble.,when our car broke down in the middle of nowhe...,"while canoeing, we accidentally dropped our pa...",figurative,literal,"['figurative', 'figurative']","['literal', 'literal']",True,found ourselves up a creek without a paddle
1,blind leading the blind,someone who does not understand something but ...,"when it comes to cooking, it's like the blind ...","in the experimental theater performance, they ...",figurative,literal,"['figurative', 'figurative']","['literal', 'literal']",True,the blind leading the blind
2,the sun rises and sets on them,to believe that someone is the most wonderful ...,she adores her children so much that the sun r...,"living on a remote island, the sun rises and s...",figurative,literal,"['figurative', 'figurative']","['literal', 'literal']",True,the sun rises and sets on them
3,where the rubber meets the road,when you have to face your challenges,the new employee will face a real challenge wh...,the intersection is where the rubber meets the...,figurative,literal,"['figurative', 'figurative']","['literal', 'literal']",True,where the rubber meets the road
4,the ball rolling,to start a conversation in a social setting,"to start the meeting, we need to get the ball ...","to play soccer, you must get the ball rolling ...",figurative,literal,"['figurative', 'figurative']","['literal', 'literal']",True,get the ball rolling


## Sample Translation: Spanish

In [10]:
# Translate the figurative, literal and ambiguous sentence of every idiom.
# NOTE: slow (roughly seven minutes for 50 rows in the recorded run) and not
# re-runnable on its own: the rename at the bottom removes the `intended_*`
# columns this cell reads, so re-execute the data-loading cell first.
tqdm.pandas()  # registers `progress_apply` on pandas objects


def _to_spanish(sentence):
    """Forward `sentence` to `gpt4_translate` with 'Spanish' as the language argument."""
    return gpt4_translate(sentence, 'Spanish')


for variant in ('figurative', 'literal', 'ambiguous'):
    df[f'gpt4_{variant}_translation'] = df[f'intended_{variant}'].progress_apply(_to_spanish)

# Long column name -> short name. `s_*` are the source sentences, `p_*` the
# GPT-4 outputs; a / f / l = ambiguous / figurative / literal.
short_names = {
    'intended_ambiguous': 's_a',
    'intended_figurative': 's_f',
    'intended_literal': 's_l',
    'gpt4_ambiguous_translation': 'p_a',
    'gpt4_figurative_translation': 'p_f',
    'gpt4_literal_translation': 'p_l',
}
# Keep only the idiom, its meaning, and the six sentence columns (in mapping order).
df = df[['idiom', 'meaning', *short_names]].rename(columns=short_names)

100%|██████████| 50/50 [02:43<00:00,  3.26s/it]
100%|██████████| 50/50 [03:02<00:00,  3.66s/it]
100%|██████████| 50/50 [01:16<00:00,  1.53s/it]


## USW, NM

In [13]:
# Align every source sentence with its translation, one pass per variant
# (ambiguous, figurative, literal), evaluated in that order.
alignments_a, alignments_f, alignments_l = [
    awesome_align(df['s_' + tag], df['p_' + tag]) for tag in ('a', 'f', 'l')
]

# `usw_nm` returns four values: the first two are stored as dataframe columns,
# the last two are kept as standalone variables for the summary cell below.
# NOTE(review): USW / NM presumably stand for "unaligned source words" and
# "non-monotonicity" -- confirm against mta_metrics.alignment.
usw_col, nm_col, usw_a, nm_a = usw_nm(df['s_a'], alignments_a, df['p_a'])
df['usw_a'] = usw_col
df['nm_a'] = nm_col

usw_col, nm_col, usw_f, nm_f = usw_nm(df['s_f'], alignments_f, df['p_f'])
df['usw_f'] = usw_col
df['nm_f'] = nm_col

usw_col, nm_col, usw_l, nm_l = usw_nm(df['s_l'], alignments_l, df['p_l'])
df['usw_l'] = usw_col
df['nm_l'] = nm_col

Loading dataset...


100%|██████████| 50/50 [00:00<00:00, 3958.83it/s]
Extracting alignments: 100%|██████████| 50/50 [00:00<00:00, 60.27it/s]


File ./source.txt.target.txt has been removed successfully
File ./source.txt has been removed successfully
File ./target.txt has been removed successfully
Loading dataset...


100%|██████████| 50/50 [00:00<00:00, 1746.04it/s]
Extracting alignments: 100%|██████████| 50/50 [00:01<00:00, 44.01it/s]


File ./source.txt.target.txt has been removed successfully
File ./source.txt has been removed successfully
File ./target.txt has been removed successfully
Loading dataset...


100%|██████████| 50/50 [00:00<00:00, 1667.34it/s]
Extracting alignments: 100%|██████████| 50/50 [00:01<00:00, 45.26it/s]


File ./source.txt.target.txt has been removed successfully
File ./source.txt has been removed successfully
File ./target.txt has been removed successfully


In [14]:
# One line per metric (USW first, then NM); within a line the values are
# ordered ambiguous, figurative, literal.
for scores in ((usw_a, usw_f, usw_l), (nm_a, nm_f, nm_l)):
    print(*scores)

11.6 10.966810966810966 7.932011331444759
7.557041446208113 5.977530322159433 4.640564034114135


## Contained In

In [9]:
# Score the translation of the ambiguous sentence (p_a) against the figurative
# (p_f) and literal (p_l) translations, first with the BERTScore-based helper
# and then with the chrP-based helper.
# NOTE(review): 'sp' is not the ISO 639-1 code for Spanish ('es'); confirm which
# language identifiers `bert_score_contained_in` actually expects.
for ref in ('f', 'l'):
    df[f'contained_in(p_a,p_{ref})_BERT'] = bert_score_contained_in(df['p_a'], df[f'p_{ref}'], 'sp')
for ref in ('f', 'l'):
    df[f'contained_in(p_a,p_{ref})_chrP'] = chrp_contained_in(df['p_a'], df[f'p_{ref}'])

# Sensitivity = absolute gap between the literal and figurative scores, per metric.
for metric in ('BERT', 'chrP'):
    literal_score = df[f'contained_in(p_a,p_l)_{metric}']
    figurative_score = df[f'contained_in(p_a,p_f)_{metric}']
    df[f'sensitivity_{metric}'] = (literal_score - figurative_score).abs()

calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:03<00:00,  3.47s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 178.96it/s]


done in 3.48 seconds, 2.88 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 218.15it/s]

done in 2.71 seconds, 3.69 sentences/sec



