In [21]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [35]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()
import openai
import config

In [64]:
df = pd.read_csv('../meaning.csv')
df.head()

Unnamed: 0,idiom,meaning
0,get to [pron] point,get to the main point of a discussion or conve...
1,bring (somebody) to ([pron]) knees,force someone to submit to one's will or power.
2,make up [pron] mind,come to a decision or opinion.
3,build bridges,create a connection or relationship with someone.
4,raise eyebrows,express surprise or disapproval.


In [65]:
dfs_lang = [
    pd.read_csv('./kor/idioms_korean.csv'),
    pd.read_csv('./chn/idioms_chinese.csv'),
]
langs = ['kor', 'chn']

In [66]:
# Compute embedding for both lists

for i in range(len(langs)):
    df_lang = dfs_lang[i]
    def get_similarity(sentence1, sentence2):
        embedding_1= model.encode(sentence1, convert_to_tensor=True)
        embedding_2 = model.encode(sentence2, convert_to_tensor=True)
        p = float(util.pytorch_cos_sim(embedding_1, embedding_2)[0][0])
        return p
    def get_equivalence(row, lang_name=langs[i]):
        p_arr = df_lang['meaning'].apply(
            lambda key: get_similarity(row['meaning'], key))
        i_max = set(np.argpartition(p_arr, -6)[-6:])
        i_thr = set(np.where(p_arr > 0.5)[0])
        row[lang_name + '_idiom_ss'] = df_lang['idiom'][i_max & i_thr].tolist()
        row[lang_name + '_meaning_ss'] = df_lang['meaning'][i_max & i_thr].tolist()
        return row
    df = df[:25].progress_apply(get_equivalence, axis=1)

df

100%|██████████| 25/25 [03:19<00:00,  8.00s/it]
100%|██████████| 25/25 [00:44<00:00,  1.80s/it]


Unnamed: 0,idiom,meaning,kor_idiom_ss,kor_meaning_ss,chn_idiom_ss,chn_meaning_ss
0,get to [pron] point,get to the main point of a discussion or conve...,"[입이 싸다, 입을 떼다, 말이 많다]","[be talkative, begin to talk, be talkative]",[],[]
1,bring (somebody) to ([pron]) knees,force someone to submit to one's will or power.,"[무릎 을 꿇다, 바람을 넣다]","[to submit or surrender, incite a person to ac...",[强人所难],[to force someone to do something]
2,make up [pron] mind,come to a decision or opinion.,[],[],[],[]
3,build bridges,create a connection or relationship with someone.,"[발을 끊다, 손을 나누다]","[to end relations with someone, to seperate fr...",[骨肉之情],[very and intimate close relationship]
4,raise eyebrows,express surprise or disapproval.,[가슴이 내려앉다],[be startled],[],[]
5,come to blows,engage in physical violence.,"[몸을 쓰다, 손을 보다]","[be physically active, show one's anger by usi...",[],[]
6,home truth,"a frank and honest statement, often unpleasant...",[],[],[],[]
7,hot potato,a situation or issue that is difficult to hand...,"[허리가 부러지다, 손톱 여물을 썬다]","[a difficult to manage situation, to deal with...",[同舟共济],[to get through a time of difficulty by settli...
8,tie [pron] knot,get married.,"[머리를 얹다, 바람이 나다, 살을 섞다]","[to get married, have a love affair, live a ma...",[],[]
9,quantum leap,"a sudden, dramatic, and significant advancemen...",[],[],"[日新月异, 日渐激烈]","[rapid change, improvement, extreme progress]"


In [67]:
openai.organization = config.openai_organization
openai.api_key = config.openai_api_key

for i in range(len(langs)):
    df_lang = dfs_lang[i]
    lang_name = langs[i]
    def get_equivalence_gpt(row):
        prompt = 'Which of the following means has the same meaning as the phrase "' + row['meaning'].strip('.') + '"? There might be multiple answers. Only answer the numbers in a comma separated format (example: number1, number2, ...).\n'
        if len(row[lang_name + '_meaning_ss']) == 0: 
            return row
        for i in range(len(row[lang_name + '_meaning_ss'])):
            prompt += str(i) + ": \"" + row[lang_name + '_meaning_ss'][i] + "\"\n"
        prompt += str(len(row[lang_name + '_meaning_ss'])) + ": " + 'None of the above' + "\n" + "numbers: "
        answer = openai.Completion.create(
            model="text-davinci-003",
            prompt = prompt,
            max_tokens=400,
            temperature=0, # the higher this value, the less deterministic
            top_p=1, # the higher this value, the wider range of vocab is used
        ).choices[0].text.strip()
        try:
            answer = answer.split(',')
            answer = [int(i.strip()) for i in answer]
            if len(row[lang_name + '_idiom_ss']) in answer or len(answer) == 0:
                return row
            row[lang_name + '_idiom_gpt'] = [row[lang_name + '_idiom_ss'][i] for i in answer]
            row[lang_name + '_meaning_gpt'] = [row[lang_name + '_meaning_ss'][i] for i in answer]
        except:
            return row
        return row

    df[lang_name + '_idiom_gpt'] = [[] for _ in range(len(df))]
    df[lang_name + '_meaning_gpt'] = [[] for _ in range(len(df))]
    df = df.progress_apply(get_equivalence_gpt, axis=1)

100%|██████████| 25/25 [00:08<00:00,  2.94it/s]
100%|██████████| 25/25 [00:02<00:00,  9.94it/s]


In [68]:
df

Unnamed: 0,idiom,meaning,kor_idiom_ss,kor_meaning_ss,chn_idiom_ss,chn_meaning_ss,kor_idiom_gpt,kor_meaning_gpt,chn_idiom_gpt,chn_meaning_gpt
0,get to [pron] point,get to the main point of a discussion or conve...,"[입이 싸다, 입을 떼다, 말이 많다]","[be talkative, begin to talk, be talkative]",[],[],[말이 많다],[be talkative],[],[]
1,bring (somebody) to ([pron]) knees,force someone to submit to one's will or power.,"[무릎 을 꿇다, 바람을 넣다]","[to submit or surrender, incite a person to ac...",[强人所难],[to force someone to do something],[],[],[强人所难],[to force someone to do something]
2,make up [pron] mind,come to a decision or opinion.,[],[],[],[],[],[],[],[]
3,build bridges,create a connection or relationship with someone.,"[발을 끊다, 손을 나누다]","[to end relations with someone, to seperate fr...",[骨肉之情],[very and intimate close relationship],[],[],[骨肉之情],[very and intimate close relationship]
4,raise eyebrows,express surprise or disapproval.,[가슴이 내려앉다],[be startled],[],[],[가슴이 내려앉다],[be startled],[],[]
5,come to blows,engage in physical violence.,"[몸을 쓰다, 손을 보다]","[be physically active, show one's anger by usi...",[],[],[손을 보다],[show one's anger by using violence against so...,[],[]
6,home truth,"a frank and honest statement, often unpleasant...",[],[],[],[],[],[],[],[]
7,hot potato,a situation or issue that is difficult to hand...,"[허리가 부러지다, 손톱 여물을 썬다]","[a difficult to manage situation, to deal with...",[同舟共济],[to get through a time of difficulty by settli...,"[허리가 부러지다, 손톱 여물을 썬다]","[a difficult to manage situation, to deal with...",[],[]
8,tie [pron] knot,get married.,"[머리를 얹다, 바람이 나다, 살을 섞다]","[to get married, have a love affair, live a ma...",[],[],"[머리를 얹다, 살을 섞다]","[to get married, live a married life]",[],[]
9,quantum leap,"a sudden, dramatic, and significant advancemen...",[],[],"[日新月异, 日渐激烈]","[rapid change, improvement, extreme progress]",[],[],"[日新月异, 日渐激烈]","[rapid change, improvement, extreme progress]"
