Skip to content

Commit

Permalink
Update semantics WS dataset (#3204)
Browse files Browse the repository at this point in the history
Updated variant with additional gradient (<25%(no), 25%, 50%, 75%,
<100%(yes), 100%(same))
  • Loading branch information
0x22almostEvil committed May 26, 2023
1 parent 812d1b8 commit 86fa745
Show file tree
Hide file tree
Showing 2 changed files with 222 additions and 44 deletions.
21 changes: 3 additions & 18 deletions data/datasets/semantics_ws_qna_oa/data_process.py
Expand Up @@ -16,30 +16,15 @@ def create_qna(row):
random_num = random.randint(0, 2)

# extract rows' vals
con_type = row["Type"]
lang = row["Language"]
con_type = row["Type"]
word1 = row["Word1"]
word2 = row["Word2"]

score_percent = row["Score"]
# 0 - yes; 1 - 50%, 2 - no
if con_type == "sim":
instruction = random_stuff.random_dict_sim_q[lang][random_num].format(word1=word1, word2=word2)
else:
instruction = random_stuff.random_dict_rel_q[lang][random_num].format(word1=word1, word2=word2)
if score_percent < 3.0 and con_type == "sim":
response = random_stuff.random_dict_sim_a[lang][2][random_num].format(word1=word1, word2=word2)
elif score_percent < 3.0 and con_type == "rel":
response = random_stuff.random_dict_rel_a[lang][2][random_num].format(word1=word1, word2=word2)
elif score_percent < 9 and con_type == "sim":
response = random_stuff.random_dict_sim_a[lang][1][random_num].format(word1=word1, word2=word2)
elif score_percent < 9 and con_type == "rel":
response = random_stuff.random_dict_rel_a[lang][1][random_num].format(word1=word1, word2=word2)
elif score_percent >= 9 and con_type == "sim":
response = random_stuff.random_dict_sim_a[lang][0][random_num].format(word1=word1, word2=word2)
elif score_percent >= 9 and con_type == "rel":
response = random_stuff.random_dict_rel_a[lang][0][random_num].format(word1=word1, word2=word2)

instruction = random_stuff.qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, True)
response = random_stuff.qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, False)
source = "WordSim353"
metadata = {
"language": lang,
Expand Down
245 changes: 219 additions & 26 deletions data/datasets/semantics_ws_qna_oa/random_stuff.py
Expand Up @@ -24,6 +24,28 @@

# sim_answers:

# same words;
random_list_sim_en_a_same = [
"Yes, because it's the same word.",
'Of course, we\'re talking about the same word: "{word1}".',
"You repeated '{word1}' twice.",
]
random_list_sim_ru_a_same = [
"Да, ведь это одно и то же слово.",
'Конечно, ведь речь идёт об одном слове: "{word1}".',
"Вы повторили '{word1}' дважды.",
]
random_list_sim_de_a_same = [
"Ja, denn es ist dasselbe Wort.",
'Natürlich, wir sprechen über dasselbe Wort: "{word1}".',
"Du hast '{word1}' zweimal wiederholt.",
]
random_list_sim_it_a_same = [
"Sì, perché è la stessa parola.",
'Certo, stiamo parlando della stessa parola: "{word1}".',
"Hai ripetuto '{word1}' due volte.",
]

# yes;
random_list_sim_en_a_y = [
"Yes, {word1} and {word2} are synonymous.",
Expand All @@ -40,13 +62,34 @@
'"{word1}" und "{word2}" sind Synonyme.',
"Ja. Der Typ der Verbindung zwischen den Wörtern '{word1}' und '{word2}' ist synonym.",
]

random_list_sim_it_a_y = [
"Sì, {word1} e {word2} sono sinonimi.",
'"{word1}" e "{word2}" sono sinonimi.',
"Sì. Il tipo di connessione tra le parole '{word1}' e '{word2}' è sinonimico.",
]

# 75%;
random_list_sim_en_a_75 = [
"There is a big conceptual meaning similarity between the words {word1} and {word2}, but they are not exactly synonymous.",
'The words "{word1}" and "{word2}" share a significant similarity, but they cannot be considered true synonyms.',
"While {word1} and {word2} are not interchangeable, they do have a substantial overlap in meaning.",
]
random_list_sim_ru_a_75 = [
"Между словами {word1} и {word2} есть большое сходство, но они не являются полноценными синонимами.",
'Слова "{word1}" и "{word2}" имеют значительное сходство, но не могут считаться полноценными синонимами.',
"Хотя {word1} и {word2} не являются взаимозаменяемыми, у них есть значительное пересечение в значении.",
]
# German answer templates for the "75%" similarity bucket: the words overlap
# strongly in meaning but are not true synonyms.
# Each template is filled via str.format(word1=..., word2=...).
random_list_sim_de_a_75 = [
    "Es gibt eine große Ähnlichkeit zwischen den Wörtern {word1} und {word2}, aber sie sind nicht exakt synonym.",
    'Die Wörter "{word1}" und "{word2}" weisen eine erhebliche Ähnlichkeit auf, können aber nicht als vollwertige Synonyme betrachtet werden.',
    "Obwohl {word1} und {word2} nicht austauschbar sind, haben sie eine wesentliche Überlappung in der Bedeutung.",
]
random_list_sim_it_a_75 = [
"C'è una grande somiglianza tra le parole {word1} e {word2}, ma non sono esattamente sinonimi.",
'Le parole "{word1}" e "{word2}" condividono una notevole somiglianza, ma non possono essere considerate sinonimi veri e propri.',
"Anche se {word1} e {word2} non sono interscambiabili, hanno una sostanziale sovrapposizione di significato.",
]

# 50%;
random_list_sim_en_a_50 = [
"There is some connection between the words {word1} and {word2}, but they are not full-fledged synonyms.",
Expand All @@ -69,6 +112,28 @@
"Sì, c'è una connessione tra le parole '{word1}' e '{word2}', ma non possono essere chiamate sinonimi.",
]

# 25%;
random_list_sim_en_a_25 = [
"No, {word1} and {word2} are not really synonymous, and they have very little conceptual meaning in common.",
'The words "{word1}" and "{word2}" do not have the same meaning, and they share only a small amount of conceptual overlap.',
"While there is some similarity between {word1} and {word2}, they cannot be considered synonyms as their conceptual meaning has very little overlap.",
]
random_list_sim_ru_a_25 = [
"Нет, {word1} и {word2} не являются совсем синонимами, и у них очень мало общего в плане концептуального значения.",
'Слова "{word1}" и "{word2}" не имеют одинакового значения, и у них есть только небольшое концептуальное пересечение.',
"Хотя между {word1} и {word2} есть некоторое сходство, они не могут считаться синонимами, поскольку их концептуальное значение имеет очень мало общего.",
]
# German answer templates for the "25%" similarity bucket: the words share
# only a small amount of meaning and are not synonyms.
# Each template is filled via str.format(word1=..., word2=...).
random_list_sim_de_a_25 = [
    "Nein, {word1} und {word2} sind nicht wirklich Synonyme, und sie haben sehr wenig konzeptionelle Bedeutung gemeinsam.",
    'Die Wörter "{word1}" und "{word2}" haben nicht dieselbe Bedeutung, und sie teilen nur eine geringe konzeptuelle Überschneidung.',
    "Obwohl {word1} und {word2} einige Ähnlichkeiten aufweisen, können sie nicht als Synonyme betrachtet werden, da ihr konzeptuelles Bedeutungsfeld nur sehr wenig gemeinsam hat.",
]
random_list_sim_it_a_25 = [
"No, {word1} e {word2} non sono veri e propri sinonimi, e hanno molto poco in comune a livello concettuale.",
'Le parole "{word1}" e "{word2}" non hanno lo stesso significato, e condividono solo una piccola quantità di sovrapposizione concettuale.',
"Anche se c'è una certa somiglianza tra {word1} e {word2}, non possono essere considerati sinonimi poiché il loro campo semantico ha molto poco in comune.",
]

# no;
random_list_sim_en_a_n = [
"No, the words {word1} and {word2} are not synonyms.",
Expand Down Expand Up @@ -116,6 +181,31 @@

# rel_answers:

# same words;
random_list_rel_en_a_same = [
"Yes, because it's the same word.",
'Of course, we\'re talking about the same word: "{word1}".',
"You repeated '{word1}' twice.",
]

random_list_rel_ru_a_same = [
"Да, ведь это одно и то же слово.",
'Конечно, ведь речь идёт об одном слове: "{word1}".',
"Вы повторили '{word1}' дважды.",
]

random_list_rel_de_a_same = [
"Ja, denn es ist dasselbe Wort.",
'Natürlich, wir sprechen über dasselbe Wort: "{word1}".',
"Du hast '{word1}' zweimal wiederholt.",
]

random_list_rel_it_a_same = [
"Sì, perché è la stessa parola.",
'Certo, stiamo parlando della stessa parola: "{word1}".',
"Hai ripetuto '{word1}' due volte.",
]

# yes;
random_list_rel_en_a_y = [
"Yes, there is an association between the words {word1} and {word2}.",
Expand All @@ -138,6 +228,28 @@
"C'è un legame associativo diretto tra le parole '{word1}' e '{word2}'.",
]

# 75%;
random_list_rel_en_a_75 = [
"There is a significant association between {word1} and {word2}, but the level of relatedness is not really high, about 75%.",
'While "{word1}" and "{word2}" are related to some extent, their conceptual overlap is not very strong.',
"There is a moderate association between '{word1}' and '{word2}', indicating that they are related a lot, but not completely.",
]
random_list_rel_ru_a_75 = [
"Между словами {word1} и {word2} существует значительная связь, но уровень связанности не превышает 75%.",
'Хотя слова "{word1}" и "{word2}" имеют некоторую связь, их концептуальное сходство не так сильно высоко, чтобы их можно было назвать полностью ассоциативными.',
"Существует умеренная связь между словами '{word1}' и '{word2}', что указывает на то, что они сильно связаны между собой, но не полностью.",
]
random_list_rel_de_a_75 = [
"Es besteht eine signifikante Assoziation zwischen {word1} und {word2}, aber das Maß der Verwandtschaft ist nicht sehr hoch.",
'Obwohl "{word1}" und "{word2}" in gewisser Weise miteinander verbunden sind, ist ihre konzeptuelle Überlappung nicht sehr stark.',
"Es besteht eine mäßige Assoziation zwischen '{word1}' und '{word2}', was darauf hinweist, dass sie stark miteinander verbunden sind, aber nicht vollständig.",
]
random_list_rel_it_a_75 = [
"C'è una significativa associazione tra {word1} e {word2}, ma il livello di relazione non è molto alto.",
'Anche se "{word1}" e "{word2}" sono in qualche modo correlati, il loro sovrapporsi concettuale non è molto forte.',
"C'è una moderata associazione tra '{word1}' e '{word2}', indicando che sono molto correlati, ma non completamente.",
]

# 50%;
random_list_rel_en_a_50 = [
"There is a slight association between the words {word1} and {word2}.",
Expand All @@ -160,6 +272,28 @@
"C'è una certa associazione tra le parole '{word1}' e '{word2}'.",
]

# 25%;
random_list_rel_en_a_25 = [
"There is very little conceptual related meaning in common between {word1} and {word2}, with a low level of relatedness.",
"The association between {word1} and {word2} is weak, suggesting that they are not very related.",
"While there is some association between {word1} and {word2}, the level of relatedness is quite low.",
]
random_list_rel_ru_a_25 = [
"Между словами {word1} и {word2} очень мало общего в плане концептуальной связи, уровень связанности низкий.",
"Связь между словами {word1} и {word2} слабая, что указывает на то, что они не очень связаны между собой.",
"Хотя между словами {word1} и {word2} есть некоторая связь, уровень связанности довольно низкий.",
]
random_list_rel_de_a_25 = [
"Es gibt sehr wenig konzeptuell verwandte Bedeutung zwischen den Wörtern {word1} und {word2}, mit einem niedrigen Verwandtheitsgrad.",
"Die Assoziation zwischen {word1} und {word2} ist schwach, was darauf hindeutet, dass sie nicht sehr verwandt sind.",
"Obwohl es eine gewisse Assoziation zwischen {word1} und {word2} gibt, ist das Maß der Verwandtschaft recht gering.",
]
random_list_rel_it_a_25 = [
"C'è molto poco significato concettualmente correlato tra {word1} e {word2}, con un basso livello di correlazione.",
"L'associazione tra {word1} e {word2} è debole, suggerendo che non sono molto correlati.",
"Anche se c'è una certa associazione tra {word1} e {word2}, il livello di correlazione è piuttosto basso.",
]

# no;
random_list_rel_en_a_n = [
"No, there is no associative relationship between the words {word1} and {word2}",
Expand Down Expand Up @@ -201,49 +335,108 @@
}

# dicts for a
# sim - random_dict_sim_a["ru"][0]  # returns the list of "same word" answers for Russian
# Similarity answer templates, indexed as random_dict_sim_a[lang][bucket].
# Buckets: 0 - same word, 1 - yes, 2 - 75%, 3 - 50%, 4 - 25%, 5 - no.
# Each key appears exactly once per language; earlier duplicate entries for
# keys 0-2 were silently overwritten by the later ones and have been removed.
random_dict_sim_a = {
    "en": {
        0: random_list_sim_en_a_same,
        1: random_list_sim_en_a_y,
        2: random_list_sim_en_a_75,
        3: random_list_sim_en_a_50,
        4: random_list_sim_en_a_25,
        5: random_list_sim_en_a_n,
    },
    "ru": {
        0: random_list_sim_ru_a_same,
        1: random_list_sim_ru_a_y,
        2: random_list_sim_ru_a_75,
        3: random_list_sim_ru_a_50,
        4: random_list_sim_ru_a_25,
        5: random_list_sim_ru_a_n,
    },
    "de": {
        0: random_list_sim_de_a_same,
        1: random_list_sim_de_a_y,
        2: random_list_sim_de_a_75,
        3: random_list_sim_de_a_50,
        4: random_list_sim_de_a_25,
        5: random_list_sim_de_a_n,
    },
    "it": {
        0: random_list_sim_it_a_same,
        1: random_list_sim_it_a_y,
        2: random_list_sim_it_a_75,
        3: random_list_sim_it_a_50,
        4: random_list_sim_it_a_25,
        5: random_list_sim_it_a_n,
    },
}
# rel
# Relatedness answer templates, indexed as random_dict_rel_a[lang][bucket].
# Buckets: 0 - same word, 1 - yes, 2 - 75%, 3 - 50%, 4 - 25%, 5 - no.
# Each key appears exactly once per language; earlier duplicate entries for
# keys 0-2 were silently overwritten by the later ones and have been removed.
random_dict_rel_a = {
    "en": {
        0: random_list_rel_en_a_same,
        1: random_list_rel_en_a_y,
        2: random_list_rel_en_a_75,
        3: random_list_rel_en_a_50,
        4: random_list_rel_en_a_25,
        5: random_list_rel_en_a_n,
    },
    "ru": {
        0: random_list_rel_ru_a_same,
        1: random_list_rel_ru_a_y,
        2: random_list_rel_ru_a_75,
        3: random_list_rel_ru_a_50,
        4: random_list_rel_ru_a_25,
        5: random_list_rel_ru_a_n,
    },
    "de": {
        0: random_list_rel_de_a_same,
        1: random_list_rel_de_a_y,
        2: random_list_rel_de_a_75,
        3: random_list_rel_de_a_50,
        4: random_list_rel_de_a_25,
        5: random_list_rel_de_a_n,
    },
    "it": {
        0: random_list_rel_it_a_same,
        1: random_list_rel_it_a_y,
        2: random_list_rel_it_a_75,
        3: random_list_rel_it_a_50,
        4: random_list_rel_it_a_25,
        5: random_list_rel_it_a_n,
    },
}


# 0 - same, 1 - yes, 2 - 75, 3 - 50, 4 - 25, 5 - no
def qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, isQuestion):
    """Build either the question or the answer text for one word pair.

    Args:
        lang: Language key used to index the template dicts.
        word1: First word, substituted for ``{word1}`` in the template.
        word2: Second word, substituted for ``{word2}`` in the template.
        con_type: ``"sim"`` selects the similarity templates. For questions
            any other value selects the relatedness templates; for answers
            it must be ``"sim"`` or ``"rel"``.
        score_percent: Score of the pair. Only consulted for answers, where
            it picks the answer bucket and must not exceed 10.
        random_num: Index of the template variant inside the selected list.
        isQuestion: When true the question is returned, otherwise the answer.

    Returns:
        The formatted question or answer string.

    Raises:
        ValueError: If an answer is requested and ``con_type`` is neither
            ``"sim"`` nor ``"rel"``, or ``score_percent`` matches no bucket
            (greater than 10, or NaN).
    """
    if isQuestion:
        # The answer is irrelevant for a question, so it is never computed.
        questions = random_dict_sim_q if con_type == "sim" else random_dict_rel_q
        return questions[lang][random_num].format(word1=word1, word2=word2)

    # Answer buckets: 0 - same, 1 - yes, 2 - 75, 3 - 50, 4 - 25, 5 - no.
    # Each pair is (exclusive upper bound of the score, bucket index).
    score_buckets = ((1.85, 5), (3.85, 4), (6.3, 3), (8.85, 2), (10, 1))
    if score_percent == 10:
        bucket = 0
    else:
        bucket = next(
            (index for upper_bound, index in score_buckets if score_percent < upper_bound),
            None,
        )
    if bucket is None:
        # Previously `assert "Error"` (always true) let this fall through to
        # an UnboundLocalError on `response`; fail with a clear message instead.
        raise ValueError(f"score_percent out of range: {score_percent!r}")

    if con_type == "sim":
        answers = random_dict_sim_a
    elif con_type == "rel":
        answers = random_dict_rel_a
    else:
        raise ValueError(f"unknown con_type: {con_type!r}")

    return answers[lang][bucket][random_num].format(word1=word1, word2=word2)

0 comments on commit 86fa745

Please sign in to comment.