Update semantics WS dataset (#3204)

Updated variant with additional gradient (<25%(no), 25%, 50%, 75%, <100%(yes), 100%(same))
LAION-AI · May 26, 2023 · 86fa745 · 86fa745
1 parent 812d1b8
commit 86fa745
Show file tree

Hide file tree

Showing 2 changed files with 222 additions and 44 deletions.
diff --git a/data/datasets/semantics_ws_qna_oa/data_process.py b/data/datasets/semantics_ws_qna_oa/data_process.py
@@ -16,30 +16,15 @@ def create_qna(row):
         random_num = random.randint(0, 2)
 
         # extract rows' vals
-        con_type = row["Type"]
         lang = row["Language"]
+        con_type = row["Type"]
         word1 = row["Word1"]
         word2 = row["Word2"]
 
         score_percent = row["Score"]
         # answer tiers: 0 - same, 1 - yes, 2 - 75%, 3 - 50%, 4 - 25%, 5 - no (picked from the score in random_stuff.qna_random_magic)
-        if con_type == "sim":
-            instruction = random_stuff.random_dict_sim_q[lang][random_num].format(word1=word1, word2=word2)
-        else:
-            instruction = random_stuff.random_dict_rel_q[lang][random_num].format(word1=word1, word2=word2)
-        if score_percent < 3.0 and con_type == "sim":
-            response = random_stuff.random_dict_sim_a[lang][2][random_num].format(word1=word1, word2=word2)
-        elif score_percent < 3.0 and con_type == "rel":
-            response = random_stuff.random_dict_rel_a[lang][2][random_num].format(word1=word1, word2=word2)
-        elif score_percent < 9 and con_type == "sim":
-            response = random_stuff.random_dict_sim_a[lang][1][random_num].format(word1=word1, word2=word2)
-        elif score_percent < 9 and con_type == "rel":
-            response = random_stuff.random_dict_rel_a[lang][1][random_num].format(word1=word1, word2=word2)
-        elif score_percent >= 9 and con_type == "sim":
-            response = random_stuff.random_dict_sim_a[lang][0][random_num].format(word1=word1, word2=word2)
-        elif score_percent >= 9 and con_type == "rel":
-            response = random_stuff.random_dict_rel_a[lang][0][random_num].format(word1=word1, word2=word2)
-
+        instruction = random_stuff.qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, True)
+        response = random_stuff.qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, False)
         source = "WordSim353"
         metadata = {
             "language": lang,

diff --git a/data/datasets/semantics_ws_qna_oa/random_stuff.py b/data/datasets/semantics_ws_qna_oa/random_stuff.py
@@ -24,6 +24,28 @@
 
 # sim_answers:
 
+# same words;
+random_list_sim_en_a_same = [
+    "Yes, because it's the same word.",
+    'Of course, we\'re talking about the same word: "{word1}".',
+    "You repeated '{word1}' twice.",
+]
+random_list_sim_ru_a_same = [
+    "Да, ведь это одно и то же слово.",
+    'Конечно, ведь речь идёт об одном слове: "{word1}".',
+    "Вы повторили '{word1}' дважды.",
+]
+random_list_sim_de_a_same = [
+    "Ja, denn es ist dasselbe Wort.",
+    'Natürlich, wir sprechen über dasselbe Wort: "{word1}".',
+    "Du hast '{word1}' zweimal wiederholt.",
+]
+random_list_sim_it_a_same = [
+    "Sì, perché è la stessa parola.",
+    'Certo, stiamo parlando della stessa parola: "{word1}".',
+    "Hai ripetuto '{word1}' due volte.",
+]
+
 # yes;
 random_list_sim_en_a_y = [
     "Yes, {word1} and {word2} are synonymous.",
@@ -40,13 +62,34 @@
     '"{word1}" und "{word2}" sind Synonyme.',
     "Ja. Der Typ der Verbindung zwischen den Wörtern '{word1}' und '{word2}' ist synonym.",
 ]
-
 random_list_sim_it_a_y = [
     "Sì, {word1} e {word2} sono sinonimi.",
     '"{word1}" e "{word2}" sono sinonimi.',
     "Sì. Il tipo di connessione tra le parole '{word1}' e '{word2}' è sinonimico.",
 ]
 
+# 75%;
+random_list_sim_en_a_75 = [
+    "There is a big conceptual meaning similarity between the words {word1} and {word2}, but they are not exactly synonymous.",
+    'The words "{word1}" and "{word2}" share a significant similarity, but they cannot be considered true synonyms.',
+    "While {word1} and {word2} are not interchangeable, they do have a substantial overlap in meaning.",
+]
+random_list_sim_ru_a_75 = [
+    "Между словами {word1} и {word2} есть большое сходство, но они не являются полноценными синонимами.",
+    'Слова "{word1}" и "{word2}" имеют значительное сходство, но не могут считаться полноценными синонимами.',
+    "Хотя {word1} и {word2} не являются взаимозаменяемыми, у них есть значительное пересечение в значении.",
+]
+random_list_sim_de_a_75 = [
+    "Es gibt eine große Ähnlichkeit zwischen den Wörtern {word1} und {word2}, aber sie sind nicht exakt synonym.",
+    'Die Wörter "{word1}" und "{word2}" weisen eine erhebliche Ähnlichkeit auf, können aber nicht als vollwertige Synonyme betrachtet werden.',
+    "Obwohl {word1} und {word2} nicht austauschbar sind, haben sie eine wesentliche Überlappung in der Bedeutung.",
+]
+random_list_sim_it_a_75 = [
+    "C'è una grande somiglianza tra le parole {word1} e {word2}, ma non sono esattamente sinonimi.",
+    'Le parole "{word1}" e "{word2}" condividono una notevole somiglianza, ma non possono essere considerate sinonimi veri e propri.',
+    "Anche se {word1} e {word2} non sono interscambiabili, hanno una sostanziale sovrapposizione di significato.",
+]
+
 # 50%;
 random_list_sim_en_a_50 = [
     "There is some connection between the words {word1} and {word2}, but they are not full-fledged synonyms.",
@@ -69,6 +112,28 @@
     "Sì, c'è una connessione tra le parole '{word1}' e '{word2}', ma non possono essere chiamate sinonimi.",
 ]
 
+# 25%;
+random_list_sim_en_a_25 = [
+    "No, {word1} and {word2} are not really synonymous, and they have very little conceptual meaning in common.",
+    'The words "{word1}" and "{word2}" do not have the same meaning, and they share only a small amount of conceptual overlap.',
+    "While there is some similarity between {word1} and {word2}, they cannot be considered synonyms as their conceptual meaning has very little overlap.",
+]
+random_list_sim_ru_a_25 = [
+    "Нет, {word1} и {word2} не являются совсем синонимами, и у них очень мало общего в плане концептуального значения.",
+    'Слова "{word1}" и "{word2}" не имеют одинакового значения, и у них есть только небольшое концептуальное пересечение.',
+    "Хотя между {word1} и {word2} есть некоторое сходство, они не могут считаться синонимами, поскольку их концептуальное значение имеет очень мало общего.",
+]
+random_list_sim_de_a_25 = [
+    "Nein, {word1} und {word2} sind nicht wirklich Synonyme, und sie haben sehr wenig konzeptionelle Bedeutung gemeinsam.",
+    'Die Wörter "{word1}" und "{word2}" haben nicht dieselbe Bedeutung, und sie teilen nur eine geringe konzeptuelle Überschneidung.',
+    "Obwohl {word1} und {word2} einige Ähnlichkeiten aufweisen, können sie nicht als Synonyme betrachtet werden, da ihr konzeptuelles Bedeutungsfeld nur sehr wenig gemeinsam hat.",
+]
+random_list_sim_it_a_25 = [
+    "No, {word1} e {word2} non sono veri e propri sinonimi, e hanno molto poco in comune a livello concettuale.",
+    'Le parole "{word1}" e "{word2}" non hanno lo stesso significato, e condividono solo una piccola quantità di sovrapposizione concettuale.',
+    "Anche se c'è una certa somiglianza tra {word1} e {word2}, non possono essere considerati sinonimi poiché il loro campo semantico ha molto poco in comune.",
+]
+
 # no;
 random_list_sim_en_a_n = [
     "No, the words {word1} and {word2} are not synonyms.",
@@ -116,6 +181,31 @@
 
 # rel_answers:
 
+# same words;
+random_list_rel_en_a_same = [
+    "Yes, because it's the same word.",
+    'Of course, we\'re talking about the same word: "{word1}".',
+    "You repeated '{word1}' twice.",
+]
+
+random_list_rel_ru_a_same = [
+    "Да, ведь это одно и то же слово.",
+    'Конечно, ведь речь идёт об одном слове: "{word1}".',
+    "Вы повторили '{word1}' дважды.",
+]
+
+random_list_rel_de_a_same = [
+    "Ja, denn es ist dasselbe Wort.",
+    'Natürlich, wir sprechen über dasselbe Wort: "{word1}".',
+    "Du hast '{word1}' zweimal wiederholt.",
+]
+
+random_list_rel_it_a_same = [
+    "Sì, perché è la stessa parola.",
+    'Certo, stiamo parlando della stessa parola: "{word1}".',
+    "Hai ripetuto '{word1}' due volte.",
+]
+
 # yes;
 random_list_rel_en_a_y = [
     "Yes, there is an association between the words {word1} and {word2}.",
@@ -138,6 +228,28 @@
     "C'è un legame associativo diretto tra le parole '{word1}' e '{word2}'.",
 ]
 
+# 75%;
+random_list_rel_en_a_75 = [
+    "There is a significant association between {word1} and {word2}, but the level of relatedness is not really high, about 75%.",
+    'While "{word1}" and "{word2}" are related to some extent, their conceptual overlap is not very strong.',
+    "There is a moderate association between '{word1}' and '{word2}', indicating that they are related a lot, but not completely.",
+]
+random_list_rel_ru_a_75 = [
+    "Между словами {word1} и {word2} существует значительная связь, но уровень связанности не превышает 75%.",
+    'Хотя слова "{word1}" и "{word2}" имеют некоторую связь, их концептуальное сходство не так сильно высоко, чтобы их можно было назвать полностью ассоциативными.',
+    "Существует умеренная связь между словами '{word1}' и '{word2}', что указывает на то, что они сильно связаны между собой, но не полностью.",
+]
+random_list_rel_de_a_75 = [
+    "Es besteht eine signifikante Assoziation zwischen {word1} und {word2}, aber das Maß der Verwandtschaft ist nicht sehr hoch.",
+    'Obwohl "{word1}" und "{word2}" in gewisser Weise miteinander verbunden sind, ist ihre konzeptuelle Überlappung nicht sehr stark.',
+    "Es besteht eine mäßige Assoziation zwischen '{word1}' und '{word2}', was darauf hinweist, dass sie stark miteinander verbunden sind, aber nicht vollständig.",
+]
+random_list_rel_it_a_75 = [
+    "C'è una significativa associazione tra {word1} e {word2}, ma il livello di relazione non è molto alto.",
+    'Anche se "{word1}" e "{word2}" sono in qualche modo correlati, il loro sovrapporsi concettuale non è molto forte.',
+    "C'è una moderata associazione tra '{word1}' e '{word2}', indicando che sono molto correlati, ma non completamente.",
+]
+
 # 50%;
 random_list_rel_en_a_50 = [
     "There is a slight association between the words {word1} and {word2}.",
@@ -160,6 +272,28 @@
     "C'è una certa associazione tra le parole '{word1}' e '{word2}'.",
 ]
 
+# 25%;
+random_list_rel_en_a_25 = [
+    "There is very little conceptual related meaning in common between {word1} and {word2}, with a low level of relatedness.",
+    "The association between {word1} and {word2} is weak, suggesting that they are not very related.",
+    "While there is some association between {word1} and {word2}, the level of relatedness is quite low.",
+]
+random_list_rel_ru_a_25 = [
+    "Между словами {word1} и {word2} очень мало общего в плане концептуальной связи, уровень связанности низкий.",
+    "Связь между словами {word1} и {word2} слабая, что указывает на то, что они не очень связаны между собой.",
+    "Хотя между словами {word1} и {word2} есть некоторая связь, уровень связанности довольно низкий.",
+]
+random_list_rel_de_a_25 = [
+    "Es gibt sehr wenig konzeptuell verwandte Bedeutung zwischen den Wörtern {word1} und {word2}, mit einem niedrigen Verwandtheitsgrad.",
+    "Die Assoziation zwischen {word1} und {word2} ist schwach, was darauf hindeutet, dass sie nicht sehr verwandt sind.",
+    "Obwohl es eine gewisse Assoziation zwischen {word1} und {word2} gibt, ist das Maß der Verwandtschaft recht gering.",
+]
+random_list_rel_it_a_25 = [
+    "C'è molto poco significato concettualmente correlato tra {word1} e {word2}, con un basso livello di correlazione.",
+    "L'associazione tra {word1} e {word2} è debole, suggerendo che non sono molto correlati.",
+    "Anche se c'è una certa associazione tra {word1} e {word2}, il livello di correlazione è piuttosto basso.",
+]
+
 # no;
 random_list_rel_en_a_n = [
     "No, there is no associative relationship between the words {word1} and {word2}",
@@ -201,49 +335,108 @@
 }
 
 # dicts for a
-# sim - random_dict_sim_a["ru"][0]  # returns the list of "yes" answers for Russian
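+# sim - random_dict_sim_a["ru"][1]  # returns the list of "yes" answers for Russian (0 - same, 1 - yes, 2 - 75%, 3 - 50%, 4 - 25%, 5 - no)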
 random_dict_sim_a = {
     "en": {
-        0: random_list_sim_en_a_y,
-        1: random_list_sim_en_a_50,
-        2: random_list_sim_en_a_n,
+        0: random_list_sim_en_a_same,
+        1: random_list_sim_en_a_y,
+        2: random_list_sim_en_a_75,
+        3: random_list_sim_en_a_50,
+        4: random_list_sim_en_a_25,
+        5: random_list_sim_en_a_n,
     },
     "ru": {
-        0: random_list_sim_ru_a_y,
-        1: random_list_sim_ru_a_50,
-        2: random_list_sim_ru_a_n,
+        0: random_list_sim_ru_a_same,
+        1: random_list_sim_ru_a_y,
+        2: random_list_sim_ru_a_75,
+        3: random_list_sim_ru_a_50,
+        4: random_list_sim_ru_a_25,
+        5: random_list_sim_ru_a_n,
     },
     "de": {
-        0: random_list_sim_de_a_y,
-        1: random_list_sim_de_a_50,
-        2: random_list_sim_de_a_n,
+        0: random_list_sim_de_a_same,
+        1: random_list_sim_de_a_y,
+        2: random_list_sim_de_a_75,
+        3: random_list_sim_de_a_50,
+        4: random_list_sim_de_a_25,
+        5: random_list_sim_de_a_n,
     },
     "it": {
-        0: random_list_sim_it_a_y,
-        1: random_list_sim_it_a_50,
-        2: random_list_sim_it_a_n,
+        0: random_list_sim_it_a_same,
+        1: random_list_sim_it_a_y,
+        2: random_list_sim_it_a_75,
+        3: random_list_sim_it_a_50,
+        4: random_list_sim_it_a_25,
+        5: random_list_sim_it_a_n,
     },
 }
 # rel
 random_dict_rel_a = {
     "en": {
-        0: random_list_rel_en_a_y,
-        1: random_list_rel_en_a_50,
-        2: random_list_rel_en_a_n,
+        0: random_list_rel_en_a_same,
+        1: random_list_rel_en_a_y,
+        2: random_list_rel_en_a_75,
+        3: random_list_rel_en_a_50,
+        4: random_list_rel_en_a_25,
+        5: random_list_rel_en_a_n,
     },
     "ru": {
-        0: random_list_rel_ru_a_y,
-        1: random_list_rel_ru_a_50,
-        2: random_list_rel_ru_a_n,
+        0: random_list_rel_ru_a_same,
+        1: random_list_rel_ru_a_y,
+        2: random_list_rel_ru_a_75,
+        3: random_list_rel_ru_a_50,
+        4: random_list_rel_ru_a_25,
+        5: random_list_rel_ru_a_n,
     },
     "de": {
-        0: random_list_rel_de_a_y,
-        1: random_list_rel_de_a_50,
-        2: random_list_rel_de_a_n,
+        0: random_list_rel_de_a_same,
+        1: random_list_rel_de_a_y,
+        2: random_list_rel_de_a_75,
+        3: random_list_rel_de_a_50,
+        4: random_list_rel_de_a_25,
+        5: random_list_rel_de_a_n,
     },
     "it": {
-        0: random_list_rel_it_a_y,
-        1: random_list_rel_it_a_50,
-        2: random_list_rel_it_a_n,
+        0: random_list_rel_it_a_same,
+        1: random_list_rel_it_a_y,
+        2: random_list_rel_it_a_75,
+        3: random_list_rel_it_a_50,
+        4: random_list_rel_it_a_25,
+        5: random_list_rel_it_a_n,
     },
 }
+
+
+# 0 - same, 1 - yes, 2 - 75, 3 - 50, 4 - 25, 5 - no
+# (exclusive upper score bound, answer tier) pairs, checked in ascending order.
+# A score of exactly 10 is handled separately and maps to tier 0 ("same word").
+_SCORE_TIERS = ((1.85, 5), (3.85, 4), (6.3, 3), (8.85, 2), (10, 1))
+
+
+def qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, isQuestion):
+    """Build either the question or the answer text for one word pair.
+
+    Args:
+        lang: Language key used to index the question/answer dicts
+            (the answer dicts define "en", "ru", "de" and "it").
+        word1: First word, substituted into the ``{word1}`` placeholder.
+        word2: Second word, substituted into the ``{word2}`` placeholder.
+        con_type: "sim" selects the similarity templates. For questions any
+            other value selects the relatedness templates; for answers it
+            must be exactly "sim" or "rel".
+        score_percent: Numeric score of the pair; selects the answer tier.
+            Values below 1.85 map to "no", then 25% / 50% / 75% / "yes" up to
+            (but excluding) 10, and exactly 10 maps to "same word".
+        random_num: Index of the phrasing variant inside the chosen list.
+        isQuestion: When true return the question, otherwise the answer.
+
+    Returns:
+        The formatted question or answer string.
+
+    Raises:
+        ValueError: If an answer is requested and ``con_type`` is neither
+            "sim" nor "rel", or ``score_percent`` is above 10 or not
+            comparable (e.g. NaN). Previously this case fell through a no-op
+            ``assert "Error"`` and failed with ``UnboundLocalError``.
+    """
+    if isQuestion:
+        # Questions do not depend on the score, so it is not validated here.
+        questions = random_dict_sim_q if con_type == "sim" else random_dict_rel_q
+        return questions[lang][random_num].format(word1=word1, word2=word2)
+
+    if con_type == "sim":
+        answers = random_dict_sim_a
+    elif con_type == "rel":
+        answers = random_dict_rel_a
+    else:
+        raise ValueError("Unknown connection type: {!r}".format(con_type))
+
+    if score_percent == 10:
+        tier = 0
+    else:
+        # First bound the score falls under wins; None means no tier matched.
+        tier = next((t for bound, t in _SCORE_TIERS if score_percent < bound), None)
+        if tier is None:
+            raise ValueError("Score out of range: {!r}".format(score_percent))
+
+    return answers[lang][tier][random_num].format(word1=word1, word2=word2)