In [1]:
import csv
import json
import os

import deepl
import numpy as np
import pandas as pd
import wn
from dotenv import load_dotenv
from tqdm.notebook import tqdm

load_dotenv()

True

In [2]:
# DeepL credentials are read from the environment (load_dotenv() above pulls
# in a local .env file, if any).
auth_key = os.getenv("DEEPL_AUTH_KEY")
translator = deepl.Translator(auth_key)
# Path of the CSV that the translation cell writes.
filename = "data/gaps_translation.csv"

# Explicit UTF-8 so parsing does not depend on the platform's locale encoding.
with open("trees_path/path_with_gaps.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# JSON Lines file: one record per line.
wordnet = pd.read_json(path_or_buf="data/wordnet_translated_clean.jsonl", lines=True)

In [3]:
def transform_list(input_list):
    """Flatten groups of strings into a single display string.

    Items inside each group are joined with ", "; the resulting group
    strings are then joined with "; ".
    """
    joined_groups = map(", ".join, input_list)
    return "; ".join(joined_groups)


def get_translation(text_data):
    """Translate ``text_data`` with the module-level DeepL ``translator``.

    The request is made with ``target_lang="UK"`` and only the ``.text``
    attribute of the returned result is handed back.
    """
    result = translator.translate_text(text_data, target_lang="UK")
    return result.text

In [5]:
# Column order of the output CSV; every row written below follows it.
header = [
    "PWN",
    "ILI",
    "POS",
    "Tree File",
    "Gap",
    "DeepL Direct",
    "DeepL Contextualized",
    "Translated Wordnet",
    "Lemmas",
    "Translated Wordnet Gloss",
    "Gloss",
    "Hypernyms",
    "Hyponyms",
]

# (ILI, POS) pairs already written, so a synset that shows up in several
# trees is translated and emitted only once.
seen = set()

# Minimum `freq` a translated-wordnet row needs to be used as a reference.
MIN_FREQ = 0.1
# Loop-invariant half of the lookup mask, computed once instead of per gap.
frequent_rows = wordnet.freq > MIN_FREQ

# Separators that may divide "<lemmas>" from "<gloss>" in the translated
# sentence: the translation can come back with a hyphen, an em dash or an
# en dash, so all three are accepted.
LEMMA_GLOSS_SEPARATORS = (" - ", " — ", " – ")


def _first_translated_lemma(sentence_translation):
    """Return the first lemma of a translated "<lemmas> — <gloss>." sentence."""
    head = sentence_translation
    for separator in LEMMA_GLOSS_SEPARATORS:
        head = head.split(separator)[0]
    return head.split(",")[0].strip()


# Explicit UTF-8: the rows contain non-ASCII text and must not depend on the
# platform's default encoding.
with open(filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    for tree in tqdm(data, total=len(data)):
        path = tree["path"]
        # The first key of the path identifies the tree the gap belongs to.
        tree_pwn = next(iter(path), None)
        for key, value in path.items():
            if "*" not in value["title"]:
                continue

            ss = wn.synset(key)
            ili = value["ili"]
            dedup_key = (ili, ss.pos)
            # Skip duplicates *before* calling DeepL so no requests are spent
            # on rows that would be discarded anyway.
            if dedup_key in seen:
                continue

            # Drops the first and last character of the title
            # (presumably the "*" markers around the word — TODO confirm).
            gap = value["title"][1:-1]
            gloss = ss.definition()
            lemmas = ", ".join(ss.lemmas())
            hypernyms = [hypernym.lemmas() for hypernym in ss.hypernyms()]
            hyponyms = [hyponym.lemmas() for hyponym in ss.hyponyms()]
            sentence = f"{lemmas} — {gloss}."

            deepL_direct = get_translation(gap)
            # Translating the lemmas together with the gloss gives DeepL
            # context; only the first translated lemma is kept.
            deepL_contextualized = _first_translated_lemma(
                get_translation(sentence)
            )

            matches = wordnet.loc[(wordnet.ili == ili) & frequent_rows]
            try:
                synset_entry = matches["synsets"].iloc[0][0]
                # Key of `glosses_raw` with the highest associated value.
                wordnet_gloss = max(
                    synset_entry["glosses_raw"],
                    key=synset_entry["glosses_raw"].get,
                )
                translated_wordnet = ", ".join(matches["lemma_raw"].to_list())
            except IndexError:
                # No usable row for this ILI: blank out BOTH reference fields
                # so a previous gap's gloss never leaks into this row.
                translated_wordnet = np.nan
                wordnet_gloss = np.nan

            writer.writerow(
                [
                    key,
                    ili,
                    ss.pos,
                    tree_pwn,
                    gap,
                    deepL_direct,
                    deepL_contextualized,
                    translated_wordnet,
                    lemmas,
                    wordnet_gloss,
                    gloss,
                    transform_list(hypernyms),
                    transform_list(hyponyms),
                ]
            )
            seen.add(dedup_key)

  0%|          | 0/449 [00:00<?, ?it/s]

In [7]:
# Sum of the `gaps_to_fill` column across all entries of the paths file.
gaps_total = pd.read_json("trees_path/path_with_gaps.json")["gaps_to_fill"].sum()
f"{gaps_total} gaps should be translated."

'793 gaps should be translated.'