In [1]:
import wn
from tqdm.notebook import tqdm
from google.cloud import translate_v2 as translate


import logging

# Notebook-wide logger; INFO and above is echoed to the console.
logger = logging.getLogger("wordnet_translator")
logger.setLevel(logging.INFO)

# Console handler with a timestamped "time - name - level - message" layout.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.INFO)
consoleHandler.setFormatter(formatter)

# `logging.getLogger` hands back the same logger object on every call, so
# re-running this cell would stack one more handler and print every message
# once per run. Attach the handler only when none is present yet.
if not logger.handlers:
    logger.addHandler(consoleHandler)

In [2]:
# Fetch the "pwn:3.1" lexicon into the local `wn` data store. Per the captured
# output below, a cached download is reused and an already-added lexicon is skipped.
wn.download("pwn:3.1")

[KCached file found: /home/mars/.wn_data/downloads/49903b7403676be7b2b7463448ce7a9699599f09
[KSkipping pwn:3.1 (Princeton WordNet 3.1); already added



PosixPath('/home/mars/.wn_data/downloads/49903b7403676be7b2b7463448ce7a9699599f09')

In [3]:
# Handle on the lexicon that every later cell queries.
pwn = wn.Wordnet("pwn:3.1")

# One row per part-of-speech tag: tag, synset count, root count, leaf count.
for pos in tqdm(wn.constants.PARTS_OF_SPEECH):
    synset_count = len(pwn.synsets(pos=pos))
    root_count = len(wn.taxonomy.roots(pwn, pos=pos))
    leaf_count = len(wn.taxonomy.leaves(pwn, pos=pos))
    print(pos, synset_count, root_count, leaf_count)

  0%|          | 0/10 [00:00<?, ?it/s]

a 7468 18185 18185
r 3625 3625 3625
n 82192 1 65031
p 0 0 0
c 0 0 0
x 0 0 0
t 0 0 0
s 10717 18185 18185
u 0 0 0
v 13789 566 10466


In [4]:
from collections import Counter, defaultdict

# synset id -> number of (noun leaf, hypernym path) pairs whose path contains it.
synset_cardinality = Counter()
# synset id -> smallest position seen on any reversed path; 1000000 means "not seen yet".
synset_level = defaultdict(lambda: 1000000)

for leaf in tqdm(wn.taxonomy.leaves(pwn, pos="n")):
    for ancestors in leaf.hypernym_paths():
        # Reversed in place; presumably this puts the root first (the level
        # histogram further down shows exactly one id at level 0).
        ancestors.reverse()

        synset_cardinality.update(node.id for node in ancestors)

        for depth, node in enumerate(ancestors):
            if depth < synset_level[node.id]:
                synset_level[node.id] = depth

        # The leaf itself is not part of the path, so it sits one step past its end.
        leaf_depth = len(ancestors)
        if leaf_depth < synset_level[leaf.id]:
            synset_level[leaf.id] = leaf_depth

  0%|          | 0/65031 [00:00<?, ?it/s]

In [5]:
# Show the 20 synset ids with the highest counts in `synset_cardinality`.
synset_cardinality.most_common(20)

[('pwn-00001740-n', 89366),
 ('pwn-00001930-n', 55400),
 ('pwn-00002684-n', 34073),
 ('pwn-00002137-n', 33958),
 ('pwn-00003553-n', 28554),
 ('pwn-00007846-n', 20650),
 ('pwn-00004258-n', 18149),
 ('pwn-00004475-n', 18031),
 ('pwn-00007347-n', 11371),
 ('pwn-00023280-n', 10538),
 ('pwn-00022119-n', 9341),
 ('pwn-00031563-n', 7699),
 ('pwn-00029677-n', 6953),
 ('pwn-00021007-n', 6427),
 ('pwn-00019793-n', 6094),
 ('pwn-00024444-n', 5511),
 ('pwn-07957410-n', 5381),
 ('pwn-08008892-n', 5336),
 ('pwn-00030657-n', 5293),
 ('pwn-00027365-n', 4706)]

In [6]:
# Histogram of `synset_level`: how many synset ids were recorded at each level value.
Counter(synset_level.values())

Counter({0: 1,
         1: 3,
         2: 22,
         3: 228,
         4: 2023,
         5: 6282,
         6: 12345,
         7: 18881,
         8: 14151,
         9: 11028,
         10: 7209,
         11: 4270,
         12: 2524,
         13: 1394,
         14: 847,
         15: 449,
         16: 341,
         17: 164,
         18: 30})

In [7]:
# Print "<synset id> <count> <shortest hypernym path length>" in descending count
# order, stopping right after the first row whose count drops below 100.
for synset_id, path_count in synset_cardinality.most_common():
    paths = pwn.synset(synset_id).hypernym_paths()
    # A synset without hypernym paths is reported at level 0.
    shortest = min(len(path) for path in paths) if paths else 0
    print(synset_id, path_count, shortest)
    if path_count < 100:
        break
        

pwn-00001740-n 89366 0
pwn-00001930-n 55400 1
pwn-00002684-n 34073 2
pwn-00002137-n 33958 1
pwn-00003553-n 28554 3
pwn-00007846-n 20650 3
pwn-00004258-n 18149 4
pwn-00004475-n 18031 5
pwn-00007347-n 11371 2
pwn-00023280-n 10538 2
pwn-00022119-n 9341 4
pwn-00031563-n 7699 2
pwn-00029677-n 6953 3
pwn-00021007-n 6427 2
pwn-00019793-n 6094 3
pwn-00024444-n 5511 2
pwn-07957410-n 5381 3
pwn-08008892-n 5336 4
pwn-00030657-n 5293 4
pwn-00027365-n 4706 3
pwn-03580409-n 4658 5
pwn-14604877-n 4368 4
pwn-00032220-n 4320 2
pwn-00033319-n 3919 2
pwn-00017402-n 3909 6
pwn-08648560-n 3739 4
pwn-13104346-n 3706 7
pwn-08125938-n 3571 5
pwn-00023451-n 3555 3
pwn-13831419-n 3279 3
pwn-00015568-n 3233 6
pwn-00024900-n 2956 3
pwn-00408356-n 2638 5
pwn-08643858-n 2402 7
pwn-01468898-n 2362 7
pwn-01474323-n 2356 8
pwn-00020270-n 2297 3
pwn-00002452-n 2287 2
pwn-09633690-n 2230 4
pwn-14831008-n 2200 5
pwn-03187746-n 2150 6
pwn-14842408-n 2026 6
pwn-00033914-n 1949 2
pwn-13943868-n 1945 4
pwn-09655706-n 1860 4


pwn-04572661-n 147 8
pwn-04537861-n 147 9
pwn-11566165-n 146 9
pwn-08671935-n 145 7
pwn-05952149-n 144 6
pwn-04173902-n 144 7
pwn-12122387-n 144 9
pwn-00524569-n 142 7
pwn-07085982-n 142 4
pwn-02397129-n 142 12
pwn-09280855-n 142 5
pwn-01507078-n 142 6
pwn-03806455-n 142 7
pwn-15016494-n 142 6
pwn-14724906-n 142 6
pwn-09824898-n 142 5
pwn-13155061-n 142 8
pwn-14359944-n 142 7
pwn-03302664-n 140 6
pwn-09758057-n 140 5
pwn-10078585-n 140 7
pwn-13129421-n 140 10
pwn-06954406-n 139 5
pwn-05081943-n 138 3
pwn-07723196-n 138 6
pwn-13129600-n 138 11
pwn-14396987-n 136 5
pwn-09657682-n 136 4
pwn-11612597-n 136 6
pwn-01026334-n 135 7
pwn-03742461-n 135 8
pwn-14779256-n 135 6
pwn-00672594-n 134 8
pwn-12122650-n 133 10
pwn-01183965-n 132 5
pwn-14816873-n 132 6
pwn-04854001-n 132 4
pwn-11445694-n 132 6
pwn-09901459-n 132 6
pwn-14918632-n 132 7
pwn-00376871-n 131 7
pwn-04957303-n 131 4
pwn-05097645-n 130 4
pwn-03494245-n 130 8
pwn-07385893-n 130 5
pwn-10725264-n 130 6
pwn-06046810-n 129 9
pwn-07324

In [8]:
# Ids of synsets recorded at level 11 or shallower whose cardinality count exceeds 2.
res = {
    synset_id
    for synset_id, level in synset_level.items()
    if level <= 11 and synset_cardinality[synset_id] > 2
}

print(len(res))

# NOTE(review): the original comment here read "IDEALNO" (transliterated
# "ideally") — presumably it marks this count as the desired selection size; confirm.

8680


In [9]:
import pymongo

# MongoDB handles used by the rest of the notebook: database "wordnet", collection "tasks".
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client["wordnet"]
collection = db["tasks"]

In [15]:
from collections import defaultdict

def populate_tasks_in_mongo(lexicon="pwn:3.1", filter_func=None):
    """Upsert one task document per accepted synset into the Mongo ``collection``.

    Each document is keyed by the synset id and carries the fields ``ili``,
    ``pos``, ``words`` (word id -> lemma) and ``definition`` (the definition
    text split on ";" with every part stripped). Existing documents only have
    these fields overwritten (``$set``), so other stored keys are left alone.

    Args:
        lexicon: lexicon specifier passed to ``wn.synsets``.
        filter_func: optional predicate called with each synset; only synsets
            for which it returns a truthy value are written. ``None`` accepts
            every synset.

    Side effects:
        Writes to the module-level ``collection`` and logs a created/updated
        summary through ``logger``.
    """
    tasks_created = 0
    tasks_updated = 0

    for synset in tqdm(wn.synsets(lexicon=lexicon)):
        if filter_func is not None and not filter_func(synset):
            continue

        # With the accept-all default, a synset may have no definition;
        # store an empty list instead of crashing on `None.split`.
        definition = synset.definition()
        definition_parts = (
            [part.strip() for part in definition.split(";")] if definition else []
        )

        # Guard against a synset that has no ILI attached — TODO confirm
        # whether that can happen for the lexicons used here.
        ili = synset.ili

        res = collection.update_one(
            {"_id": synset.id},
            {
                "$set": {
                    "ili": ili.id if ili is not None else None,
                    "pos": synset.pos,
                    "words": {w.id: w.lemma() for w in synset.words()},
                    "definition": definition_parts,
                }
            },
            upsert=True,
        )

        # `upserted_id` is only set when the upsert inserted a new document.
        if res.upserted_id:
            tasks_created += 1
        else:
            tasks_updated += 1

    logger.info(
        f"{tasks_created} was created and {tasks_updated} was updated from '{lexicon}'"
    )

    
def filter_only_big_synsets_with_description(synset):
    """Accept synsets that have at least one lemma and a truthy definition.

    Returns ``False`` when the synset has no lemmas; otherwise returns the
    definition itself, which callers use for its truthiness.
    """
    if len(synset.lemmas()) < 1:
        return False
    return synset.definition()

# Remaining quota per part of speech and lemma count, consumed by
# `filter_to_have_a_nice_sample`: one synset for each of "a"/"v"/"n" with 1..4 lemmas.
export_samples = defaultdict(lambda: defaultdict(int))

for pos in ("a", "v", "n"):
    export_samples[pos].update(dict.fromkeys(range(1, 5), 1))


def filter_to_have_a_nice_sample(synset):
    """Accept a synset while quota remains for its (pos, lemma count) bucket.

    Synsets without a definition are always rejected. Accepting a synset
    decrements the matching counter in the module-level ``export_samples``,
    so the filter is stateful across calls.
    """
    if not synset.definition():
        return False

    quota = export_samples[synset.pos]
    bucket = len(synset.lemmas())

    if quota[bucket] <= 0:
        return False

    quota[bucket] -= 1
    return True


def filter_top_level_and_big(synset):
    """Accept synsets whose recorded level in ``synset_level`` is at most 15.

    Ids missing from ``synset_level`` are treated as level 1000 and rejected.
    """
    return synset_level.get(synset.id, 1000) <= 15

# Fill the Mongo collection using the "at least one lemma and a definition" filter.
# The commented-out calls are the alternative filters defined earlier in this cell.
populate_tasks_in_mongo(filter_func=filter_only_big_synsets_with_description)
# populate_tasks_in_mongo(filter_func=filter_to_have_a_nice_sample)
# populate_tasks_in_mongo(filter_func=filter_top_level_and_big)

  0%|          | 0/117791 [00:00<?, ?it/s]

2021-10-31 21:19:24,096 - wordnet_translator - INFO - 21965 was created and 95826 was updated from 'pwn:3.1'


In [16]:
import requests, uuid, json
from urllib.parse import urljoin

class BingTranslationException(Exception):
    """Raised when a Bing translation request fails or yields no usable result."""

class BingTranslator:
    """Small client for the translate and dictionary-lookup REST endpoints.

    The JSON file given as ``key_file`` is loaded verbatim and used as the
    base HTTP headers of every request (presumably it holds the subscription
    key headers — confirm against the key files in use).
    """

    translate_path = '/translate'
    dictionary_lookup_path = '/dictionary/lookup'

    def __init__(self, key_file, endpoint="https://api.cognitive.microsofttranslator.com", timeout=30):
        """Load the request headers from ``key_file``.

        Args:
            key_file: path to a JSON file whose content becomes the request headers.
            endpoint: base URL the endpoint paths are joined onto.
            timeout: seconds to wait for the HTTP response, so a stalled
                connection cannot hang a long batch run forever.
        """
        self.endpoint = endpoint
        self.timeout = timeout

        with open(key_file) as fp:
            self.headers = json.load(fp)

    def _get_headers(self):
        """Return a copy of the base headers plus a fresh ``X-ClientTraceId``."""
        headers = self.headers.copy()
        headers['X-ClientTraceId'] = str(uuid.uuid4())

        return headers

    def _request(self, path, phrase, source_language="en", target_language="uk"):
        """POST ``phrase`` to ``path`` and return the decoded JSON response.

        Raises:
            BingTranslationException: when the body is not JSON, when the
                decoded body contains an ``"error"`` entry, or when the HTTP
                status is not 200.
        """
        constructed_url = urljoin(self.endpoint, path)

        params = {
            'api-version': '3.0',
            "from": source_language,
            'to': target_language
        }

        body = [{
            'text': phrase
        }]

        http_response = requests.post(
            constructed_url,
            params=params,
            headers=self._get_headers(),
            json=body,
            timeout=self.timeout,
        )

        try:
            response = http_response.json()
        except ValueError as exc:
            # ValueError is the common base of json.JSONDecodeError and the
            # decode errors raised by other JSON backends `requests` may use.
            raise BingTranslationException(
                f"Cannot translate phrase '{phrase}' cannot parse the response as json"
            ) from exc

        if "error" in response:
            raise BingTranslationException(f"Cannot translate phrase '{phrase}' because of an error: {response['error']}")

        if http_response.status_code != 200:
            raise BingTranslationException(f"Cannot translate phrase '{phrase}', status code was {http_response.status_code}")

        return response

    def translate(self, phrase, source_language="en", target_language="uk"):
        """Return the text of the first translation found in the response.

        Raises:
            BingTranslationException: when the response holds no translation.
        """
        response = self._request(self.translate_path, phrase, source_language, target_language)

        for entry in response:
            for translation in entry.get("translations", []):
                return translation["text"]

        raise BingTranslationException(f"Cannot find a translation for a phrase '{phrase}'")

    def dictionary_lookup(self, word, source_language="en", target_language="uk"):
        """Return the ``translations`` list of the first response entry.

        Raises:
            BingTranslationException: when the response has no entries.
        """
        response = self._request(self.dictionary_lookup_path, word, source_language, target_language)

        for entry in response:
            return entry.get("translations", [])

        # The original message interpolated an undefined name (`phrase`),
        # which turned this error path into a NameError.
        raise BingTranslationException(f"Cannot find a translation for a phrase '{word}'")

In [18]:
from time import sleep
import itertools
import re
import html
from collections import Counter


def sliding_window(iterable, n=2):
    """Return an iterator of overlapping ``n``-tuples over ``iterable``.

    ``sliding_window("abcd", 3)`` yields ``("a", "b", "c")`` and
    ``("b", "c", "d")``; an input shorter than ``n`` yields nothing.
    """
    branches = itertools.tee(iterable, n)

    # Advance the k-th copy by k items so the copies are staggered by one
    # position each: every pass moves all copies from `step` onwards forward once.
    for step in range(1, n):
        for branch in branches[step:]:
            next(branch, None)

    return zip(*branches)


class AbstractTranslator:
    """Interface shared by all translators in this notebook.

    Stores the language pair; every other method must be provided by a subclass.
    """

    def __init__(self, source_language="en", target_language="uk"):
        """Remember the source and target language codes."""
        self.source_language = source_language
        self.target_language = target_language

    def generate_samples(self, task):
        """Build the texts to translate for ``task`` (subclass responsibility)."""
        raise NotImplementedError

    def translate(self, task, sleep_between_samples=1):
        """Translate ``task`` and return parsed results (subclass responsibility)."""
        raise NotImplementedError

    def parse_results(self, task, results):
        """Turn raw service responses into a result document (subclass responsibility)."""
        raise NotImplementedError

    def method_id(self):
        """Return a string identifying the translator and its settings (subclass responsibility)."""
        raise NotImplementedError


class AbstractSlidingWindowTranslator(AbstractTranslator):
    """Base for translators that send "<lemmas>: <definition>" prompts.

    The lemmas of a task are grouped into overlapping windows of ``group_by``
    items, optionally decorated (quotes, "to"/"the", a final "or"), followed
    by the first definition part. Subclasses provide ``translate`` and
    ``_unwrap_results``; this class builds the prompts and parses the
    translated text back into terms and definitions.
    """

    def __init__(
        self,
        group_by=3,
        add_or=True,
        add_quotes=True,
        combine_in_one=True,
        add_aux_words=True,
        source_language="en",
        target_language="uk",
    ):
        """Store the prompt-shaping options.

        Args:
            group_by: window size used to group lemmas.
            add_or: join the last lemma of a window with " or " instead of ", ".
            add_quotes: wrap every lemma in double quotes.
            combine_in_one: join all prompts of a task into a single sample.
            add_aux_words: prefix verbs with "to " and nouns with "the ".
            source_language: source language code.
            target_language: target language code.
        """
        super().__init__(source_language=source_language, target_language=target_language)

        self.group_by = group_by
        self.add_or = add_or
        self.add_quotes = add_quotes
        self.combine_in_one = combine_in_one
        self.add_aux_words = add_aux_words

    def _unwrap_results(self, response):
        """Extract the translated text from one raw service response (subclass responsibility)."""
        raise NotImplementedError()

    def method_id(self):
        """Return the class name with all prompt options.

        The driver loop uses this string as a key under ``results`` in Mongo,
        so its format must stay stable.
        """
        return f"{type(self).__name__}(group_by={self.group_by},add_or={self.add_or},add_quotes={self.add_quotes},combine_in_one={self.combine_in_one},add_aux_words={self.add_aux_words})"

    def generate_samples(self, task):
        """Build the prompt texts for ``task``.

        Args:
            task: mapping with ``words`` (id -> lemma), ``pos`` and
                ``definition`` (list of definition parts, may be empty).

        Returns:
            dict with ``samples`` (list of prompt strings; a single combined
            string when ``combine_in_one`` is set) and ``total_lemmas`` (sum
            of window sizes, so overlapping lemmas are counted repeatedly).
        """
        samples = []
        total_samples = 0
        words = list(task["words"].values())

        if self.add_aux_words:
            if task["pos"] == "v":
                words = [f"to {w}" for w in words]
            elif task["pos"] == "n":
                words = [f"the {w}" for w in words]

        if self.add_quotes:
            words = [f'"{w}"' for w in words]

        # Fewer lemmas than the window size: use them all as one window,
        # because sliding_window would yield nothing.
        if len(words) < self.group_by:
            chunks = [words]
        else:
            chunks = sliding_window(words, self.group_by)

        for chunk in chunks:
            total_samples += len(chunk)

            if self.add_or and len(chunk) > 1:
                lemmas = ", ".join(chunk[:-1]) + f" or {chunk[-1]}"
            else:
                lemmas = ", ".join(chunk)

            if task["definition"]:
                samples.append(f"{lemmas}: {task['definition'][0]}")
            else:
                samples.append(lemmas)

        if self.combine_in_one:
            return {"samples": ["<br/>\n\n".join(samples)], "total_lemmas": total_samples}
        else:
            return {"samples": samples, "total_lemmas": total_samples}

    def estimate_tasks(self, tasks, price_per_mb=1.0 / 1024 / 1024):
        """Return total prompt length in characters / 1024 / 1024, times ``price_per_mb``."""
        total_len = 0
        for task in tasks:
            samples = self.generate_samples(task)["samples"]
            for sample in samples:
                total_len += len(sample)

        return (float(total_len) / 1024 / 1024) * price_per_mb

    def parse_results(self, task, results):
        """Aggregate the raw responses of one task into a result document.

        Returns:
            dict with ``raw`` (per-response parse output), ``terms`` and
            ``definitions`` (``most_common()`` lists over all responses),
            ``raw_translations`` (unwrapped texts) and ``type`` ("translator").
        """
        terms = Counter()
        definitions = Counter()
        raw_translations = []
        parsed_results = []

        for r in results:
            answer = self._unwrap_results(r)
            parsed = self._parse_result(task, answer)
            terms.update(parsed["all_terms"])
            definitions.update(parsed["all_definitions"])
            raw_translations.append(answer)
            parsed_results.append(parsed)

        return {
            "raw": parsed_results,
            "terms": terms.most_common(),
            "definitions": definitions.most_common(),
            "raw_translations": raw_translations,
            "type": "translator",
        }

    def _parse_result(self, task, result):
        """Split one translated text into terms and definitions.

        Every non-empty line is cut at the first separator found (colon, then
        the dash variants, tried in that order) into a comma-separated term
        list and a definition. Lines without a separator are skipped with a
        warning.

        Returns:
            dict with ``all_terms`` and ``all_definitions`` lists.
        """
        all_terms = []
        all_definitions = []
        for line in filter(None, result.replace("<br/>", "\n").split("\n")):
            for separator in [":", "–", "-", "—"]:
                if separator in line:
                    terms, definition = line.split(separator, 1)
                    break
            else:
                logger.warning(f"Cannot find a colon or dash in the translated text for task {task['_id']}")
                continue

            terms = list(map(str.strip, terms.split(",")))

            if self.add_or:
                # Undo the " or " that generate_samples put before the last
                # lemma; the first listed rendering that splits the term wins.
                for or_word in ["чи то", "чи", "або", "альбо", "or"]:
                    # Raw f-string: `\s` must reach the regex engine unchanged
                    # (in a plain string it is an invalid escape sequence).
                    splits = re.split(rf"[,\s]+{or_word}[,\s]+", terms[-1], flags=re.I)
                    if len(splits) > 1:
                        terms = terms[:-1] + [part.strip(", ") for part in splits]
                        break
                else:
                    if self.group_by > 1 and len(task["words"]) > 1:
                        logger.warning(f"Cannot find 'or' in the last chunk for task {task['_id']}")

            if self.add_quotes:
                terms = [term.strip('"\'"«»') for term in terms]

            all_terms += terms
            all_definitions.append(definition.strip())

        return {"all_terms": all_terms, "all_definitions": all_definitions}


class SlidingWindowGoogleTranslator(AbstractSlidingWindowTranslator):
    """Sliding-window translator backed by the Google Cloud Translation client."""

    def __init__(
        self,
        gcloud_credentials,
        group_by=3,
        add_or=True,
        add_quotes=True,
        combine_in_one=True,
        add_aux_words=True,
        source_language="en",
        target_language="uk",
    ):
        """Create the Google client from a service-account JSON file, then store the prompt options."""
        self.gtrans_client = translate.Client.from_service_account_json(gcloud_credentials)

        options = dict(
            group_by=group_by,
            add_or=add_or,
            add_quotes=add_quotes,
            combine_in_one=combine_in_one,
            add_aux_words=add_aux_words,
            source_language=source_language,
            target_language=target_language,
        )
        super().__init__(**options)

    def translate(self, task, sleep_between_samples=1):
        """Translate every sample of ``task``, pausing after each request, and parse the responses."""
        responses = []
        for text in self.generate_samples(task)["samples"]:
            response = self.gtrans_client.translate(
                text,
                source_language=self.source_language,
                target_language=self.target_language,
            )
            responses.append(response)
            sleep(sleep_between_samples)

        return self.parse_results(task, responses)

    def _unwrap_results(self, response):
        """Return the HTML-unescaped ``translatedText`` of a response ("" when absent)."""
        translated = response.get("translatedText", "")
        return html.unescape(translated)

    def estimate_tasks(self, tasks, price_per_mb=20):
        """Same estimate as the base class, with a default ``price_per_mb`` of 20."""
        return super().estimate_tasks(tasks, price_per_mb)


class SlidingWindowBingTranslator(AbstractSlidingWindowTranslator):
    """Sliding-window translator backed by ``BingTranslator.translate``."""

    def __init__(
        self,
        bing_apikey,
        group_by=3,
        add_or=True,
        add_quotes=True,
        combine_in_one=True,
        add_aux_words=True,
        source_language="en",
        target_language="uk",
    ):
        """Create the Bing client from the key file path, then store the prompt options."""
        self.bing_apikey = bing_apikey
        self.bing_translator = BingTranslator(self.bing_apikey)

        options = dict(
            group_by=group_by,
            add_or=add_or,
            add_quotes=add_quotes,
            combine_in_one=combine_in_one,
            add_aux_words=add_aux_words,
            source_language=source_language,
            target_language=target_language,
        )
        super().__init__(**options)

    def estimate_tasks(self, tasks, price_per_mb=10):
        """Same estimate as the base class, with a default ``price_per_mb`` of 10."""
        return super().estimate_tasks(tasks, price_per_mb)

    def _unwrap_results(self, response):
        """Return the HTML-unescaped translated text (the response is already a string)."""
        return html.unescape(response)

    def translate(self, task, sleep_between_samples=1):
        """Translate every sample of ``task``, pausing after each request, and parse the responses."""
        responses = []
        for text in self.generate_samples(task)["samples"]:
            translated = self.bing_translator.translate(
                text,
                source_language=self.source_language,
                target_language=self.target_language,
            )
            responses.append(translated)
            sleep(sleep_between_samples)

        return self.parse_results(task, responses)


class AbstractDictionaryTranslator(AbstractTranslator):
    """Base for dictionary-style translators that look up each lemma on its own."""

    def generate_samples(self, task):
        """Return the task's lemmas as samples, together with their count and the task's POS."""
        lemmas = task["words"]
        return {
            "samples": list(lemmas.values()),
            "total_lemmas": len(lemmas),
            "pos": task["pos"],
        }


class DictionaryBingTranslator(AbstractDictionaryTranslator):
    """Dictionary translator that looks up every lemma via ``BingTranslator.dictionary_lookup``."""

    def __init__(
        self,
        bing_apikey,
        source_language="en",
        target_language="uk",
    ):
        """Create the Bing client from the key file path and store the language pair."""
        self.bing_apikey = bing_apikey
        self.bing_translator = BingTranslator(self.bing_apikey)

        super().__init__(
            source_language=source_language,
            target_language=target_language,
        )

    # Part-of-speech notes (presumably WordNet tag -> dictionary POS tag):
    #   tags in play: ["a", "n", "r", "s", "v"]
    #   a  ADJ
    #   r  ADV
    #   c  CONJ
    #   n  NOUN
    #   v  VERB
    #   x  OTHER
    # Tags listed without a WordNet counterpart: DET, MODAL, PREP, PRON
    #
    # Mariana Romanyshyn, [12 Oct 2021, 09:18:00]:
    # "Yes, in this case adposition means preposition. Depending on the
    # language, adpositions can stand before the noun (preposition) or after
    # the noun (postposition). The term adposition covers both."
    #
    # "s" can also be mapped to ADJ.

    def translate(self, task, sleep_between_samples=1):
        """Look up every lemma of ``task``, pausing after each request, and parse the responses."""
        results = []
        sampled = self.generate_samples(task)
        for sample in sampled["samples"]:
            results.append(
                self.bing_translator.dictionary_lookup(
                    sample,
                    source_language=self.source_language,
                    target_language=self.target_language,
                )
            )
            sleep(sleep_between_samples)

        return self.parse_results(task, results)

    def parse_results(self, task=None, results=None):
        """Count the ``normalizedTarget`` values found in the lookup responses.

        Accepts both the base-class call form ``parse_results(task, results)``
        and the older single-argument form ``parse_results(results)``.

        Args:
            task: the task the results belong to (unused), or the results
                list when called with a single argument.
            results: one entry per looked-up lemma; each entry is the list of
                translation dicts returned by ``dictionary_lookup``.

        Returns:
            dict with ``raw`` (the responses as given), ``terms``
            (``most_common()`` list of normalized targets), ``definitions``
            (always empty) and ``type`` ("dictionary").
        """
        if results is None:
            # Legacy single-argument call: the results arrived in `task`.
            if isinstance(task, (list, tuple)):
                task, results = None, task
            else:
                results = []

        terms = Counter()
        parsed_results = []

        for lookup in results:
            # A lookup is a list of translation dicts; a bare dict is tolerated.
            # Counting `terms[target] += 1` (not `update(target)`) keeps the
            # whole word as the key instead of counting its characters.
            candidates = [lookup] if isinstance(lookup, dict) else lookup
            for candidate in candidates:
                if isinstance(candidate, dict) and candidate.get("normalizedTarget"):
                    terms[candidate["normalizedTarget"]] += 1
            parsed_results.append(lookup)

        return {
            "raw": parsed_results,
            "terms": terms.most_common(),
            "definitions": [],
            "type": "dictionary",
        }

    def method_id(self):
        """Return the class name followed by "()" (this translator has no options)."""
        return f"{type(self).__name__}()"

# Credential files for the two translation services.
_GOOGLE_CREDENTIALS = "../api_keys/beaming-source-330617-f93c630102d5.json"
_BING_CREDENTIALS = "../api_keys/khrystyna_skopyk_gmail_com.json"

# Every service is run with window sizes 1 and 3, in this order:
# Google(1), Google(3), Bing(1), Bing(3).
translators = [
    translator_cls(credentials, group_by=window)
    for translator_cls, credentials in (
        (SlidingWindowGoogleTranslator, _GOOGLE_CREDENTIALS),
        (SlidingWindowBingTranslator, _BING_CREDENTIALS),
    )
    for window in (1, 3)
]
# Commented out in the original list:
#     DictionaryBingTranslator("../api_keys/khrystyna_skopyk_gmail_com.json"),

# tasks = list(collection.find(
#     {
#         "_id": {
#             "$in": [
#                 # VERBS
#                 "pwn-00006238-v",
#                 "pwn-00009140-v",
#                 "pwn-00014735-v",
# #                 "pwn-00018151-v",
# #                 "pwn-00022309-v",
# #                 "pwn-00023466-v",
# #                 "pwn-00050369-v",
# #                 "pwn-00056644-v",
# #                 "pwn-00058790-v",
# #                 "pwn-00067045-v",
                
#                 # NOUNS:
#                 "pwn-00109001-n",
#                 "pwn-00284945-n",
#                 "pwn-00224850-n",
#                 # ADJS:
#                 "pwn-00102561-a",
#             ]
#         }
#     }
# ))

# Only tasks that have no "results" field at all are loaded.
tasks = list(collection.find({"results": {"$exists": 0}}))

for translator in tqdm(translators, desc="Applying translators"):
    method = translator.method_id()

    for t in tqdm(tasks, desc=f"Processing tasks of {method[:50]}"):
        # Skip tasks that already carry a result for this exact configuration.
        if method in t.get("results", {}):
            continue

        res = translator.translate(t, sleep_between_samples=0.33)
        collection.update_one(
            {"_id": t["_id"]},
            {"$set": {f"results.{method}": res}},
            upsert=True,
        )

Applying translators:   0%|          | 0/4 [00:00<?, ?it/s]

Processing tasks of SlidingWindowGoogleTranslator(group_by=1,add_or=Tr: 0it [00:00, ?it/s]

Processing tasks of SlidingWindowGoogleTranslator(group_by=3,add_or=Tr: 0it [00:00, ?it/s]

Processing tasks of SlidingWindowBingTranslator(group_by=1,add_or=True: 0it [00:00, ?it/s]

Processing tasks of SlidingWindowBingTranslator(group_by=3,add_or=True: 0it [00:00, ?it/s]

In [None]:
# Synset ids to inspect by hand: print the prompts each translator would send for them.
broken_tasks = ["pwn-05404676-n", "pwn-10104404-n", "pwn-10104404-n", "pwn-10392072-n", "pwn-01468898-n"]

tasks = list(collection.find({"_id": {"$in": broken_tasks}}))

for translator in translators:
    method = translator.method_id()
    for t in tasks:
        prompts = translator.generate_samples(t)["samples"]
        print(method, "\n".join(prompts))

In [None]:
from csv import DictWriter

def render_counter(cnt):
    """Render a Counter as "key: count" lines, most common first."""
    lines = []
    for key, count in cnt.most_common():
        lines.append(f"{key}: {count}")
    return "\n".join(lines)

# Tasks that have at least one stored translation result.
answered = list(collection.find({"results": {"$exists": 1}}))

# All translator method ids that occur in any task's results.
methods = set().union(*(doc["results"].keys() for doc in answered))

# CSV header: fixed task columns, a Terms/Definitions pair per method
# (sorted by method id), then the combined columns.
columns = ["pwn", "lemmas", "pos", "definition"]
for method in sorted(methods):
    columns += [f"Terms, {method}", f"Definitions, {method}"]
columns += ["Terms combined", "Definitions combined"]


# Write one CSV row per answered task.
# newline="" is what the csv module requires for files it writes (the cells
# contain embedded "\n"); the explicit UTF-8 encoding keeps the non-ASCII
# translations intact regardless of the platform's default encoding.
with open("/tmp/translations.csv", "w", newline="", encoding="utf-8") as fp:
    w = DictWriter(fp, fieldnames=columns)

    w.writeheader()

    for t in answered:
        to_export = {
            "pwn": t["_id"],
            "definition": "\n".join(t["definition"]),
            "pos": t["pos"],
            "lemmas": "\n".join(t["words"].values()),
        }

        # Case-insensitive totals across all methods.
        combined_terms = Counter()
        combined_definitions = Counter()

        for method, r in t.get("results", {}).items():
            # Stored as lists of [text, count] pairs.
            terms = Counter(dict(r.get("terms", [])))
            definitions = Counter(dict(r.get("definitions", [])))

            # Add counts key by key: building a lower-cased dict first would
            # silently drop one of two entries that differ only by case.
            for term, count in terms.items():
                combined_terms[term.lower()] += count
            for definition, count in definitions.items():
                combined_definitions[definition.lower()] += count

            to_export[f"Terms, {method}"] = render_counter(terms)
            to_export[f"Definitions, {method}"] = render_counter(definitions)

        to_export["Terms combined"] = render_counter(combined_terms)
        to_export["Definitions combined"] = render_counter(combined_definitions)

        w.writerow(to_export)