In [1]:
import wn
from google.cloud import translate_v2 as translate


import logging

# Notebook-wide logger. `logging.getLogger` hands back the same object on every
# call, so when this cell is executed again the previously attached handlers are
# still there; they are removed first to avoid printing each message repeatedly.
logger = logging.getLogger("wordnet_translator")
logger.setLevel(logging.INFO)

# Message layout: timestamp - logger name - level - message
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Console handler that emits INFO and above using the formatter defined here
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.INFO)
consoleHandler.setFormatter(formatter)

# Drop handlers left over from an earlier run of this cell, then attach ours
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(consoleHandler)

In [2]:
# Download the "pwn:3.1" lexicon via the `wn` package (the cell output reports it as Princeton WordNet 3.1)
wn.download("pwn:3.1")

[KCached file found: /Users/dchaplinsky/.wn_data/downloads/49903b7403676be7b2b7463448ce7a9699599f09
[KSkipping pwn:3.1 (Princeton WordNet 3.1); already added/T/tmptm_4sxz6/pwn31/wn31.xml



PosixPath('/Users/dchaplinsky/.wn_data/downloads/49903b7403676be7b2b7463448ce7a9699599f09')

In [3]:
import pymongo

# Local MongoDB instance; translation tasks live in the `tasks` collection
# of the `wordnet` database.
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client["wordnet"]
collection = db.tasks

In [40]:
from tqdm.notebook import tqdm
from collections import defaultdict

def populate_tasks_in_mongo(lexicon="pwn:3.1", filter_func=None):
    """Upsert one task document per accepted synset into the `collection`.

    Each document is keyed by the synset id and stores the ILI id, the part of
    speech, a ``{word id: lemma}`` mapping and the definition split on ";".

    Args:
        lexicon: lexicon specifier passed to ``wn.synsets``.
        filter_func: optional callable ``synset -> truthy/falsy``; only synsets
            for which it returns a truthy value are stored. When omitted,
            every synset is accepted.

    Side effects:
        Writes to MongoDB and logs the created/updated counters at INFO level.
    """
    accept = filter_func if filter_func is not None else (lambda synset: True)

    tasks_created = 0
    tasks_updated = 0

    for synset in tqdm(wn.synsets(lexicon=lexicon)):
        if not accept(synset):
            continue

        # Both the ILI and the definition may be absent for a synset; store
        # None / an empty list instead of failing on attribute access.
        ili = synset.ili
        definition = synset.definition()

        res = collection.update_one(
            {"_id": synset.id},
            {
                "$set": {
                    "ili": ili.id if ili is not None else None,
                    "pos": synset.pos,
                    "words": {w.id: w.lemma() for w in synset.words()},
                    "definition": (
                        list(map(str.strip, definition.split(";")))
                        if definition
                        else []
                    ),
                }
            },
            upsert=True,
        )

        # `upserted_id` is only set when the document did not exist before
        if res.upserted_id:
            tasks_created += 1
        else:
            tasks_updated += 1

    logger.info(
        f"{tasks_created} was created and {tasks_updated} was updated from '{lexicon}'"
    )

    
def filter_only_big_synsets_with_description(synset):
    """Accept synsets that have exactly five lemmas and a non-empty definition.

    Returns ``False`` when the lemma count differs from five, otherwise the
    synset's definition itself (truthy only when it is non-empty).
    """
    if len(synset.lemmas()) != 5:
        return False

    return synset.definition()

# Remaining quota of synsets to export, indexed as [part of speech][lemma count].
# Combinations that are not listed default to 0.
export_samples = defaultdict(lambda: defaultdict(int))

# Two synsets for every listed part of speech with one to four lemmas
for pos in ("a", "v", "n"):
    for lemmas_count in (1, 2, 3, 4):
        export_samples[pos][lemmas_count] = 2


def filter_to_have_a_nice_sample(synset):
    """Accept a synset while the quota for its (pos, lemma count) pair lasts.

    Synsets without a definition are always rejected. Accepting a synset
    decrements the matching counter in the module-level ``export_samples``,
    so the function is stateful across calls.
    """
    global export_samples

    if synset.definition():
        remaining = export_samples[synset.pos]
        size = len(synset.lemmas())

        if remaining[size] > 0:
            remaining[size] -= 1
            return True

    return False

# Alternative selection (disabled): only synsets with exactly five lemmas and a definition
# populate_tasks_in_mongo(filter_func=filter_only_big_synsets_with_description)
# Active selection: a quota-limited sample per part of speech and lemma count
populate_tasks_in_mongo(filter_func=filter_to_have_a_nice_sample)

  0%|          | 0/117791 [00:00<?, ?it/s]

2021-10-12 14:11:44,756 - wordnet_translator - INFO - 0 was created and 24 was updated from 'pwn:3.1'


In [14]:
import requests, uuid, json
from urllib.parse import urljoin

class BingTranslationException(Exception):
    """Raised when a translator request fails or yields no usable result."""

class BingTranslator:
    """Small client for the Microsoft Translator REST API (api-version 3.0).

    Request headers are loaded from a JSON file.
    NOTE(review): presumably that file carries the subscription key headers —
    confirm against the actual key file.
    """

    translate_path = '/translate'
    dictionary_lookup_path = '/dictionary/lookup'

    def __init__(self, key_file, endpoint="https://api.cognitive.microsofttranslator.com"):
        """Remember the endpoint and read the base request headers from `key_file`."""
        self.endpoint = endpoint

        with open(key_file) as fp:
            self.headers = json.load(fp)

    def _get_headers(self):
        """Return a copy of the base headers with a fresh `X-ClientTraceId`."""
        headers = self.headers.copy()
        headers['X-ClientTraceId'] = str(uuid.uuid4())

        return headers

    def _request(self, path, phrase, source_language="en", target_language="uk"):
        """POST `phrase` to `path` on the endpoint and return the decoded JSON.

        Raises:
            BingTranslationException: when the body is not valid JSON, when the
                decoded body contains an "error" entry, or when the HTTP status
                code is not 200.
        """
        constructed_url = urljoin(self.endpoint, path)

        headers = self._get_headers()
        params = {
            'api-version': '3.0',
            "from": source_language,
            'to': target_language
        }

        body = [{
            'text': phrase
        }]

        request = requests.post(constructed_url, params=params, headers=headers, json=body)

        try:
            response = request.json()
        except ValueError as exc:
            # json.JSONDecodeError is a ValueError subclass; catching the base
            # class also covers decoders that raise their own ValueError subclass.
            raise BingTranslationException(
                f"Cannot translate phrase '{phrase}' cannot parse the response as json"
            ) from exc

        if "error" in response:
            raise BingTranslationException(f"Cannot translate phrase '{phrase}' because of an error: {response['error']}")

        if request.status_code != 200:
            raise BingTranslationException(f"Cannot translate phrase '{phrase}', status code was {request.status_code}")

        return response

    def translate(self, phrase, source_language="en", target_language="uk"):
        """Return the text of the first translation found in the response.

        Raises:
            BingTranslationException: when the response holds no translation.
        """
        response = self._request(self.translate_path, phrase, source_language, target_language)

        for item in response:
            for translation in item.get("translations", []):
                return translation["text"]

        raise BingTranslationException(f"Cannot find a translation for a phrase '{phrase}'")

    def dictionary_lookup(self, word, source_language="en", target_language="uk"):
        """Return the "translations" list of the first item in the lookup response.

        Raises:
            BingTranslationException: when the response is empty.
        """
        response = self._request(self.dictionary_lookup_path, word, source_language, target_language)

        for item in response:
            return item.get("translations", [])

        # The parameter is called `word` here; the message keeps the wording
        # used by `translate`.
        raise BingTranslationException(f"Cannot find a translation for a phrase '{word}'")

In [41]:
from time import sleep
import itertools
import re
import html
from collections import Counter


def sliding_window(iterable, n=2):
    """Return an iterator of overlapping ``n``-tuples taken from `iterable`.

    The input is duplicated ``n`` times; the k-th copy is advanced by k items
    and the copies are zipped, so iteration stops with the shortest copy.
    """
    copies = itertools.tee(iterable, n)

    for offset, copy in enumerate(copies):
        # Skip `offset` leading items; running off the end is tolerated
        for _ in itertools.repeat(None, offset):
            next(copy, None)

    return zip(*copies)


class AbstractTranslator:
    """Interface shared by the translators: language pair plus four hooks.

    Every hook raises ``NotImplementedError`` here and is meant to be
    overridden by subclasses.
    """

    def __init__(self, source_language="en", target_language="uk"):
        """Store the source and target language codes."""
        self.source_language, self.target_language = source_language, target_language

    def generate_samples(self, task):
        """Build the inputs to translate for `task`."""
        raise NotImplementedError

    def translate(self, task, sleep_between_samples=1):
        """Translate `task`."""
        raise NotImplementedError

    def parse_results(self, results):
        """Post-process raw translation `results`."""
        raise NotImplementedError

    def method_id(self):
        """Return an identifier of the translation method."""
        raise NotImplementedError


class AbstractSlidingWindowTranslator(AbstractTranslator):
    """Builds "lemma, lemma or lemma: definition" prompts and parses them back.

    The lemmas of a task are grouped with a sliding window of `group_by` items,
    optionally decorated (auxiliary words, quotes, a final "or"), followed by
    the first part of the definition. The same conventions are then used to
    split the translated text back into terms and definitions.
    """

    def __init__(
        self,
        group_by=3,
        add_or=True,
        add_quotes=True,
        combine_in_one=True,
        add_aux_words=True,
        source_language="en",
        target_language="uk",
    ):
        """Store the sample-generation options.

        Args:
            group_by: window size, i.e. number of lemmas per sample line.
            add_or: join the last lemma of a line with " or " instead of ", ".
            add_quotes: wrap every lemma in double quotes.
            combine_in_one: join all lines into one sample separated by "<br/>".
            add_aux_words: prefix verbs with "to " and nouns with "the ".
            source_language: language code of the input text.
            target_language: language code of the translation.
        """
        super().__init__(source_language=source_language, target_language=target_language)

        self.group_by = group_by
        self.add_or = add_or
        self.add_quotes = add_quotes
        self.combine_in_one = combine_in_one
        self.add_aux_words = add_aux_words

    def method_id(self):
        """Return the class name together with the options that shape the samples."""
        return f"{type(self).__name__}(group_by={self.group_by},add_or={self.add_or},add_quotes={self.add_quotes},combine_in_one={self.combine_in_one},add_aux_words={self.add_aux_words})"

    def generate_samples(self, task):
        """Turn a task into the text samples to be translated.

        Args:
            task: mapping with "words" (id -> lemma), "pos" and "definition"
                (list of definition parts; only the first one is used).

        Returns:
            dict with "samples" (list of strings) and "total_lemmas", the sum
            of the window sizes (a lemma appearing in several windows is
            counted once per window).
        """
        samples = []
        total_samples = 0
        words = list(task["words"].values())

        if self.add_aux_words:
            if task["pos"] == "v":
                words = [f"to {w}" for w in words]
            elif task["pos"] == "n":
                words = [f"the {w}" for w in words]

        if self.add_quotes:
            words = [f'"{w}"' for w in words]

        # Fewer lemmas than the window size: use them all as a single chunk
        if len(words) < self.group_by:
            chunks = [words]
        else:
            chunks = sliding_window(words, self.group_by)

        for chunk in chunks:
            total_samples += len(chunk)

            if self.add_or and len(chunk) > 1:
                lemmas = ", ".join(chunk[:-1]) + f" or {chunk[-1]}"
            else:
                lemmas = ", ".join(chunk)

            if task["definition"]:
                samples.append(f"{lemmas}: {task['definition'][0]}")
            else:
                samples.append(lemmas)

        if self.combine_in_one:
            return {"samples": ["<br/>\n\n".join(samples)], "total_lemmas": total_samples}
        else:
            return {"samples": samples, "total_lemmas": total_samples}

    def estimate_tasks(self, tasks, price_per_mb=1.0 / 1024 / 1024):
        """Estimate the cost of translating `tasks`.

        Sums the character length of all generated samples, converts it to
        units of 1024 * 1024 characters and multiplies by `price_per_mb`.
        """
        total_len = 0
        for task in tasks:
            samples = self.generate_samples(task)["samples"]
            for sample in samples:
                total_len += len(sample)

        return (float(total_len) / 1024 / 1024) * price_per_mb

    def _parse_result(self, result):
        """Split one translated text back into terms and definitions.

        Every non-empty line (with "<br/>" treated as a line break) is expected
        to look like "terms: definition"; lines without a colon are skipped
        with a warning.

        Returns:
            dict with "all_terms" and "all_definitions" lists.
        """
        all_terms = []
        all_definitions = []
        for line in filter(None, result.replace("<br/>", "\n").split("\n")):
            if ":" not in line:
                logger.warning("Cannot find a colon in the translated text")
                continue

            terms, definition = line.split(":", 1)
            terms = list(map(str.strip, terms.split(",")))

            if self.add_or:
                # The last comma-separated item still holds "<term> or <term>";
                # try the known spellings of the conjunction, longest first.
                for or_word in ["чи то", "чи", "або", "альбо", "or"]:
                    # Raw f-string so that \s reaches the regex engine untouched
                    splits = re.split(rf"[,\s]+{or_word}[,\s]+", terms[-1], flags=re.I)
                    if len(splits) > 1:
                        terms = terms[:-1] + list(map(lambda x: x.strip(", "), splits))
                        break
                else:
                    if self.group_by > 1:
                        logger.warning("Cannot find 'or' in the last chunk")

            if self.add_quotes:
                terms = [term.strip('"\'"«»') for term in terms]

            all_terms += terms
            all_definitions.append(definition.strip())

        return {"all_terms": all_terms, "all_definitions": all_definitions}


class SlidingWindowGoogleTranslator(AbstractSlidingWindowTranslator):
    """Sliding-window translator that sends its samples to Google Translate."""

    def __init__(
        self,
        gcloud_credentials,
        group_by=3,
        add_or=True,
        add_quotes=True,
        combine_in_one=True,
        add_aux_words=True,
        source_language="en",
        target_language="uk",
    ):
        """Create the Google client from a service-account JSON file, then store the options."""
        self.gtrans_client = translate.Client.from_service_account_json(gcloud_credentials)

        options = dict(
            group_by=group_by,
            add_or=add_or,
            add_quotes=add_quotes,
            combine_in_one=combine_in_one,
            add_aux_words=add_aux_words,
            source_language=source_language,
            target_language=target_language,
        )
        super().__init__(**options)

    def translate(self, task, sleep_between_samples=1):
        """Translate every sample of `task`, pausing after each request, and parse the answers."""
        responses = []

        for text in self.generate_samples(task)["samples"]:
            response = self.gtrans_client.translate(
                text,
                source_language=self.source_language,
                target_language=self.target_language,
            )
            responses.append(response)
            sleep(sleep_between_samples)

        return self.parse_results(responses)

    def parse_results(self, results):
        """Parse the "translatedText" of each response and tally terms and definitions."""
        term_counts = Counter()
        definition_counts = Counter()
        parsed_results = []

        for response in results:
            text = html.unescape(response.get("translatedText", ""))
            parsed = self._parse_result(text)
            parsed_results.append(parsed)
            term_counts.update(parsed["all_terms"])
            definition_counts.update(parsed["all_definitions"])

        return {
            "raw": parsed_results,
            "terms": term_counts.most_common(),
            "definitions": definition_counts.most_common(),
            "type": "translator",
        }

    def estimate_tasks(self, tasks, price_per_mb=20):
        """Same estimate as the parent class, with a default price of 20 per unit."""
        return super().estimate_tasks(tasks, price_per_mb)


class SlidingWindowBingTranslator(AbstractSlidingWindowTranslator):
    """Sliding-window translator that sends its samples through `BingTranslator`."""

    def __init__(
        self,
        bing_apikey,
        group_by=3,
        add_or=True,
        add_quotes=True,
        combine_in_one=True,
        add_aux_words=True,
        source_language="en",
        target_language="uk",
    ):
        """Create the Bing client from the key file `bing_apikey`, then store the options."""
        self.bing_apikey = bing_apikey
        self.bing_translator = BingTranslator(self.bing_apikey)

        options = dict(
            group_by=group_by,
            add_or=add_or,
            add_quotes=add_quotes,
            combine_in_one=combine_in_one,
            add_aux_words=add_aux_words,
            source_language=source_language,
            target_language=target_language,
        )
        super().__init__(**options)

    def estimate_tasks(self, tasks, price_per_mb=10):
        """Same estimate as the parent class, with a default price of 10 per unit."""
        return super().estimate_tasks(tasks, price_per_mb)

    def translate(self, task, sleep_between_samples=1):
        """Translate every sample of `task`, pausing after each request, and parse the answers."""
        translated = []

        for text in self.generate_samples(task)["samples"]:
            answer = self.bing_translator.translate(
                text,
                source_language=self.source_language,
                target_language=self.target_language,
            )
            translated.append(answer)
            sleep(sleep_between_samples)

        return self.parse_results(translated)

    def parse_results(self, results):
        """Parse each translated string and tally the terms and definitions found."""
        term_counts = Counter()
        definition_counts = Counter()
        parsed_results = []

        for text in results:
            parsed = self._parse_result(html.unescape(text))
            parsed_results.append(parsed)
            term_counts.update(parsed["all_terms"])
            definition_counts.update(parsed["all_definitions"])

        return {
            "raw": parsed_results,
            "terms": term_counts.most_common(),
            "definitions": definition_counts.most_common(),
            "type": "translator",
        }

class AbstractDictionaryTranslator(AbstractTranslator):
    """Base for dictionary-style translators: every lemma is its own sample."""

    def generate_samples(self, task):
        """Return the task's lemmas as samples, their count and the task's pos."""
        words = task["words"]

        return {
            "samples": list(words.values()),
            "total_lemmas": len(words),
            "pos": task["pos"],
        }


class DictionaryBingTranslator(AbstractDictionaryTranslator):
    """Translates every lemma of a task with the Bing dictionary lookup."""

    def __init__(
        self,
        bing_apikey,
        source_language="en",
        target_language="uk",
    ):
        """Create the Bing client from the key file `bing_apikey` and store the languages."""
        self.bing_apikey = bing_apikey
        self.bing_translator = BingTranslator(self.bing_apikey)

        super().__init__(
            source_language=source_language,
            target_language=target_language,
        )

    # Notes on part-of-speech tags (NOTE(review): presumably WordNet pos letters
    # versus the tags returned by the dictionary lookup; not used by the code yet):
    #     [ "a", "n", "r", "s", "v" ]
    # a ADJ
    # r ADV
    # c CONJ
    # n NOUN
    # v VERB
    # x OTHER
    #
    # DET
    # MODAL
    # PREP
    # PRON
    # Mariana Romanyshyn, [12 Oct 2021, 09:18:00]:
    # Yes, in this case adposition means preposition. In different languages
    # adpositions can stand before the noun (preposition) or after the noun
    # (postposition). The term adposition covers both.
    #
    # s can also be mapped to ADJ.

    def translate(self, task, sleep_between_samples=1):
        """Look up every lemma of `task`, pausing after each request, and parse the answers."""
        results = []
        sampled = self.generate_samples(task)
        for sample in sampled["samples"]:
            results.append(
                self.bing_translator.dictionary_lookup(
                    sample,
                    source_language=self.source_language,
                    target_language=self.target_language,
                )
            )
            sleep(sleep_between_samples)

        return self.parse_results(results)

    def parse_results(self, results):
        """Count the "normalizedTarget" of every translation in every lookup result.

        Args:
            results: one entry per looked-up lemma. Each entry is the list of
                translation dicts returned by ``dictionary_lookup``; a single
                translation dict is tolerated as well.

        Returns:
            dict with the raw entries, the term counts (most common first), an
            empty "definitions" list and the type marker "dictionary".
        """
        terms = Counter()
        parsed_results = []

        for r in results:
            translations = [r] if isinstance(r, dict) else r

            for translation in translations:
                target = translation.get("normalizedTarget")
                if target:
                    # Count the whole term; Counter.update(str) would count
                    # its individual characters instead.
                    terms[target] += 1

            parsed_results.append(r)

        return {
            "raw": parsed_results,
            "terms": terms.most_common(),
            "definitions": [],
            "type": "dictionary",
        }

    def method_id(self):
        """Return the class name followed by "()" (there are no tunable options)."""
        return f"{type(self).__name__}()"

# Credential files shared by several translator configurations
GOOGLE_CREDENTIALS = "../api_keys/dchaplynskyi_gmail_com.json"
BING_CREDENTIALS = "../api_keys/khrystyna_skopyk_gmail_com.json"

# Every configuration to run; each one is identified by its method_id()
translators = [
    SlidingWindowGoogleTranslator(GOOGLE_CREDENTIALS, group_by=1),
    SlidingWindowGoogleTranslator(GOOGLE_CREDENTIALS, group_by=3),
    SlidingWindowBingTranslator(BING_CREDENTIALS, group_by=1),
    SlidingWindowBingTranslator(BING_CREDENTIALS, group_by=3),
    DictionaryBingTranslator(BING_CREDENTIALS),
]

# tasks = list(collection.find(
#     {
#         "_id": {
#             "$in": [
#                 # VERBS
#                 "pwn-00006238-v",
#                 "pwn-00009140-v",
#                 "pwn-00014735-v",
# #                 "pwn-00018151-v",
# #                 "pwn-00022309-v",
# #                 "pwn-00023466-v",
# #                 "pwn-00050369-v",
# #                 "pwn-00056644-v",
# #                 "pwn-00058790-v",
# #                 "pwn-00067045-v",
                
#                 # NOUNS:
#                 "pwn-00109001-n",
#                 "pwn-00284945-n",
#                 "pwn-00224850-n",
#                 # ADJS:
#                 "pwn-00102561-a",
#             ]
#         }
#     }
# ))

# Load all tasks once, then run every translator over them
tasks = list(collection.find())

for translator in tqdm(translators):
    for t in tqdm(tasks):
        # Skip tasks that already hold a result for this translator configuration
        if translator.method_id() in t.get("results", {}):
            continue

        res = translator.translate(t)
        result_field = f"results.{translator.method_id()}"
        collection.update_one({"_id": t["_id"]}, {"$set": {result_field: res}}, upsert=True)


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [53]:
from csv import DictWriter

def render_counter(cnt):
    """Render a Counter as "key: count" lines, most common entries first."""
    lines = []
    for key, count in cnt.most_common():
        lines.append(f"{key}: {count}")

    return "\n".join(lines)

# Tasks that already carry at least one translation result
answered = list(collection.find({"results": {"$exists": 1}}))

# Every method id that occurs in any task
methods = set()
for l in answered:
    methods.update(l["results"].keys())

# Fixed columns first, then a terms/definitions pair per method, then the totals
columns = ["pwn", "lemmas", "pos", "definition"]

for method in sorted(methods):
    columns += [f"Terms, {method}", f"Definitions, {method}"]

columns += ["Terms combined", "Definitions combined"]


# Export one CSV row per answered task. The file is opened with newline="" as
# the csv module requires, and with an explicit UTF-8 encoding so non-ASCII
# translations are written regardless of the platform's default encoding.
with open("/tmp/translations.csv", "w", newline="", encoding="utf-8") as fp:
    w = DictWriter(fp, fieldnames=columns)

    w.writeheader()

    for t in answered:
        to_export = {
            "pwn": t["_id"],
            "definition": "\n".join(t["definition"]),
            "pos": t["pos"],
            "lemmas": "\n".join(t["words"].values()),
        }

        combined_terms = Counter()
        combined_definitions = Counter()

        for method, r in t.get("results", {}).items():
            # Stored as lists of (value, count) pairs; rebuild the counters
            terms = Counter(dict(r.get("terms", [])))
            definitions = Counter(dict(r.get("definitions", [])))

            # Accumulate case-insensitively. Adding entry by entry keeps the
            # counts of values that differ only by case instead of letting
            # one of them overwrite the other.
            for k, v in terms.items():
                combined_terms[k.lower()] += v
            for k, v in definitions.items():
                combined_definitions[k.lower()] += v

            to_export[f"Terms, {method}"] = render_counter(terms)
            to_export[f"Definitions, {method}"] = render_counter(definitions)

        to_export["Terms combined"] = render_counter(combined_terms)
        to_export["Definitions combined"] = render_counter(combined_definitions)

        w.writerow(to_export)