In [2]:
import os, sys

# Location of the Django project. Set LANGUK_PROJECT_ROOT to run the notebook
# on another machine without editing this cell; the original path is the default.
LANGUK_PROJECT_ROOT = os.environ.get(
    "LANGUK_PROJECT_ROOT",
    "/Users/dchaplinsky/Projects/lang-uk/lang.org.ua/languk/",
)

# Keep the project first on sys.path, but drop earlier copies so that re-running
# this cell does not keep growing the list.
sys.path[:] = [LANGUK_PROJECT_ROOT] + [
    entry for entry in sys.path if entry != LANGUK_PROJECT_ROOT
]

os.environ["DJANGO_SETTINGS_MODULE"] = "languk.settings.dev"
import django

# Configure Django for standalone use; the cells that follow import
# `django.conf.settings` and project modules.
django.setup()

In [3]:
import pymongo
from django.conf import settings
from corpus.mongodb import db
from corpus.udpipe_model import Model as UDPipeModel

In [4]:
# Instantiate the project's UDPipe wrapper with the model file configured in
# Django settings (UDPIPE_MODEL_FILE).
model = UDPipeModel(settings.UDPIPE_MODEL_FILE)

In [5]:
# Fetch a single document from the `fiction` collection to experiment with.
test_doc = db.fiction.find_one()

In [6]:
import logging
# Logger used by the helper functions defined in this notebook
# (`unpack_values`, `compare_article`).
logger = logging.getLogger("decompress")
logger.setLevel(logging.INFO)

from collections import OrderedDict

from corpus.ud_converter import (
    DECOMPRESS_UPOS_MAPPING,
    DECOMPRESS_FEATURES_MAPPING,
    DECOMPRESS_FEATURE_VALUES_MAPPING,
    grouper
)


def unpack_values(param_name, s):
    """Decode one compressed annotation layer into a list of sentences.

    ``s`` holds one sentence per line. For ``"ud_postags"`` every character of
    a line is treated as one compressed tag; for any other layer the items of
    a line are separated by single spaces.

    Args:
        param_name: Name of the layer. ``"ud_postags"`` and ``"ud_features"``
            are decoded through the ``DECOMPRESS_*`` mappings, any other name
            is passed through unchanged.
        s: Newline-separated compressed text of the layer.

    Returns:
        A list of sentences, each a list with one entry per word: a tag string
        for ``"ud_postags"``, an ``OrderedDict`` of category -> value for
        ``"ud_features"``, the raw string otherwise. Codes missing from the
        mappings are logged as warnings and replaced with ``"UNK"``.
    """

    def _decode_upos(v):
        # A single compressed tag -> its UPOS name.
        try:
            return DECOMPRESS_UPOS_MAPPING[v]
        except KeyError:
            logger.warning(
                f"Cannot find the upos '{v}' in the mapping, skipping it for now"
            )
            return "UNK"

    def _decode_features(v):
        # The packed string is consumed in (category, value) pairs.
        decoded = OrderedDict()

        for c_cat, c_val in grouper(v, 2):
            try:
                cat = DECOMPRESS_FEATURES_MAPPING[c_cat]
            except KeyError:
                logger.warning(
                    f"Cannot find the feature '{c_cat}' in the mapping, skipping it for now"
                )
                cat = "UNK"

            try:
                val = DECOMPRESS_FEATURE_VALUES_MAPPING[cat][c_val]
            except KeyError:
                logger.warning(
                    f"Cannot find the value '{c_val}' for the feature '{cat}' in the mapping, skipping it for now"
                )
                val = "UNK"

            decoded[cat] = val

        return decoded

    def _passthrough(v):
        return v

    # Pick the decoder once instead of re-checking the layer name per word.
    if param_name == "ud_postags":
        decode = _decode_upos
    elif param_name == "ud_features":
        decode = _decode_features
    else:
        decode = _passthrough

    sentences = []
    for line in s.split("\n"):
        # POS tags are iterated character by character; every other layer is
        # split on single spaces.
        items = line if param_name == "ud_postags" else line.split(" ")
        sentences.append([decode(item) for item in items])

    return sentences


def decompress(tokens=None, ud_lemmas=None, ud_features=None, ud_postags=None):
    """Rebuild per-word annotation records from compressed, sentence-per-line layers.

    Every argument is the compressed text of one layer, in the format accepted
    by ``unpack_values``. Layers left as ``None`` are omitted from the result.

    Args:
        tokens: Compressed surface forms (stripped of surrounding whitespace
            before unpacking).
        ud_lemmas: Compressed lemmas.
        ud_features: Compressed morphological features.
        ud_postags: Compressed POS tags.

    Returns:
        A list of sentences. Each sentence is a list with one ``OrderedDict``
        per word, keyed by the names of the supplied layers.

    Raises:
        AssertionError: If no layer is given, if the layers disagree on the
            number of sentences, or if they disagree on the number of words
            within a sentence.
    """
    # Explicit mapping instead of ``locals()``: its order defines the key order
    # of every word record and it cannot pick up unrelated local names.
    params = {
        "tokens": tokens,
        "ud_lemmas": ud_lemmas,
        "ud_features": ud_features,
        "ud_postags": ud_postags,
    }

    assert any(
        value is not None for value in params.values()
    ), "at least one param should be not None"

    zipped = {}

    for param_name, param_value in params.items():
        if param_value is None:
            continue
        if param_name == "tokens":
            # TODO: validate if this workaround can be properly fixed
            param_value = param_value.strip()
        zipped[param_name] = unpack_values(param_name, param_value)

    sentences_length = set(map(len, zipped.values()))
    assert len(sentences_length) == 1, f"Text contains different number of sentences: {sentences_length}"

    res = []
    param_names = list(zipped.keys())
    param_values = list(zipped.values())

    for sent in zip(*param_values):
        word_length = set(map(len, sent))

        # Check the per-sentence word counts (not the sentence counts again):
        # without this, zip() below silently truncates to the shortest layer.
        assert len(word_length) == 1, f"Text contains different number of words in sentence: {sent}"

        res.append(
            [OrderedDict(zip(param_names, word_info)) for word_info in zip(*sent)]
        )

    return res


# Decompress every stored annotation layer of the sample document's text and
# display the result.
decompress(
    **{
        layer: test_doc["nlp"]["text"][layer]
        for layer in ("tokens", "ud_lemmas", "ud_features", "ud_postags")
    }
)

[[OrderedDict([('tokens', 'В'),
               ('ud_lemmas', 'в'),
               ('ud_features', OrderedDict([('Case', 'Loc')])),
               ('ud_postags', 'ADP')]),
  OrderedDict([('tokens', 'горах'),
               ('ud_lemmas', 'гора'),
               ('ud_features',
                OrderedDict([('Animacy', 'Inan'),
                             ('Case', 'Loc'),
                             ('Gender', 'Fem'),
                             ('Number', 'Plur')])),
               ('ud_postags', 'NOUN')]),
  OrderedDict([('tokens', 'де'),
               ('ud_lemmas', 'де'),
               ('ud_features', OrderedDict([('PronType', 'Rel')])),
               ('ud_postags', 'ADV')]),
  OrderedDict([('tokens', 'ближче'),
               ('ud_lemmas', 'ближче'),
               ('ud_features', OrderedDict([('Degree', 'Cmp')])),
               ('ud_postags', 'ADV')]),
  OrderedDict([('tokens', 'сонця'),
               ('ud_lemmas', 'сонце'),
               ('ud_features',
                Order

In [12]:
from deepdiff import DeepDiff

def compare_article(article):
    """Check that stored compressed annotations match a fresh UDPipe run.

    For the ``title`` and ``text`` entries of ``article["nlp"]`` the stored
    ``ud_lemmas`` / ``ud_features`` / ``ud_postags`` layers are decompressed
    and compared with what the notebook-level ``model`` produces when it
    re-tokenizes and re-tags the stored ``tokens`` line by line. Surface forms
    are not part of the comparison.

    Entries that are missing, or that carry no ``tokens`` layer, are logged and
    skipped. On the first mismatch both sides are written to
    ``udpipe_res.json`` and ``decompressed_result.json`` in the current working
    directory (overwriting earlier dumps).

    Args:
        article: Document dict with at least the ``_id`` and ``nlp`` keys.

    Returns:
        ``False`` as soon as one field differs, ``True`` otherwise.
    """
    # Imported locally: the dumps below need it and the notebook has no
    # top-level ``import json``.
    import json

    for field in ["title", "text"]:
        # Validate before touching the stored layers; otherwise a missing
        # field raises KeyError and these warnings are never reached.
        if field not in article["nlp"]:
            logger.warning(f"Cannot find field {field} in the document {article['_id']}")
            continue

        field_nlp = article["nlp"][field]

        if "tokens" not in field_nlp:
            logger.warning(
                f"Cannot find tokenized version of field {field} in the document {article['_id']}"
            )
            continue

        decompressed_result = decompress(
            ud_lemmas=field_nlp["ud_lemmas"],
            ud_features=field_nlp["ud_features"],
            ud_postags=field_nlp["ud_postags"],
        )

        udpipe_res = []

        for s in field_nlp["tokens"].split("\n"):
            for tok_sent in model.tokenize(s):
                model.tag(tok_sent)

                udpipe_res.append(
                    [
                        {
                            "ud_lemmas": w.lemma,
                            "ud_postags": w.upostag,
                            # "Animacy[gram]" features are left out of the
                            # comparison.
                            "ud_features": OrderedDict(
                                feat.split("=", 1)
                                for feat in w.feats.split("|")
                                if not feat.startswith("Animacy[gram]")
                            ) if w.feats else OrderedDict(),
                        }
                        # words[0] is skipped -- presumably UDPipe's artificial
                        # root node; confirm against the model wrapper.
                        for w in tok_sent.words[1:]
                    ]
                )

        # NOTE: the feature dicts are OrderedDicts on both sides, so their
        # equality is order-sensitive.
        comparison = udpipe_res == decompressed_result
        print(f"Comparing {field} for the {article['_id']}: {comparison}")

        if not comparison:
            # Explicit UTF-8: ensure_ascii=False writes non-ASCII text as is.
            with open("udpipe_res.json", "w", encoding="utf-8") as fp_out:
                json.dump(udpipe_res, fp_out, indent=4, ensure_ascii=False, sort_keys=True)
            with open("decompressed_result.json", "w", encoding="utf-8") as fp_out:
                json.dump(decompressed_result, fp_out, indent=4, ensure_ascii=False, sort_keys=True)

            return False

    return True

# Run the comparison on the current `test_doc`; the returned boolean is the
# cell's output.
compare_article(test_doc)

Comparing title for the e0555ead86335ee058bf2eef5758839f0f124ee7: True
Comparing text for the e0555ead86335ee058bf2eef5758839f0f124ee7: False


False

In [11]:
from tqdm.notebook import tqdm
# Compare the first 200 documents of the `fiction` collection and stop at the
# first mismatch.
# NOTE: the loop variable is the notebook-level `test_doc`, so after a `break`
# it refers to the failing document and can be inspected in other cells.
for test_doc in tqdm(db.fiction.find()[:200]):
    if not compare_article(test_doc):
        print(f"Comparison failed on {test_doc['_id']}")
        break

0it [00:00, ?it/s]

Comparing title for the 08cc63c1f837a4ba4d39798980a2066144dc403a: True
Comparing text for the 08cc63c1f837a4ba4d39798980a2066144dc403a: True
Comparing title for the 78f15a9926de833faac848d2128e3ba22b2aa288: True
Comparing text for the 78f15a9926de833faac848d2128e3ba22b2aa288: True
Comparing title for the a7aaa513f8f7d43ce334e8ff71d60afc5b5fc17d: True
Comparing text for the a7aaa513f8f7d43ce334e8ff71d60afc5b5fc17d: True
Comparing title for the 448b8bf3b04d9d22fc6c3d6aef94c2eb0ee8d5ba: True
Comparing text for the 448b8bf3b04d9d22fc6c3d6aef94c2eb0ee8d5ba: True
Comparing title for the 1a93989f21e69b76ee008669afe27fff4f874411: True
Comparing text for the 1a93989f21e69b76ee008669afe27fff4f874411: True
Comparing title for the ab3e60df49aae0342236ed29c012ffd174ae90c1: True
Comparing text for the ab3e60df49aae0342236ed29c012ffd174ae90c1: True
Comparing title for the 3deca5d0bfaa75f391530c35e5fcc1d8505003e4: True
Comparing text for the 3deca5d0bfaa75f391530c35e5fcc1d8505003e4: True
Comparing tit

In [16]:
# Longer sample: three lines of text, one per line of the string.
s = """Звернімося просто до його творчої біографії яка починається ще на порозі юності збіркою 1910 року На білих островах і поки що доходить до книжок віршів виданих у 1957 1959 роках
За підрахунком одного з критиків М. Рильського у цілому це складає більше 25 збірок оригінальних поезій і понад 250 тисяч рядків поетичних перекладів а до того слід додати ще численні статті і дослідження з історії літератури народної творчості театру багато публіцистичних виступів
Початкове формування таланту М. Рильського припадає на роки 1907 1917"""

# NOTE: this reassignment replaces the longer sample above; when the cell is
# run as a whole only this short string is left in `s`.
s = "так-сяк понад 250 тисяч"

In [18]:
# Tokenize and tag every line of `s`, printing form, lemma and UPOS per word.
for ss in s.split("\n"):
    tokenized = model.tokenize(ss)
    for tok_sent in tokenized:
        model.tag(tok_sent)

        # words[0] is skipped -- presumably UDPipe's artificial root node;
        # confirm against the model wrapper.
        for w in tok_sent.words[1:]:
            print(w.form, w.lemma, w.upostag)

так так ADV
- - PUNCT
сяк сяк ADV
понад 250 понад 250 NUM
тисяч тисяча NOUN
