In [1]:
import bz2
import csv
import json
import lzma
import unicodedata
from collections import Counter, defaultdict, namedtuple
from itertools import combinations
from random import choices, seed
from typing import Dict, List, Set

from tqdm.notebook import tqdm

### Trying to prepare synonym/antonym/random dataset from the ULIF datasource

https://svc2.ulif.org.ua/dictua/

In [2]:
def deaccent(text: str) -> str:
    """
    Remove accent marks from ``text`` while keeping the Ukrainian letters ї and й.

    The input is first composed (NFC) so that ї/й written as a base letter
    followed by a combining mark are recognised and kept too. Every other
    character is decomposed (NFD), its combining marks (Unicode category
    ``Mn``) are dropped, and what is left is recomposed (NFC).

    :param text: unicode string to process.
    :return: ``text`` with accent marks removed.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    'Sef chomutovskych komunistu dostal postou bily prasek'

    """

    # Letters whose diacritic is part of the letter itself, not an accent.
    protected = "їйЇЙ"
    res = []

    # Compose first: otherwise a decomposed "и" + U+0306 would be seen as two
    # characters and the breve would be stripped, turning "й" into "и".
    for c in unicodedata.normalize("NFC", text):
        if c in protected:
            res.append(c)
            continue

        decomposed = unicodedata.normalize("NFD", c)
        without_marks = "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn"
        )
        res.append(unicodedata.normalize("NFC", without_marks))

    return "".join(res)

### Relation types to export

In [3]:
# Labels for the kinds of lemma-pair relations handled in this notebook.
REL_SYNONYM: str = "synonym"
REL_ANTONYM: str = "antonym"
REL_RANDOM: str = "random"

# One labelled pair of lemmas; `rel` carries one of the REL_* labels above.
Relation = namedtuple("Relation", "lemma_left lemma_right rel")

### Collecting lemma pairs and normalizing them
 * `all_words` will be used to draw random pairs for neg sampling
 * `all_relations` will have all possible combinations of synonyms in "synsets" and antonyms
 
 Important: the two words of every pair are sorted, so a symmetric pair such as synonym1/synonym2 and synonym2/synonym1 is stored only once

In [4]:
# Accumulators filled while streaming the ULIF dump:
#  * all_words     - every normalized headword (pool for random negative pairs)
#  * words_by_pos  - the same headwords grouped by the record's base "type"
#                    (presumably the part of speech - confirm against the dump)
#  * all_relations - de-duplicated antonym / synonym pairs
all_words: Set[str] = set()
words_by_pos: Dict[str, set] = defaultdict(set)
all_relations: Set[Relation] = set()

# The encoding is given explicitly: the text-mode default is locale-dependent
# and the dump contains Cyrillic text.
with bz2.open("aux data/from_1_try5.jsonlines.bz2", "rt", encoding="utf-8") as fp:
    for line in tqdm(fp):
        data: Dict = json.loads(line)
        # Headwords are lower-cased, de-accented and stripped of surrounding
        # spaces and digits.
        base_lemma: str = deaccent(data["base"]["value"]).lower().strip(" 0123456789")

        all_words.add(base_lemma)
        words_by_pos[data["base"]["type"]].add(base_lemma)

        if "antonyms" in data:
            antonym = data["antonyms"].get("antonyms")
            lemma = data["antonyms"].get("lemmas")

            if antonym is not None and lemma is not None:
                # Sorting makes (a, b) and (b, a) collapse into one relation.
                antonyms_pair = sorted(
                    [deaccent(lemma).lower(), deaccent(antonym).lower()]
                )
                all_relations.add(
                    Relation(
                        lemma_left=antonyms_pair[0],
                        lemma_right=antonyms_pair[1],
                        rel=REL_ANTONYM,
                    )
                )

        for synonym_blocks in data.get("synonyms", []):
            for synonyms in synonym_blocks:
                # Drop missing entries up front. Bailing out of the pair loop on
                # the first None would also discard every remaining valid pair
                # of the same synset.
                present = [
                    syn for syn in synonyms.get("synonyms", []) if syn is not None
                ]

                for syn1, syn2 in combinations(present, 2):
                    synonyms_pair = sorted(
                        [deaccent(syn1).lower(), deaccent(syn2).lower()]
                    )
                    all_relations.add(
                        Relation(
                            lemma_left=synonyms_pair[0],
                            lemma_right=synonyms_pair[1],
                            rel=REL_SYNONYM,
                        )
                    )

0it [00:00, ?it/s]

### Sanity check for the number of pairs under relations and duplicates
We'll also use `word_pair_counter` later to avoid drawing known synonym/antonym pairs as negative samples

In [5]:
# Tally how often each (sorted) lemma pair and each relation label occurs.
word_pair_counter = Counter()
rel_type_counter = Counter()

for relation in tqdm(all_relations):
    pair = tuple(sorted((relation.lemma_left, relation.lemma_right)))
    word_pair_counter[pair] += 1
    rel_type_counter[relation.rel] += 1

  0%|          | 0/278008 [00:00<?, ?it/s]

In [6]:
# Ten most frequent lemma pairs. `all_relations` is a set, so a count above 1
# means the same pair is recorded under more than one relation type.
word_pair_counter.most_common(10)

[(('вилазити', 'залазити'), 2),
 (('де прийшлося', 'де припало'), 1),
 (('живоїд', 'шкуролуп'), 1),
 (('обілляти', 'полляти'), 1),
 (('бебехнути', 'хвиснути'), 1),
 (('обшарпанець', 'харпак'), 1),
 (('протнути', 'розлягтися'), 1),
 (('високомовність', 'пишномовність'), 1),
 (('відсапнутися', 'дихнути'), 1),
 (('згнітити', 'стримувати'), 1)]

### Adding lemma frequency dict to filter out infrequent pairs

In [7]:
# Corpus frequency per lemma; lemmas missing from the CSV read as 0.0.
lemma_freqs: Dict[str, float] = defaultdict(float)


def fix_nulls(fp_in):
    """
    Yield the lines of ``fp_in`` with every NUL character removed.

    Presumably needed because the csv reader can fail on lines that contain
    NUL characters - confirm against the input file.
    """
    for line in fp_in:
        yield line.replace('\0', '')


# The encoding is given explicitly: the text-mode default is locale-dependent
# and the lemmas are Cyrillic.
with lzma.open("aux data/lemma_freqs.csv.xz", "rt", encoding="utf-8") as fp_in:
    reader = csv.DictReader(fix_nulls(fp_in))

    for row in tqdm(reader):
        # `+=` sums the frequencies when a lemma appears on several rows.
        lemma_freqs[row["lemma"]] += float(row["freq_in_corpus"])

0it [00:00, ?it/s]

### Ignoring antonyms (we don't have many of them) and low-frequency lemmas

In [8]:
# Synonym relations whose two lemmas both reach the 5e-6 corpus-frequency floor.
ULIF_HF_SYNONYMS: List[Relation] = []

for rel in tqdm(all_relations):
    is_frequent_synonym = rel.rel == REL_SYNONYM and not (
        lemma_freqs[rel.lemma_left] < 5e-6 or lemma_freqs[rel.lemma_right] < 5e-6
    )

    if is_frequent_synonym:
        ULIF_HF_SYNONYMS.append(rel)

  0%|          | 0/278008 [00:00<?, ?it/s]

### Drawing NEG_RATIO times more random pairs than synonym pairs

In [9]:
# Number of negative pairs to draw per retained synonym pair.
NEG_RATIO: int = 5
# Total number of negative pairs to draw.
NEG_SAMPLES_COUNT: int = NEG_RATIO * len(ULIF_HF_SYNONYMS)
# 80% of the negatives (rounded down) are budgeted as "POS aligned";
# the rest are drawn from the whole vocabulary.
NEG_SAMPLES_POS_ALIGNED_COUNT: int = int(0.8 * NEG_SAMPLES_COUNT)
NEG_SAMPLES_RANDOM_COUNT: int = NEG_SAMPLES_COUNT - NEG_SAMPLES_POS_ALIGNED_COUNT

In [10]:
# Number of distinct headwords in each `words_by_pos` bucket; used later as
# sampling weights. `Counter` is generic in its key type only, hence `Counter[str]`.
pos_counts: Counter[str] = Counter({p: len(w) for p, w in words_by_pos.items()})

### First we draw truly random pairs from different POS combinations

In [11]:
# Fixed seed so that re-running the notebook draws the same negative pairs.
RANDOM_SEED: int = 42
seed(RANDOM_SEED)

NEG_SAMPLES: Set[Relation] = set()
# Sorted rather than list(set): set iteration order varies between interpreter
# runs, which would make the seeded draws irreproducible.
ALL_WORDS: List[str] = sorted(all_words)

# `<` instead of an `==` check after each insert: the loop also terminates
# when the target is 0 (no synonym pairs survived the frequency filter).
while len(NEG_SAMPLES) < NEG_SAMPLES_RANDOM_COUNT:
    candidates = tuple(sorted(choices(ALL_WORDS, k=2)))

    # Reject a word paired with itself.
    if candidates[0] == candidates[1]:
        continue

    # Reject multi-word expressions.
    if " " in candidates[0] or " " in candidates[1]:
        continue

    # Reject pairs already known as synonyms or antonyms.
    if candidates in word_pair_counter:
        continue

    NEG_SAMPLES.add(Relation(lemma_left=candidates[0], lemma_right=candidates[1], rel=REL_RANDOM))

In [12]:
# Second pass: negatives whose two words come from the same `words_by_pos`
# bucket. A bucket is picked with probability proportional to its size.
# The lookup tables are built once instead of on every iteration.
pos_tags: List[str] = list(pos_counts.keys())
pos_weights: List[int] = list(pos_counts.values())
# Sorted so the draws do not depend on set iteration order.
words_list_by_pos: Dict[str, List[str]] = {
    pos_tag: sorted(words) for pos_tag, words in words_by_pos.items()
}

# NEG_SAMPLES already holds the fully random negatives, so the target here is
# the overall total. Stopping at NEG_SAMPLES_POS_ALIGNED_COUNT would leave the
# set short by NEG_SAMPLES_RANDOM_COUNT pairs.
while len(NEG_SAMPLES) < NEG_SAMPLES_COUNT:
    pos = choices(pos_tags, weights=pos_weights, k=1)[0]

    candidates = tuple(sorted(choices(words_list_by_pos[pos], k=2)))

    # Reject a word paired with itself.
    if candidates[0] == candidates[1]:
        continue

    # Reject multi-word expressions.
    if " " in candidates[0] or " " in candidates[1]:
        continue

    # Reject pairs already known as synonyms or antonyms.
    if candidates in word_pair_counter:
        continue

    NEG_SAMPLES.add(Relation(lemma_left=candidates[0], lemma_right=candidates[1], rel=REL_RANDOM))

### Here we adhere to the naming convention from the original RUMEN dataset


```python
def get_names(cat):
    if cat == 0:
        return "RANDOM"
    if cat == 1:
        return "HYPER"
    if cat == 2:
        return "SYN"
```

### Exporting ulif hf synonyms

In [13]:
# newline="" is what the csv module expects of files it writes to (otherwise
# rows can end in "\r\r\n" on Windows). UTF-8 is set explicitly because the
# default encoding is locale-dependent and the lemmas are Cyrillic.
with open("aux data/ulif_hf_synonyms.csv", "w", encoding="utf-8", newline="") as fp_out:
    w = csv.DictWriter(fp_out, fieldnames=["W1", "W2", "rel"])
    w.writeheader()
    # rel=2 is the "SYN" label of the naming convention quoted above.
    w.writerows(
        {"W1": syn.lemma_left, "W2": syn.lemma_right, "rel": 2}
        for syn in ULIF_HF_SYNONYMS
    )

### Exporting random pairs

In [14]:
# newline="" is what the csv module expects of files it writes to (otherwise
# rows can end in "\r\r\n" on Windows). UTF-8 is set explicitly because the
# default encoding is locale-dependent and the lemmas are Cyrillic.
with open("aux data/ulif_random.csv", "w", encoding="utf-8", newline="") as fp_out:
    w = csv.DictWriter(fp_out, fieldnames=["W1", "W2", "rel"])
    w.writeheader()
    # rel=0 is the "RANDOM" label of the naming convention quoted above.
    w.writerows(
        {"W1": pair.lemma_left, "W2": pair.lemma_right, "rel": 0}
        for pair in NEG_SAMPLES
    )

### Now let's try another source of synonyms: https://synonimy.info

In [15]:
# UTF-8 is set explicitly: the default text encoding is locale-dependent.
# The file is consumed below as a sequence of records, each indexed by the
# keys "lemma" and "synsets", hence List[Dict] rather than Dict.
with open("aux data/synonimy_info_clean.json", "r", encoding="utf-8") as fp_in:
    synonimy_info: List[Dict] = json.load(fp_in)

In [16]:
# Every unordered pair within a record's synset becomes one synonym relation.
SYNONIMY_INFO_SYNONYMS: Set[Relation] = set()

for lemma_rec in tqdm(synonimy_info):
    # The synset is the normalized headword plus the "clean" entries of the
    # record's first synset; only index 0 of "synsets" is read.
    members = {deaccent(lemma_rec["lemma"]).lower()}
    members.update(
        deaccent(clean).lower() for clean in lemma_rec["synsets"][0]["clean"]
    )

    # Walking the members in sorted order yields each pair already ordered.
    for left, right in combinations(sorted(members), 2):
        SYNONIMY_INFO_SYNONYMS.add(
            Relation(lemma_left=left, lemma_right=right, rel=REL_SYNONYM)
        )


  0%|          | 0/15157 [00:00<?, ?it/s]

In [17]:
# synonimy.info relations whose two lemmas both reach the 5e-6 frequency floor.
SYNONIMY_INFO_HF_SYNONYMS: List[Relation] = []

for rel in tqdm(SYNONIMY_INFO_SYNONYMS):
    too_rare = (
        lemma_freqs[rel.lemma_left] < 5e-6 or lemma_freqs[rel.lemma_right] < 5e-6
    )

    if not too_rare:
        SYNONIMY_INFO_HF_SYNONYMS.append(rel)

  0%|          | 0/282411 [00:00<?, ?it/s]

In [18]:
# newline="" is what the csv module expects of files it writes to (otherwise
# rows can end in "\r\r\n" on Windows). UTF-8 is set explicitly because the
# default encoding is locale-dependent and the lemmas are Cyrillic.
with open("aux data/synonimy_info_hf_synonyms.csv", "w", encoding="utf-8", newline="") as fp_out:
    w = csv.DictWriter(fp_out, fieldnames=["W1", "W2", "rel"])
    w.writeheader()
    # rel=2 is the "SYN" label of the naming convention quoted above.
    w.writerows(
        {"W1": syn.lemma_left, "W2": syn.lemma_right, "rel": 2}
        for syn in SYNONIMY_INFO_HF_SYNONYMS
    )

### Now let's combine ulif synonyms/random and synonimy_info/random datasets for the training

In [19]:
def _export_pairs(path: str, positives, negatives) -> None:
    """
    Write a training CSV with columns W1, W2, rel to ``path``.

    ``positives`` are written first with rel=2 (synonym), followed by
    ``negatives`` with rel=0 (random).

    :param path: destination file; overwritten if it already exists.
    :param positives: iterable of Relation tuples labelled as synonyms.
    :param negatives: iterable of Relation tuples labelled as random pairs.
    """
    # newline="" is what the csv module expects of files it writes to; UTF-8 is
    # explicit because the default encoding is locale-dependent.
    with open(path, "w", encoding="utf-8", newline="") as fp_out:
        w = csv.DictWriter(fp_out, fieldnames=["W1", "W2", "rel"])
        w.writeheader()

        for label, pairs in ((2, positives), (0, negatives)):
            w.writerows(
                {"W1": pair.lemma_left, "W2": pair.lemma_right, "rel": label}
                for pair in pairs
            )


# Both training sets share the same negative samples.
_export_pairs(
    "aux data/ulif_hf_synonyms_and_random.csv", ULIF_HF_SYNONYMS, NEG_SAMPLES
)
_export_pairs(
    "aux data/synonimy_info_hf_synonyms_and_random.csv",
    SYNONIMY_INFO_HF_SYNONYMS,
    NEG_SAMPLES,
)