In [1]:
import csv
import itertools

import altair as alt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from utils import get_hyponyms, get_instance_hyponyms
from tqdm.notebook import tqdm
from utils import merge_csv, title_by_url

# Turn off Altair's row-count limit so charts can be built from larger DataFrames.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [14]:
!mkdir pairs
!mkdir data

In [3]:
# Path of the merged pairs dataset: handed to merge_csv, then read back and
# overwritten in place by the final clean-up cell.
OUTPUT_FN = "./data/merged_pairs.csv"

## Load and prepare data

In [4]:
# Read the full relation table from disk.
data = pd.read_csv("./data/pwn_friends.csv")

# Keep only the rows whose `rel` is "pwn31_to_uk_wiki" and renumber them from 0.
filtered_uk = data[data["rel"].eq("pwn31_to_uk_wiki")].reset_index(drop=True)
filtered_uk.head()

Unnamed: 0,id_from,id_to,rel
0,omw-en31-00001740-n,https://uk.wikipedia.org/wiki/%D0%A1%D1%83%D1%...,pwn31_to_uk_wiki
1,omw-en31-00021914-n,https://uk.wikipedia.org/wiki/%D0%9F%D0%BE%D0%...,pwn31_to_uk_wiki
2,omw-en31-00021445-n,https://uk.wikipedia.org/wiki/%D0%9B%D1%83%D0%...,pwn31_to_uk_wiki
3,omw-en31-00021445-n,https://uk.wikipedia.org/wiki/%D0%87%D0%B6%D0%B0,pwn31_to_uk_wiki
4,omw-en31-00021007-n,https://uk.wikipedia.org/wiki/%D0%9C%D0%B0%D1%...,pwn31_to_uk_wiki


## Helper functions

In [5]:
def create_write_pairs(rel_type_list, rel_type, df, hypernym_title, csv_writer):
    """
    Write hypernym and co-hyponym title pairs for one relation type.

    Every id in ``rel_type_list`` that occurs in ``df["id_from"]`` is resolved
    to a title by passing the ``id_to`` value of its first matching row to
    ``title_by_url``. For the resolved titles two groups of rows are written:

    * ``[hypernym_title, title, "hypernym-<rel_type>"]`` for every title;
    * ``[title_a, title_b, "co-<rel_type>s"]`` for every 2-combination of the
      titles, in list order.

    Ids without a row in ``df`` and ids whose title comes back empty are
    skipped. Nothing is written when no title can be resolved.

    :param rel_type_list: list of ids to look up in ``df["id_from"]``
    :param rel_type: string, relation name used to build the row labels
    :param df: DataFrame with ``id_from`` and ``id_to`` columns
    :param hypernym_title: string, left-hand word of the hypernym rows
    :param csv_writer: writer object exposing ``writerow``
    :return: None
    """
    if not rel_type_list:
        return

    # One vectorised pass over the frame instead of recomputing
    # df["id_from"].unique() and re-filtering the whole frame for every id.
    matched = df.loc[df["id_from"].isin(rel_type_list), ["id_from", "id_to"]]
    # drop_duplicates keeps the first row per id, mirroring the former .iloc[0].
    first_url_by_id = dict(
        matched.drop_duplicates("id_from").itertuples(index=False, name=None)
    )

    titles = []
    for hyp_id in rel_type_list:
        if hyp_id not in first_url_by_id:
            continue
        title = title_by_url(first_url_by_id[hyp_id])
        # title_by_url can come back empty (run_step guards against the same
        # case); skip such entries rather than writing rows with a blank word.
        if title:
            titles.append(title)
    if not titles:
        return

    hypernym_label = f"hypernym-{rel_type}"
    for title in titles:
        csv_writer.writerow([hypernym_title, title, hypernym_label])

    # combinations() yields nothing for a single title, so no length check is needed.
    co_label = f"co-{rel_type}s"
    for left, right in itertools.combinations(titles, 2):
        csv_writer.writerow([left, right, co_label])

## Create hyponym-hypernym pairs for a single Ukrainian Wikipedia page

In [6]:
def run_step(index, row, file_name):
    """
    Create a csv file with pairs of relation types for 1 Wikipedia page.

    The file is opened for writing first, so it exists (possibly empty) even
    when no title can be derived from the row's URL or no pairs are found.
    Rows are ";"-delimited and produced by ``create_write_pairs``, once for
    the synset's hyponyms and once for its instance hyponyms, looked up in
    the notebook-level ``filtered_uk`` frame.

    :param index: int, row position; not used inside the function
    :param row: pd.Series with "id_to" (page URL) and "id_from" (synset id)
    :param file_name: string, path of the csv file to write
    :return: None
    """
    # newline="" is what the csv module asks for: the writer emits its own
    # "\r\n" terminator, and default newline translation would turn that into
    # "\r\r\n" on platforms that rewrite "\n".
    with open(file_name, "w", encoding="UTF8", newline="") as f:
        csv_writer = csv.writer(f, delimiter=";")
        url, synset_id = row["id_to"], row["id_from"]
        urk_title = title_by_url(url)
        if not urk_title:
            # No usable title for this page: leave the file empty.
            return
        hyponyms = get_hyponyms(synset_id)
        instances = get_instance_hyponyms(synset_id)
        create_write_pairs(hyponyms, "hyponym", filtered_uk, urk_title, csv_writer)
        create_write_pairs(instances, "instance", filtered_uk, urk_title, csv_writer)

## Parallel run for all Wiki pages

In [7]:
# Fan run_step out over 10 joblib workers, one task (and one ./pairs/page_<n>.csv
# file) per row of filtered_uk; tqdm tracks how many rows have been handed out.
run = Parallel(n_jobs=10)(
    delayed(run_step)(page_index, page_row, f"./pairs/page_{page_index}.csv")
    for page_index, page_row in tqdm(
        filtered_uk.iterrows(), total=filtered_uk.shape[0]
    )
)

  0%|          | 0/21354 [00:00<?, ?it/s]

## Merge separate files into one dataset

In [8]:
# Column names for the merged dataset; they line up with the three fields
# written per row by create_write_pairs (left word, right word, relation label).
header = ["word_left", "word_right", "relation_type"]
# merge_csv is a project helper from utils; presumably it concatenates the
# ";"-delimited files under "pairs" into OUTPUT_FN with this header — confirm in utils.
merge_csv("pairs", OUTPUT_FN, header, ";")

  0%|          | 0/21354 [00:00<?, ?it/s]

In [None]:
# Delete the directory of per-page CSV files once they have been passed to merge_csv.
!rm -rf pairs

## Final clean-up

In [4]:
# Reload the merged pairs and drop exact duplicate rows (index renumbered from 0).
new_df = pd.read_csv(OUTPUT_FN, delimiter=";").drop_duplicates(ignore_index=True)
# Remove self-pairs with a boolean mask. The previous np.where(...) + drop()
# passed positions where drop() expects index labels, which was only correct
# because the index had just been renumbered.
new_df = new_df.loc[new_df["word_left"] != new_df["word_right"]]
# Overwrite the merged file with the cleaned rows (no index column).
new_df.to_csv(OUTPUT_FN, sep=";", encoding="utf-8", index=False)
new_df.head()

Unnamed: 0,word_left,word_right,relation_type
0,Організм,Бенталь,hypernym-hyponym
1,Організм,Гетеротрофи,hypernym-hyponym
2,Організм,Тварини,hypernym-hyponym
3,Організм,Рослини,hypernym-hyponym
4,Організм,Мікроорганізм,hypernym-hyponym


## Dataset statistics

In [23]:
# Bar chart of pair counts: one bar per relation type on the y axis, bar length
# (x axis) is the number of rows with that relation type, longest bar first.
bar = (
    alt.Chart(new_df.groupby("relation_type", as_index=False).size())
    .mark_bar(opacity=0.7)
    .encode(
        y=alt.Y("relation_type:N", title="relation type", sort="-x"),
        x=alt.X("size:Q", title="num. of pairs"),
        color=alt.ColorValue("darkgreen"),
        # Hovering a bar shows the same two fields that are plotted.
        tooltip=[
            alt.Tooltip("relation_type:N", title="relation type"),
            alt.Tooltip("size:Q", title="num. of pairs"),
        ],
    )
)
bar.properties(height=300, width=600).configure_view(strokeWidth=0)