In [None]:
import pyoxigraph as ox
import pandas as pd
import json

In [None]:
# Open the Oxigraph store at data/wety.db; the query helpers defined in this
# notebook all read from this module-level `store`.
store = ox.Store("data/wety.db")
# Alternative store location under data/test_output; swap it with the line
# above to query that database instead.
# store = ox.Store("data/test_output/wety.db")

In [None]:
def get_item_id(term, lang, n=0):
    """Return the id of the n-th item whose <p:term> and <p:lang> match exactly.

    Args:
        term: Term text to match against the item's <p:term> literal.
        lang: Language name to match against the item's <p:lang> literal.
        n: Index into the list of matching items (default 0). The query has
            no ORDER BY, so the order of matches is whatever the store returns.

    Returns:
        The `.value` of the matching item node.

    Raises:
        IndexError: If fewer than n + 1 items match.
    """
    def _literal(text):
        # Escape the characters that would otherwise terminate or corrupt a
        # double-quoted SPARQL string literal (terms may contain quotes).
        escaped = (
            str(text)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )
        return f'"{escaped}"'

    query = f'SELECT ?w WHERE {{ ?w <p:term> {_literal(term)} . ?w <p:lang> {_literal(lang)} .}}'
    matches = [solution['w'].value for solution in store.query(query)]
    return matches[n]

def print_item_info(id):
    """Print an item's term, language and (when present) gloss, then a blank line.

    The query is limited to one row, so at most one term/lang/gloss group is
    printed. The trailing blank line is printed even when nothing matched.
    """
    query = f"""SELECT ?term ?lang ?gloss WHERE {{ 
            <{id}> <p:term> ?term .
            <{id}> <p:lang> ?lang .
            OPTIONAL {{ <{id}> <p:gloss> ?gloss }}
        }} LIMIT 1"""
    for row in store.query(query):
        lines = [row['term'].value, row['lang'].value]
        gloss = row['gloss']
        # The gloss is OPTIONAL in the query, so only print it when it is set.
        if gloss:
            lines.append(gloss.value)
        print(*lines, sep="\n")
    print()

def get_item(id):
    """Fetch the stored properties of one item as a plain dict.

    Args:
        id: Identifier of the item; it is interpolated as an IRI (`<id>`) into
            the query.

    Returns:
        A dict with the keys "id", "term", "lang", "url", "pos", "gloss",
        "isImputed", "mode", "isReconstructed", "headProgenitor" and
        "progenitor". "id" echoes the argument. "progenitor" is a list holding
        the progenitor value of every result row that has one. Every other key
        holds the first value seen across the result rows, or None when no row
        binds it.
    """
    scalar_preds = ["term", "lang", "url", "pos", "gloss", "isImputed", "mode", "isReconstructed", "headProgenitor"]
    results = store.query(
        f"""SELECT ?term ?lang ?url ?pos ?gloss ?isImputed ?mode ?isReconstructed ?headProgenitor ?progenitor WHERE {{ 
            <{id}> <p:term> ?term .
            <{id}> <p:lang> ?lang .
            OPTIONAL {{ <{id}> <p:url> ?url }} .
            OPTIONAL {{ <{id}> <p:pos> ?pos }} .
            OPTIONAL {{ <{id}> <p:gloss> ?gloss }} .
            OPTIONAL {{ <{id}> <p:isImputed> ?isImputed }} .
            OPTIONAL {{ <{id}> <p:mode> ?mode }} .
            OPTIONAL {{ <{id}> <p:isReconstructed> ?isReconstructed }} .
            OPTIONAL {{ <{id}> <p:headProgenitor> ?headProgenitor }} .
            OPTIONAL {{ <{id}> <p:progenitor> ?progenitor }} .
        }}"""
    )
    # Keep the id in the result: it used to be set and then overwritten when
    # the per-predicate defaults were built, so it never reached the caller.
    item = {"id": id}
    item.update({p: None for p in scalar_preds})
    item["progenitor"] = []
    for result in results:
        for p in scalar_preds:
            # First bound value wins; later rows never overwrite it.
            if item[p] is None and result[p] is not None:
                item[p] = result[p].value
        progenitor = result["progenitor"]
        if progenitor is not None:
            item["progenitor"].append(progenitor.value)
    return item

# Children are expanded only when their id is in only_expand_these. Passing
# None (the default) expands every child; an empty collection expands none.
def get_item_expansion(id, only_expand_these=None, terminal_lang=None):
    """Recursively build the nested dict for an item and its children.

    Args:
        id: Id of the item at the root of the expansion.
        only_expand_these: Optional collection of item ids. When not None,
            children whose id is not in it are skipped.
        terminal_lang: Language at which expansion stops. An item whose
            "lang" equals this value is returned without a "children" key.

    Returns:
        The dict from get_item(id). Unless the item is terminal, it also has
        a "children" list holding the expansion of each selected child.
    """
    item = get_item(id)
    if item['lang'] == terminal_lang:
        # Terminal item: deliberately no "children" key.
        return item
    children = get_children(id)
    # Compare against None explicitly: an empty set means "expand nothing",
    # whereas a truthiness test would treat it like None and expand everything.
    if only_expand_these is not None:
        children = [child for child in children if child in only_expand_these]
    item['children'] = [
        get_item_expansion(child, only_expand_these, terminal_lang)
        for child in children
    ]
    return item

# When lang is given, only children that appear in the set returned by
# get_all_lang_descendant_ancestors are expanded, and expansion stops at the
# first item whose language is lang.
def get_item_json(id, lang=None):
    """Return the expansion tree rooted at id as a JSON string.

    Non-ASCII characters are written verbatim (ensure_ascii=False).
    """
    if lang:
        allowed_ids = get_all_lang_descendant_ancestors(id, lang)
    else:
        allowed_ids = None
    tree = get_item_expansion(id, allowed_ids, lang)
    return json.dumps(tree, indent=True, ensure_ascii=False)

def get_head_source(id):
    """Return the source item whose <p:order> equals the item's <p:head>.

    Returns None when the item has no source node with a matching order.
    """
    rows = store.query(
        f"""SELECT ?source ?order ?head WHERE {{ 
            <{id}> <p:head> ?head .
            <{id}> <p:source> ?sourceNode .
            ?sourceNode <p:item> ?source .
            ?sourceNode <p:order> ?order .
        }}"""
    )
    # The order == head comparison is done here in Python rather than with a
    # SPARQL FILTER because, per the original author, this is much faster.
    head_sources = (
        row['source'].value
        for row in rows
        if row['order'].value == row['head'].value
    )
    return next(head_sources, None)

def get_head_progenitor(id):
    """Return the item's <p:headProgenitor> value, or None if it has none.

    Only the first result row is used.
    """
    query = f"""SELECT ?headProgenitor WHERE {{ 
            <{id}> <p:headProgenitor> ?headProgenitor .
        }}"""
    first_row = next(iter(store.query(query)), None)
    if first_row is None:
        return None
    return first_row['headProgenitor'].value

def get_ancestors(id):
    """Return the set of items reachable from id via one or more source links.

    A source link is the path <p:source>/<p:item>. Returns None, not an empty
    set, when nothing is reachable.
    """
    query = f"""SELECT ?item WHERE {{ 
            <{id}> (<p:source>/<p:item>)+ ?item .
        }}"""
    ancestors = {row['item'].value for row in store.query(query)}
    return ancestors or None

# Collect the ancestors of every item that has the given headProgenitor and
# whose language is lang.
def get_all_lang_descendant_ancestors(headProgenitor, lang):
    """Return the set of ancestors of all lang items under headProgenitor.

    The path uses `*` (zero or more source links), so each matching item is
    itself part of the returned set. The set is empty when no item matches.
    """
    query = f"""SELECT ?ancestor WHERE {{ 
            ?item <p:headProgenitor> <{headProgenitor}> .
            ?item (<p:source>/<p:item>)* ?ancestor .
            ?item <p:lang> ?lang .
            FILTER ( ?lang = "{lang}" ) .
        }}"""
    ancestors = set()
    for row in store.query(query):
        ancestors.add(row['ancestor'].value)
    return ancestors

def get_children(id):
    """Return the children of an item as a list of ids.

    A child is an item with a source node pointing at `id` whose <p:order>
    equals the child's own <p:head>. Rows are returned in store order and are
    not de-duplicated.
    """
    query = f"""SELECT ?child WHERE {{ 
            ?sourceNode <p:item> <{id}> .
            ?sourceNode <p:order> ?order .
            ?child <p:source> ?sourceNode .
            ?child <p:head> ?order . 
        }}"""
    children = []
    for row in store.query(query):
        children.append(row['child'].value)
    return children

In [None]:
# Look up the first returned Proto-Germanic item with the term "mainą" and
# print its term/lang/gloss.
item = get_item_id("mainą", "Proto-Germanic", 0)
print_item_info(item)
# Print the item stored as its head progenitor.
head_progenitor = get_head_progenitor(item)
print_item_info(head_progenitor)
# Print the source item whose order matches this item's head.
print_item_info(get_head_source(item))
# Print every child that get_children reports for this item.
for child in get_children(item):
    print_item_info(child)

In [None]:
# Export the tree under the head progenitor of `term`, restricted to `lang`,
# as JSON in data/test_output.
term = "mind"
lang = "English"
# Use term/lang for the lookup and the restriction as well as the filename,
# so changing them above cannot leave the three out of sync.
item = get_item_id(term, lang, 0)
print_item_info(item)
head_progenitor = get_head_progenitor(item)
print_item_info(head_progenitor)
d = get_item_json(head_progenitor, lang)
# get_item_json writes non-ASCII characters verbatim (ensure_ascii=False), so
# encode as UTF-8 explicitly instead of relying on the platform default.
with open(f"data/test_output/{term}_{lang}.json", 'w', encoding="utf-8") as f:
    f.write(d)

In [None]:
import re

# Captures the text between "wiki/" and a "%23" (URL-encoded "#") that is
# followed by at least one more character. `.+` is greedy, so with several
# "%23" occurrences the capture runs up to the last one that qualifies.
term_re = re.compile(r'wiki/(.+)%23.+')
# Captures the text between "wiki/Reconstruction:" and a "%2F" (URL-encoded
# "/") that is followed by at least one more character.
# NOTE(review): group 1 is the part *before* "%2F"; if these URLs look like
# "Reconstruction:<language>%2F<term>" this captures the language portion
# rather than the term — confirm against real URLs.
recon_term_re = re.compile(r'wiki/Reconstruction:(.+)%2F.+')

def json_lang_terms(item_json):
    """Collect the (lang, term) pairs of every item in an expansion tree.

    Args:
        item_json: JSON string of a nested item dict, where each item may have
            "url", "lang" and "children" keys (as produced by get_item_json).

    Returns:
        A set of (lang, term) tuples. The term is group 1 of whichever of
        term_re / recon_term_re matches the item's url. Items with no url, or
        a url that neither pattern matches, contribute no pair, but their
        children are still visited.
    """
    langterms = set()
    # Explicit stack instead of recursion; the result is a set, so visiting
    # order does not matter.
    pending = [json.loads(item_json)]
    while pending:
        item = pending.pop()
        url = item.get("url")
        if url:
            m = term_re.search(url) or recon_term_re.search(url)
            # Skip urls that match neither pattern instead of failing on
            # m.group(1) with m being None.
            if m:
                langterms.add((item["lang"], m.group(1)))
        # Descend whether or not this item has a url, so the subtree under a
        # url-less item is not silently dropped.
        pending.extend(item.get("children") or ())
    return langterms


def write_wiktextract_items(item_json):
    """Copy the data/data.json lines whose (lang, word) occurs in the item tree.

    Reads data/data.json as JSON Lines and writes every line whose "lang" and
    "word" form a pair returned by json_lang_terms(item_json) to
    data/test/descendants.jsonl, unchanged.

    Args:
        item_json: JSON string of an expansion tree, passed to json_lang_terms.
    """
    lang_terms = json_lang_terms(item_json)
    # Write with the same explicit UTF-8 encoding the input is read with;
    # otherwise the platform default applies and non-ASCII words can fail.
    with open("data/data.json", "r", encoding="utf-8") as fin, \
            open("data/test/descendants.jsonl", "w", encoding="utf-8") as fout:
        for line in fin:
            # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
            if not line.strip():
                continue
            wikt = json.loads(line)
            if "lang" in wikt and "word" in wikt and (wikt["lang"], wikt["word"]) in lang_terms:
                fout.write(line)


In [None]:
# Requires `d` to be the JSON string produced by get_item_json in an earlier
# cell. NOTE: a later cell rebinds `d` to a DataFrame, so run this before it.
write_wiktextract_items(d)

In [None]:
# Print the full (unrestricted) expansion tree of the first returned
# Proto-Germanic item with the term "munaną". This rebinds the global `item`.
item = get_item_id("munaną", "Proto-Germanic", 0)
print(get_item_json(item))

In [None]:
# Export every distinct (lang, pos, term) of items that have a <p:glossNum>
# value, sorted, to data/nonzero_glossNum.csv.
# NOTE(review): the filename says "nonzero" but the query only requires that
# glossNum exists — presumably it is stored only when nonzero; confirm.
data = pd.DataFrame(
    [[w['lang'].value, w['pos'].value, w['term'].value] for w in store.query(
        # The <p:glossNum> pattern used to appear twice; once is equivalent.
        """SELECT DISTINCT ?lang ?pos ?term WHERE {
            ?item <p:glossNum> ?glossNum .
            ?item <p:lang> ?lang .
            ?item <p:pos> ?pos .
            ?item <p:term> ?term .
        }"""
    )],
    columns=["lang", "pos", "term"]
).sort_values(["lang", "pos", "term"])
data.to_csv("data/nonzero_glossNum.csv", index=False)
print(len(data))

In [None]:
# Export every distinct (lang, pos, term) of items that have both a
# <p:glossNum> and a <p:etyNum> value, sorted, to
# data/nonzero_glossNum_and_etyNum.csv, then print the row count.
data = pd.DataFrame(
    [
        [solution['lang'].value, solution['pos'].value, solution['term'].value]
        for solution in store.query(
        f"""SELECT DISTINCT ?lang ?pos ?term WHERE {{ 
            ?item <p:glossNum> ?glossNum .
            ?item <p:etyNum> ?etyNum .
            ?item <p:lang> ?lang .
            ?item <p:pos> ?pos .
            ?item <p:term> ?term .
        }}"""
        )
    ],
    columns=["lang", "pos", "term"],
)
data = data.sort_values(["lang", "pos", "term"])
data.to_csv("data/nonzero_glossNum_and_etyNum.csv", index=False)
print(len(data))

In [None]:
# Read data/descendants_langs.csv with the column names supplied here (the
# file's first row is therefore read as data, not as a header).
# NOTE: this rebinds `d`, which an earlier cell used for a JSON string.
d = pd.read_csv("data/descendants_langs.csv", names=["lang","n"])
# Sum of the "n" column, shown as the cell's output.
d["n"].sum()

In [None]:
# Load the tab-separated file data/feedback_arc_set_pass_1.tsv into `fas`.
fas = pd.read_csv("data/feedback_arc_set_pass_1.tsv", sep="\t")