In [1]:
import xml.etree.ElementTree as ET
from typing import Optional, List, Literal, Dict
from typing import NamedTuple
from sys import stderr
from difflib import SequenceMatcher
import csv

In [2]:
def get_required_edits(a, b):
    for operation, a_start, a_end, b_start, b_end in SequenceMatcher(
        a=a, b=b, autojunk=False
    ).get_opcodes():
        if operation == "replace":
            yield (operation, a[a_start:a_end], b[b_start:b_end])
        elif operation == "insert":
            yield (operation, a[a_start:a_end], b[b_start:b_end])
        else:
            continue

In [3]:
namespaces = {
    "": "http://www.tei-c.org/ns/1.0",
    "custom": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
}

In [4]:
input_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
tree = ET.parse(input_path)
root = tree.getroot()

In [5]:
entries_with_variants = []
for e in root.iterfind(".//entry", namespaces):
    entry_id = e.attrib["{" + namespaces["xml"] + "}" + "id"]
    lemma_node = e.find('.//form[@type="lemma"]/orth', namespaces)
    try:
        lemma = lemma_node.text
        orthographic_variants = set(
            node.text
            for node in e.findall(
                ".//form/orth",
                namespaces,
            )
            if node.text != lemma
        )
        if orthographic_variants:
            for variant in orthographic_variants:
                for operation, edit_in, edit_out in get_required_edits(
                    lemma, variant
                ):
                    entries_with_variants.append({
                        "id": entry_id,
                        "lemma": lemma,
                        "variant": variant,
                        "operation": operation,
                        "norm": edit_in,
                        "var": edit_out
                    })
    except AttributeError:  # if a node has no text
        continue


In [6]:
import pandas as pd
import numpy as np
import seaborn as sns

In [7]:
df = pd.DataFrame(entries_with_variants)

In [8]:
df

Unnamed: 0,id,lemma,variant,operation,norm,var
0,C2,ⲁ-,ⲁ⸗,replace,-,⸗
1,C3,ⲁ-,ⲉ-,replace,ⲁ,ⲉ
2,C4,ⲁ-,ⲁⲩ-,insert,,ⲩ
3,C7,ⲁⲃⲱ,ⲁⲃⲟⲟⲩⲉ,replace,ⲱ,ⲟⲟⲩⲉ
4,C7,ⲁⲃⲱ,ⲁⲃⲟⲩ,replace,ⲱ,ⲟⲩ
...,...,...,...,...,...,...
6534,C11263,ϩⲱⲥ,ϩⲉⲱⲥ,insert,,ⲉ
6535,C11263,ϩⲱⲥ,ϩⲟⲥ,replace,ⲱ,ⲟ
6536,C11266,ϭⲓⲛⲥ⳨ⲟ︦ⲩ︦,ⲥⲧⲣⲟ︦ⲩ︦,replace,⳨,ⲧⲣ
6537,C11268,ϯⲁⲑⲩⲥⲓⲥ,ϯⲁⲑⲏⲥⲓⲥ,replace,ⲩ,ⲏ


In [9]:
df = df[df.id.str[1:].apply(lambda i: int(i) >= 8043)]

In [10]:
vocalic_variants = df[(df["norm"] + df["var"]).str.match(r"^[ⲁⲉⲓⲟⲩⲏⲱ]{,3}$").fillna(False)]

In [11]:
vocalic_variants

Unnamed: 0,id,lemma,variant,operation,norm,var
245,C11276,ⲁⲡⲉ,ⲁⲡⲏⲩⲉ,insert,,ⲏⲩ
246,C11276,ⲁⲡⲉ,ⲁⲡⲏⲟⲩⲉ,insert,,ⲏⲟⲩ
1146,C11284,ⲗⲟⲓϭⲉ,ⲗⲟⲓⲉϭⲉ,insert,,ⲉ
2108,C8047,ⲁⲅⲁⲑⲟⲛ,ⲁⲅⲁⲑⲱⲛ,replace,ⲟ,ⲱ
2111,C8047,ⲁⲅⲁⲑⲟⲛ,ⲁⲕⲁⲑⲱⲛ,replace,ⲟ,ⲱ
...,...,...,...,...,...,...
6530,C11262,ϩⲱⲥⲧⲉ,ϩⲱⲥⲧⲏ,replace,ⲉ,ⲏ
6531,C11262,ϩⲱⲥⲧⲉ,ϩⲟⲥⲇⲉ,replace,ⲱ,ⲟ
6534,C11263,ϩⲱⲥ,ϩⲉⲱⲥ,insert,,ⲉ
6535,C11263,ϩⲱⲥ,ϩⲟⲥ,replace,ⲱ,ⲟ


In [12]:
vocalic_variants[["norm", "var"]].value_counts().head(30)

norm  var
ⲟ     ⲱ      374
      ⲉ      198
ⲱ     ⲟ      146
ⲏ     ⲉ      141
ⲉ     ⲏ      121
ⲩ     ⲏ       98
      ⲓ       90
ⲉ     ⲓ       83
ⲁⲓ    ⲉ       80
ⲏ     ⲩ       79
ⲓ     ⲉ       74
ⲏ     ⲓ       60
      ⲩ       60
ⲁ     ⲉ       51
ⲩ     ⲉ       48
ⲓ     ⲏ       43
ⲉ     ⲩ       38
      ⲁ       37
ⲟ     ⲁ       36
ⲩ     ⲓ       32
      ⲟ       31
ⲉ     ⲁ       28
ⲉⲓ    ⲏ       28
ⲓ     ⲩ       28
ⲁ     ⲟ       25
      ⲏ       22
ⲉ     ⲁⲓ      22
ⲟⲓ    ⲉ       17
ⲟ     ⲉ       16
ⲟⲓ    ⲏ       15
dtype: int64

In [13]:
vocalic_crosstab = pd.crosstab(vocalic_variants.norm, vocalic_variants.var)

In [14]:
df[((df["norm"] == "ⲩ") & (df["var"] == "ⲉ")) | ((df["norm"] == "ⲉ") & (df["var"] == "ⲩ"))]

Unnamed: 0,id,lemma,variant,operation,norm,var
2852,C8710,ⲇⲉⲕⲁⲛⲟⲥ,ⲇⲩⲕⲁⲛⲟⲥ,replace,ⲉ,ⲩ
2964,C8749,ⲇⲓⲁⲗⲩⲥⲓⲥ,ⲇⲉⲁⲗⲉⲥⲓⲥ,replace,ⲩ,ⲉ
2966,C8749,ⲇⲓⲁⲗⲩⲥⲓⲥ,ⲧⲓⲁⲗⲉⲥⲓⲥ,replace,ⲩ,ⲉ
3097,C8838,ⲇⲩⲕⲁⲛⲟⲥ,ⲇⲉⲕⲁⲛⲟⲥ,replace,ⲩ,ⲉ
3105,C8841,ⲇⲩⲛⲁⲙⲓⲥ,ⲇⲉⲛⲁⲙⲓⲥ,replace,ⲩ,ⲉ
...,...,...,...,...,...,...
6425,C11210,ϩⲩⲙⲛⲓ,ϩⲉⲙⲛⲉⲥⲓ,replace,ⲩ,ⲉ
6467,C11226,ϩⲩⲡⲟⲅⲣⲁⲫⲉ,ϩⲉⲡⲟⲅⲣⲁⲫⲏ,replace,ⲩ,ⲉ
6472,C11227,ϩⲩⲡⲟⲅⲣⲁⲫⲏ,ϩⲉⲡⲟⲅⲣⲁⲫⲏ,replace,ⲩ,ⲉ
6511,C11250,ϩⲩⲡⲟⲧⲁⲥⲥⲉ,ϩⲉⲡⲱⲧⲁϩⲥⲓ,replace,ⲩ,ⲉ
