# Imports

In [1]:
import os

from typing import List, Dict
from pandas import read_csv
import lxml.etree as ET

# General setup (constants et al.)

## Steps to run

In [2]:
# Switches selecting which steps of this notebook are executed.

# Generate the WEMBS training files
WRITE = True
# Train the Gensim models
GENSIM = True
# Train the attentional WEMBS
AEMB = False

## Retrieve the dating (datation) information

In [3]:
# Read the dating table, replace missing cells with empty strings and keep
# only the rows whose "Ignore" column is empty.
DATATION = (
    read_csv("../../data/raw/datation.tsv", dialect="excel-tab")
    .fillna("")
    .loc[lambda frame: frame["Ignore"] == ""]
)

# Map every retained URN to the value of its "Citation level" column.
TEXT_SPLITS = {
    record["URN"]: record["Citation level"]
    for _index, record in DATATION.iterrows()
}

## Prepare the parsing of the files

In [4]:
def expand_msd(attrib):
    """Expand a ``|``-separated ``key=value`` string into a fixed set of features.

    Pieces without ``=`` are ignored. Every expected morphological key is
    present in the result; keys absent from ``attrib`` get the value ``"_"``.

    :param attrib: String such as ``"Case=Nom|Numb=Sing"``
    :return: Dictionary with the keys Case, Numb, Gend, Mood, Tense, Voice,
        Person and Deg
    :raises ValueError: If a piece contains more than one ``=``
    """
    pairs = (
        tuple(piece.split("="))
        for piece in attrib.split("|")
        if "=" in piece
    )
    features = dict(pairs)

    expanded = {}
    for morph_key in ("Case", "Numb", "Gend", "Mood", "Tense", "Voice", "Person", "Deg"):
        expanded[morph_key] = features.get(morph_key, "_")
    return expanded

def get_chunks(
    urn: str,
    template="/home/thibault/dev/latin-lemmatized-texts/lemmatized/xml/{file}.xml") -> List[List[Dict[str, str]]]:
    """Read the lemmatized TEI file of ``urn`` and split its words into chunks.

    A new chunk starts whenever the first ``TEXT_SPLITS[urn]`` dot-separated
    components of a word's ``n`` attribute change, and after each of the
    tokens ``.``, ``?``, ``!`` and ``...``. Words without text, or whose text
    starts with ``{``, are skipped.

    :param urn: Identifier of the text; fills the ``{file}`` placeholder of
        ``template`` and is looked up in ``TEXT_SPLITS``
    :param template: Path template of the XML files
    :return: List of chunks, each a list of dictionaries with the keys
        ``token``, ``lemma``, ``pos`` and the features of ``expand_msd``;
        chunks holding fewer than two words are dropped. The list is empty
        when the file does not exist.
    :raises ValueError: If an ``msd`` attribute cannot be expanded
    """
    # NOTE(review): the default template is an absolute path on one machine;
    # pass `template` explicitly when running elsewhere.
    fp = template.format(file=urn)
    if not os.path.isfile(fp):
        return []

    data = []

    xml = ET.parse(fp)
    last_n = None
    for w in xml.xpath("//tei:w", namespaces={"tei": "http://www.tei-c.org/ns/1.0"}):
        n = w.attrib["n"].split(".")[:TEXT_SPLITS[urn]]

        # If the last level is not the same, we create a new "chunk"
        if last_n != n:
            data.append([])
            last_n = n

        # An element without text has `text` set to None: there is no token
        # to record, and indexing it would raise.
        if not w.text or w.text[0] == "{":
            continue
        try:
            data[-1].append({
                "token": w.text,
                "lemma": w.attrib["lemma"],
                "pos": w.attrib["pos"],
                **expand_msd(w.attrib["msd"])
            })
        except ValueError:
            # Show the raw attribute: expanding it again would only raise the
            # same error before anything gets printed.
            print(w.attrib["msd"])
            raise

        # Sentence-final punctuation closes the current chunk
        if w.text in {".", "?", "!", "..."}:
            data.append([])

    return [d for d in data if len(d) > 1]

def listdict_to_dictlist(listdict: List[Dict[str, str]]) -> Dict[str, List[str]]:
    """Turn a list of dictionaries into a dictionary of lists.

    The keys of the first dictionary define the keys of the result; each
    value lists, in order, what every dictionary holds for that key.

    :param listdict: Dictionaries sharing (at least) the keys of the first one
    :return: Mapping from key to the list of values; empty when ``listdict``
        is empty
    :raises KeyError: If a dictionary lacks a key of the first one
    """
    # Without a first element there are no keys to collect.
    if not listdict:
        return {}
    return {k: [dic[k] for dic in listdict] for k in listdict[0]}

# Create the training data for the word embeddings (WEMBS)

In [5]:
if WRITE:
    # One training file per task: each chunk becomes one line holding the
    # space-separated values of that task.
    # The output directory has to exist before the files can be opened.
    os.makedirs("data/embs_data", exist_ok=True)

    task_files = {}
    try:
        for key in ("Case", "Numb", "Gend", "Mood", "Tense", "Voice", "Person", "Deg", "token", "pos", "lemma"):
            task_files[key] = open(f"data/embs_data/{key}.txt", "w")

        for urn in TEXT_SPLITS:
            for chunk in get_chunks(urn):
                for task, content in listdict_to_dictlist(chunk).items():
                    task_files[task].write(" ".join(content)+"\n")
    finally:
        # Close every file that was opened, even if a text failed to parse.
        for handle in task_files.values():
            handle.close()


# Train the word embeddings (WEMBS) with Gensim

In [6]:
from gensim.models import Word2Vec, FastText

class MySentences(object):
    """Re-iterable view of a text file, yielding one token list per line.

    Every iteration reopens the file, so the same instance can be iterated
    several times.
    """

    def __init__(self, file):
        """Remember the path of the file to read.

        :param file: Path of a text file with whitespace-separated tokens
        """
        self.file = file

    def __iter__(self):
        """Yield each line of the file split on whitespace."""
        # The context manager closes the file once the iteration ends instead
        # of leaving the handle open.
        with open(self.file) as handle:
            for line in handle:
                yield line.split()

            
smalls = ("Case", "Numb", "Gend", "Mood", "Tense", "Voice", "Person", "Deg", "pos")
large = ("token", "lemma")


def _train_and_save(model_class, sentences, task, name, **hyperparameters):
    """Train ``model_class`` on ``sentences`` and export its vectors.

    The vectors are written without header line to
    ``data/embs_models/model.{task}.{name}.kv``.

    :param model_class: Gensim model class to instantiate
    :param sentences: Re-iterable corpus of token lists
    :param task: Task name, used in the output file name
    :param name: Middle part of the output file name
    :param hyperparameters: Extra keyword arguments for ``model_class``
    :return: The vectors (``.wv``) of the trained model
    """
    vectors = model_class(
        sentences=sentences,
        min_count=1, workers=12,
        **hyperparameters
    ).wv
    vectors.save_word2vec_format(f"data/embs_models/model.{task}.{name}.kv", write_header=False)
    return vectors


if GENSIM:
    # The output directory has to exist before the vectors can be saved.
    os.makedirs("data/embs_models", exist_ok=True)

    # The tasks are listed explicitly: `task_files` only exists when the
    # writing step ran in the same session, and training just needs the files
    # already on disk.
    for task in ("Case", "Numb", "Gend", "Mood", "Tense", "Voice", "Person", "Deg", "token", "pos", "lemma"):
        s = MySentences(f"data/embs_data/{task}.txt")
        print(task)
        if task in large:
            # Large vocabularies: 200-dimensional Word2Vec and FastText vectors
            model = _train_and_save(Word2Vec, s, task, "word2vec", vector_size=200, window=10)
            model = _train_and_save(FastText, s, task, "fasttext", vector_size=200, window=10)
        else:
            # Small tag sets: 10- and 3-dimensional Word2Vec vectors
            model = _train_and_save(Word2Vec, s, task, "word2vec", vector_size=10, window=10)
            # NOTE(review): the file name says "w10" but the window is 15;
            # confirm which one is intended.
            model = _train_and_save(Word2Vec, s, task, "size3.w10.word2vec", vector_size=3, window=15)

Case
Numb
Gend
Mood
Tense
Voice
Person
Deg
token
pos
lemma


In [7]:
# Show what the training loop left in `model` (the vectors it exported last).
# `model` is only assigned inside the `if GENSIM:` loop, so this cell needs
# that cell to have run with GENSIM enabled.
print(model)

<gensim.models.fasttext.FastTextKeyedVectors object at 0x7f0a25b19eb0>


In [1]:
from gensim.models import KeyedVectors

In [2]:
# Load the FastText (`ft`) and Word2Vec (`wv`) lemma vectors.
# NOTE(review): the training cell saves `model.lemma.*.kv` with
# write_header=False; the `.kv.header` files read here are not produced by any
# cell shown in this notebook — presumably a header line is added in a
# separate step. TODO: document or add that step.
ft = KeyedVectors.load_word2vec_format("data/embs_models/model.lemma.fasttext.kv.header")
wv = KeyedVectors.load_word2vec_format("data/embs_models/model.lemma.word2vec.kv.header")

In [3]:
# 20 entries most similar to "lasciuus" according to the FastText lemma vectors
ft.most_similar_cosmul(positive=["lasciuus"], topn=20)

[('Lasciuus', 0.9449629187583923),
 ('lasciuibundus', 0.9286326169967651),
 ('lasciuiosus', 0.9245330691337585),
 ('lasciue', 0.914734959602356),
 ('lasciuio', 0.9087250828742981),
 ('nesciuus', 0.9015768766403198),
 ('uaciuus', 0.899739146232605),
 ('nociuus', 0.8996517658233643),
 ('luxuus', 0.8978460431098938),
 ('mollitiuus', 0.897210419178009),
 ('miuus', 0.8940095901489258),
 ('iuus', 0.8930966258049011),
 ('puxus', 0.891620397567749),
 ('lasciuia', 0.8905083537101746),
 ('lixiuus', 0.8902315497398376),
 ('luxus', 0.8876140713691711),
 ('landus', 0.8870022296905518),
 ('lasciuis', 0.8867684602737427),
 ('seriuus', 0.8865979909896851),
 ('asiuus', 0.8855615854263306)]

In [4]:
# Same query against the Word2Vec lemma vectors (default number of results)
wv.most_similar_cosmul(positive=["lasciuus"])

[('lasciuio', 0.869053065776825),
 ('procax', 0.8673139810562134),
 ('libidinosus', 0.8551190495491028),
 ('lusus', 0.8514569401741028),
 ('blandus', 0.8478466868400574),
 ('garrulus', 0.8348168730735779),
 ('proteruus', 0.8346595168113708),
 ('salto', 0.8332166075706482),
 ('obscenus', 0.8296343684196472),
 ('petulans', 0.8249545693397522)]

In [12]:
# Load vectors stored outside this notebook's data directory.
# NOTE(review): absolute path on one machine, and no cell shown here writes
# this file — presumably the output of the attention-word-embedding project;
# confirm and make the path configurable.
att = KeyedVectors.load_word2vec_format("/home/thibault/dev/attention-word-embedding/models/vectors200-10.txt")

In [13]:
# Same query against the vectors loaded into `att`, for comparison
att.most_similar_cosmul(positive=["lasciuus"])

[('carmen1', 0.9400041103363037),
 ('Hispala', 0.9341803193092346),
 ('Astronus', 0.9333240985870361),
 ('Pacatianus', 0.9331989884376526),
 ('Insigne', 0.9320818781852722),
 ('Aegeus', 0.931224524974823),
 ('Rami', 0.9310905337333679),
 ('Bomin', 0.9305999279022217),
 ('hypodorius', 0.930131733417511),
 ('Clitiphus', 0.9299523234367371)]