Rôle du notebook : conversion des fichiers XML en jeu de données TSV

In [1]:
import pandas as pd
from itertools import chain

# Build the per-work metadata table (citation levels, written type, centuries)
# from the dating spreadsheet and export it to metadata.csv.
citations = []

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv("/home/thibault/dev/these/data/raw/datation.tsv", delimiter="\t")
# Drop the rows flagged "X" in the "Ignore" column. The explicit copy makes the
# column assignments below act on an independent frame instead of a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
df = df[df["Ignore"] != "X"].copy()

# Collect every citation-level name: cells are comma-separated, and each part
# may itself hold several "|"-separated names.
for row in df["Name of citation level"]:
    if isinstance(row, str):
        citations.extend([x.split("|") for x in row.lower().split(",")])

names = sorted(set(chain.from_iterable(citations)))

# Normalisation table: spelling variants and non-English names -> canonical name.
final = {
    "chapteer": "chapter",
    "chapitre": "chapter",
    "scholia": "comment",
    "epistula": "letter",
    "carmen": "poem",
    "p": "paragraph",
    "paragra": "paragraph",
    "poeme": "poem"
}
# Every name ending in "s" is mapped to the same name without that final "s".
# NOTE(review): the result is not looked up again in `final`, so a plural of a
# variant (e.g. "chapitres") would map to the variant, not to the canonical name.
for name in names:
    if name.endswith("s"):
        final[name] = name[:-1]

# Replace each cell by the sorted list of its normalised names ([] when the cell is not a string).
df["Name of citation level"] = df["Name of citation level"].apply(
    lambda x: sorted(
        final.get(n, n) for part in x.lower().split(",") for n in part.split("|")
    ) if isinstance(x, str) else []
)
# A work is "versified" as soon as it is cited by line or by poem.
df["type"] = df["Name of citation level"].apply(
    lambda n: "versified" if "line" in n or "poem" in n else "prose"
)

# Century indices tested for each row; century `c` covers years c*100 .. c*100+99.
# The original notes next to this range mention -254.0 and 1107 (and df["Death"].max()),
# presumably the extreme dates found in the data — TODO confirm.
centuries = list(range(-3, 13))

df_cents = []
for birth, death in zip(df["Birth"], df["Death"]):
    # A death year that is an exact multiple of 100 is moved back one year so
    # it does not select the century starting on that year.
    if death % 100 == 0:
        death -= 1
    # Keep a century when the birth or the death falls inside it, or when its
    # first year lies between birth and death.
    df_cents.append([
        cent
        for cent in centuries
        if cent*100 <= birth <= cent*100+99
        or cent*100 <= death <= cent*100+99
        or birth <= cent*100 <= death
    ])
df["centuries"] = df_cents

df[["URN", "Name of citation level", "type", "centuries"]].to_csv("metadata.csv")


In [2]:
# Metadata lookup keyed by URN; each entry holds the remaining selected columns
# ("Name of citation level", "type", "centuries").
METADATAS = {}
for d in df[["URN", "Name of citation level", "type", "centuries"]].to_dict("index").values():
    METADATAS[d["URN"]] = {k: v for k, v in d.items() if k != "URN"}

# Register every URN made of three dot-separated segments under its first two
# segments as well. A snapshot of the items is iterated because the dictionary
# is modified inside the loop.
for key, value in list(METADATAS.items()):
    segments = key.split(".")
    if len(segments) == 3:
        METADATAS[".".join(segments[:2])] = value

## Parsing functions

In [19]:
import lxml.etree as ET
import tqdm
import glob
import os
import os
from typing import List, Tuple

from constants import CATS

def msd_to_tsv(attrib, cats=CATS):
    """Render a morphological description as tab-separated category values.

    Args:
        attrib: ``|``-separated list of ``key=value`` features.
        cats: ordered category names; one output column is produced per name.

    Returns:
        The values of ``cats`` in order, joined by tabs, with ``-`` for every
        category that ``attrib`` does not provide.
    """
    local_values = {}
    for elem in attrib.split("|"):
        # Split on the first "=" only: a value containing "=" stays intact, and
        # an element without "=" is skipped instead of breaking the mapping.
        key, sep, value = elem.partition("=")
        if sep and key in cats:
            local_values[key] = value
    return "\t".join(local_values.get(cat, "-") for cat in cats)

import regex as re

# Matches any string containing at least one character that is neither a word
# character nor one of the listed punctuation marks.
# Raw strings keep the regex escapes (\w, \., \(, \)) away from Python's own
# string-escape processing, which warns about such invalid escape sequences.
FORBIDDEN_TOKENS = re.compile(r".*[^\w\.,;?!\"':\(\)]+.*")
# Runs of apostrophes, typographic quotes and guillemets (replaced by '"' where used).
NORMALIZE = re.compile(r"['‘’“”«»]+")

def keep(string):
    """Tell whether a token should be written to the dataset.

    Quote-like characters are normalised first. A token containing a forbidden
    character is rejected (and reported on stdout) unless it is wrapped in
    curly braces, in which case it is kept.

    Args:
        string: token text; may be ``None`` or empty.

    Returns:
        ``True`` when the token is kept, ``False`` otherwise.
    """
    # A missing or empty text cannot be normalised or indexed (here or in
    # `norma`), so it is rejected up front instead of raising.
    if not string:
        return False
    string = NORMALIZE.sub('"', string)
    if FORBIDDEN_TOKENS.match(string):
        if string[0] == "{" and string[-1] == "}":
            return True
        print("Ignored %s" % string)
        return False
    return True

def norma(string):
    """Normalise a token before it is written.

    A token wrapped in curly braces is replaced by a fixed form chosen from its
    ending: ``-que``, ``-cum``, ``-ne``, ``-ve`` or ``est``. A braced token with
    none of the known endings is printed and then handled like any other token:
    its quote-like characters are replaced by ``"``.
    """
    if string[0] == "{" and string[-1] == "}":
        # Order matters: "que}" has to be tried before "ue}".
        endings_to_form = (
            (("que}",), "-que"),
            (("cum}",), "-cum"),
            (("ne}", "n}"), "-ne"),
            (("ue}", "ve}"), "-ve"),
            (("st}",), "est"),
        )
        for endings, form in endings_to_form:
            if string.endswith(endings):
                return form
        # Unknown braced token: report it, then fall through.
        print(string)
    return NORMALIZE.sub('"', string)

def make_metadata_token(key, val):
    """Format a metadata entry as ``key=val``."""
    return "{}={}".format(key, val)

def get_ana(xml):
    """Return the whitespace-separated values of the first ``ana`` attribute.

    Args:
        xml: object exposing ``xpath`` (the parsed document).

    Returns:
        The list of values found in the first ``@ana`` attribute matched by
        ``//@ana``, or an empty list when no such attribute exists.
    """
    # Evaluate the XPath once; str.split() without arguments already drops
    # empty and whitespace-only pieces.
    values = xml.xpath("//@ana")
    if values:
        return values[0].split()
    return []

def file_to_string(fp: str, label: str, cats=CATS) -> str:
    """Serialize one annotated XML file as a text sample.

    The sample starts with a ``#[TAG]`` line carrying ``label``, followed by
    metadata lines (``[GENERIC-METADATA]``, ``[TAGS-METADATA]``,
    ``[TOKEN-METADATA]``) and then one tab-separated row per kept ``<w>``
    element: normalised form, lemma, POS and the morphological categories.

    Args:
        fp: path of the XML file.
        label: value written on the ``#[TAG]`` line.
        cats: morphological categories forwarded to ``msd_to_tsv``.

    Returns:
        The serialized sample.

    Raises:
        KeyError: when neither the URN of the file nor its first two
            dot-separated segments have an entry in ``METADATAS``.
    """
    with open(fp) as f:
        xml = ET.parse(f)
    urn = xml.xpath("//idno[@type='CTS_URN']")[0].text
    # Fall back to the first two segments of the URN when the full URN is unknown.
    key = urn
    if urn not in METADATAS:
        key = ".".join(urn.split(".")[:2])
    if key not in METADATAS:
        # Without this guard the code below failed with an unrelated
        # UnboundLocalError; report the culprit and fail explicitly.
        print(key, fp)
        raise KeyError(f"No metadata for {key!r} (file: {fp})")

    work = METADATAS[key]
    metadata = {
        "urn": urn,
        "fp": fp,
        "tags": get_ana(xml),
        "metadata_token": [
            make_metadata_token("WrittenType", work['type']),
            *[make_metadata_token("Century", c) for c in work["centuries"]],
            *[make_metadata_token("CitationTypes", t) for t in work["Name of citation level"]]
        ]
    }

    generic_lines = "\n".join(f'[GENERIC-METADATA]{k}={metadata[k]}' for k in ("urn", "fp"))
    tag_lines = "\n".join(f'[TAGS-METADATA]TAG={tag}' for tag in metadata["tags"])
    token_metadata_lines = "\n".join("[TOKEN-METADATA]" + m for m in metadata["metadata_token"])
    # The text group is the part of the URN before its first dot.
    textgroup_line = f"[TOKEN-METADATA]{make_metadata_token('Textgroup', urn.split('.')[0])}"
    token_rows = "\n".join(
        f"{norma(token.text)}\t"
        f"{token.attrib['lemma']}\t"
        f"{token.attrib['pos']}\t"
        f"{msd_to_tsv(token.attrib['msd'], cats=cats)}"
        for token in xml.xpath("//w")
        if keep(token.text)
    )
    return (
        f"#[TAG]{label}\n"
        + generic_lines + "\n"
        + tag_lines + "\n"
        + token_metadata_lines
        + "\n" + textgroup_line + "\n"
        + token_rows
    )

def split_dataset(sample, ratio=0.8):
    """Cut ``sample`` in two at index ``int(ratio * len(sample))``.

    Returns:
        A pair ``(head, tail)``: the items before the cut and the items from
        the cut onwards.
    """
    boundary = int(ratio * len(sample))
    head = sample[:boundary]
    tail = sample[boundary:]
    return head, tail

## Generate the full dataset

In [9]:
# Serialize every XML file of each class into one text sample per file.
# NOTE(review): the positive corpus is read from an absolute, machine-specific path.
positive = [
    file_to_string(file, "positive")
    for file in glob.glob("/home/thibault/dev/these-corpus/data/*.xml")
]

negative = [
    file_to_string(file, "negative")
    for file in glob.glob("./dataset/negative-examples/*.xml")
]


def write_dataset(dataset: List[str], filepath: str, cats: Tuple[str, ...] = CATS, delimiter="\t"):
    """Write serialized samples to ``filepath`` under a single header line.

    Args:
        dataset: serialized samples; they are written separated by two blank lines.
        filepath: destination file, overwritten if it exists.
        cats: morphological category names appended to the header columns.
        delimiter: separator used between all header columns.

    Returns:
        ``True`` once the file has been written.
    """
    # Use `delimiter` for the whole header: previously the first columns were
    # always tab-separated while only the categories honoured `delimiter`.
    # With the default delimiter the header is unchanged.
    header = delimiter.join(["[header]", "token", "lemma", "pos", *cats])
    # Explicit encoding so the output does not depend on the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(header + "\n")
        f.write("\n\n\n".join(dataset))
    return True
    
# Make sure the output directory exists before writing into it.
os.makedirs("./dataset/raw/", exist_ok=True)

# One file per class, holding every serialized sample of that class.
write_dataset(positive, "dataset/raw/positive.txt")
write_dataset(negative, "dataset/raw/negative.txt")

Ignored [
Ignored ]
Ignored —
Ignored |
Ignored [
Ignored ]
Ignored [
Ignored ]
Ignored -
Ignored —
Ignored [
Ignored ]
Ignored /
Ignored —
Ignored *
Ignored *
Ignored *
Ignored -
Ignored *
Ignored [
Ignored ]
Ignored —
Ignored —
Ignored —
Ignored —
Ignored —
Ignored —
Ignored †
Ignored †
Ignored —
Ignored -
Ignored [
Ignored ]
Ignored =
Ignored =
Ignored [
Ignored ]
Ignored *
Ignored [
Ignored ]
Ignored —
Ignored *
Ignored *
Ignored †
Ignored -
Ignored -
Ignored -
Ignored —
Ignored [
Ignored ]
Ignored —
Ignored —
Ignored —
Ignored —
Ignored †
Ignored —
Ignored —
Ignored —
{sis}
Ignored —
Ignored —
Ignored [
Ignored ]
Ignored *
Ignored —
Ignored —
Ignored †
{sis}
Ignored *
Ignored *
{sis}
{semetipsum}
Ignored —
Ignored —
{sis}
Ignored —
Ignored —
Ignored =
Ignored -
Ignored —
Ignored —
Ignored /
Ignored -
Ignored -
Ignored †
Ignored †
Ignored —
Ignored —
Ignored —
Ignored \
Ignored \
Ignored [
Ignored ]
Ignored -
Ignored —
Ignored —
Ignored -
Ignored —
Ignored —
Ignored [
Ignored ]
Ign

True

## Generate dataset

### Random train, dev, test

In [12]:
import random
import math

# Shuffle both classes in place before cutting them.
# NOTE(review): no random seed is set in the visible cells, so every run
# produces a different split — confirm this is intended.
random.shuffle(positive)
random.shuffle(negative)

# Shares of each class given to train and dev; what is left goes to test.
train_ratio, dev_ratio = 0.8, 0.1

def cut(dataset: list, r_train: float, r_dev: float) -> tuple:
    """Split ``dataset`` into consecutive train, dev and test parts.

    Args:
        dataset: sequence to split (sliced, not modified).
        r_train: share of the items given to the train part.
        r_dev: share of the items given to the dev part.

    Returns:
        ``(train, dev, test)``. The train and dev sizes are rounded up, and
        test receives whatever is left, possibly nothing.
    """
    length = len(dataset)
    # Slice boundaries, kept separate from the ratio parameters.
    train_end = math.ceil(length * r_train)
    dev_end = train_end + math.ceil(length * r_dev)
    return dataset[:train_end], dataset[train_end:dev_end], dataset[dev_end:]


# Cut each class separately so both are split with the same ratios.
cut_positive = cut(positive, r_train=train_ratio, r_dev=dev_ratio)
cut_negative = cut(negative, r_train=train_ratio, r_dev=dev_ratio)

# Concatenate the matching parts (train, dev, test) of the two classes.
train, dev, test = [pos+neg for pos, neg in zip(cut_positive, cut_negative)]

# Mix positives and negatives inside each split.
random.shuffle(train)
random.shuffle(dev)
random.shuffle(test)

# Make sure the output directory exists, then write one file per split.
os.makedirs("./dataset/split/", exist_ok=True)
write_dataset(train, "./dataset/split/train.txt")
write_dataset(dev, "./dataset/split/dev.txt")
write_dataset(test, "./dataset/split/test.txt")

True

### Metaphor dataset

In [20]:
# Partition the positive samples on whether their text contains "#metaphor":
# True -> samples containing it, False -> the others.
positives_meta = {True: [], False: []}
for elem in positive:
    bucket = positives_meta["#metaphor" in elem]
    bucket.append(elem)

# Report the size of each group.
for key, members in positives_meta.items():
    print(f"{key}: {len(members)}")
    

True: 459
False: 2057


In [22]:
# Positives containing "#metaphor" are split 90/10 into train and dev; all the
# other positives form the positive part of the test set.
m_train, m_dev = split_dataset(positives_meta[True], ratio=.9)
m_test = positives_meta[False]

# Negatives: 70% go to train+dev (itself split 90/10), the remaining 30% to test.
neg_train_dev, neg_test = split_dataset(negative, ratio=.7)
neg_train, neg_dev = split_dataset(neg_train_dev, ratio=0.9)

# Create the output directory first, as is done for the raw and split datasets;
# without it the writes fail when the directory does not exist yet.
os.makedirs("./dataset/metaphors/", exist_ok=True)
write_dataset(m_train+neg_train, "./dataset/metaphors/train.txt")
write_dataset(m_dev+neg_dev, "./dataset/metaphors/dev.txt")
write_dataset(m_test+neg_test, "./dataset/metaphors/test.txt")

True

### Inverted metaphor dataset

In [27]:
# Reverse of the metaphor dataset: positives without "#metaphor" are split
# 70/30 into train and dev; the positives containing it form the positive part
# of the test set.
m_train, m_dev = split_dataset(positives_meta[False], ratio=.7)
m_test = positives_meta[True]

# Negatives: 70% go to train+dev (itself split 90/10), the remaining 30% to test.
neg_train_dev, neg_test = split_dataset(negative, ratio=.7)
neg_train, neg_dev = split_dataset(neg_train_dev, ratio=0.9)

# Create the output directory first, as is done for the raw and split datasets;
# without it the writes fail when the directory does not exist yet.
os.makedirs("./dataset/inversed-metaphors/", exist_ok=True)
write_dataset(m_train+neg_train, "./dataset/inversed-metaphors/train.txt")
write_dataset(m_dev+neg_dev, "./dataset/inversed-metaphors/dev.txt")
write_dataset(m_test+neg_test, "./dataset/inversed-metaphors/test.txt")

True

### Terms dataset

In [28]:
# Partition the positive samples on whether their text contains one of the
# listed terms. This rebinds `positives_meta`, previously the "#metaphor" partition.
positives_meta = {
    True: [],
    False: []
}
for elem in positive:
    # Plain substring search over the whole serialized sample.
    # NOTE(review): "culus" also matches longer words ending in it (e.g.
    # "oculus") — confirm that this is wanted.
    has_term = any(term in elem for term in ("futuo", "mentul", "pedico", "cunnus", "culus"))
    positives_meta[has_term].append(elem)

for key in positives_meta:
    print(f"{key}: {len(positives_meta[key])}")


# Positives without any of the terms are split 70/30 into train and dev; the
# positives containing one form the positive part of the test set.
m_train, m_dev = split_dataset(positives_meta[False], ratio=.7)
m_test = positives_meta[True]

# Negatives: 70% go to train+dev (itself split 90/10), the remaining 30% to test.
neg_train_dev, neg_test = split_dataset(negative, ratio=.7)
neg_train, neg_dev = split_dataset(neg_train_dev, ratio=0.9)


# Create the output directory first, as is done for the raw and split datasets;
# without it the writes fail when the directory does not exist yet.
os.makedirs("./dataset/terms/", exist_ok=True)
write_dataset(m_train+neg_train, "./dataset/terms/train.txt")
write_dataset(m_dev+neg_dev, "./dataset/terms/dev.txt")
write_dataset(m_test+neg_test, "./dataset/terms/test.txt")

True: 998
False: 1518


True

### Stats about the partial corpus

In [31]:
# For each text file of the partial corpus, count the samples of each class by
# counting the occurrences of their tag marker.
for file in glob.glob("dataset/main-partial/*.txt"):
    with open(file) as f:
        t = f.read()
    positives = t.count('TAG]positive')
    negatives = t.count('TAG]negative')
    print(f"{file}\n\tPositives: {positives}\n\tNegatives: {negatives}")

dataset/main-partial/siamese2.txt
	Positives: 0
	Negatives: 1
dataset/main-partial/test.txt
	Positives: 251
	Negatives: 426
dataset/main-partial/test-full.txt
	Positives: 1783
	Negatives: 3057
dataset/main-partial/siamese.txt
	Positives: 1
	Negatives: 1
dataset/main-partial/dev.txt
	Positives: 107
	Negatives: 171
dataset/main-partial/train.txt
	Positives: 393
	Negatives: 660
