Rôle du notebook : conversion des fichiers XML en un jeu de données TSV

## Functions to parse the XML files

In [20]:
import lxml.etree as ET
import tqdm
import glob
import os
import os
from typing import List, Tuple

from constants import CATS

def msd_to_tsv(attrib, cats=CATS):
    """Flatten a ``key=value|key=value`` feature string into TSV columns.

    ``attrib`` is split on ``|`` and each element on its first ``=``. One
    column is produced per entry of ``cats``, in the order of ``cats``; a
    category that does not appear in ``attrib`` yields ``"-"``. Features
    whose key is not listed in ``cats`` are dropped, and when a key appears
    several times the last occurrence wins.

    :param attrib: Feature string to parse.
    :param cats: Ordered category names to keep (defaults to ``CATS``).
    :return: The selected values joined with tab characters.
    """
    local_values = {}
    for elem in attrib.split("|"):
        # Split on the first "=" only: a value may itself contain "=", and
        # an element without any "=" carries no value and is ignored rather
        # than breaking the whole conversion.
        key, sep, value = elem.partition("=")
        if sep and key in cats:
            local_values[key] = value
    return "\t".join(local_values.get(cat, "-") for cat in cats)

def file_to_string(fp: str, label: str, cats=CATS) -> str:
    """Serialize the ``w`` elements of an XML file as one labelled TSV sample.

    The result starts with a ``#[TAG]<label>`` line, followed by one line per
    ``w`` element matched by the ``//w`` XPath. Each line holds the element
    text, its ``lemma`` and ``pos`` attributes, and its ``msd`` attribute
    expanded into one column per category by ``msd_to_tsv``, all separated
    by tabs.

    :param fp: Path of the XML file to read.
    :param label: Label written on the ``#[TAG]`` line.
    :param cats: Category names forwarded to ``msd_to_tsv``.
    :return: The sample as a single string, without a trailing newline.
    :raises KeyError: If a ``w`` element lacks ``lemma``, ``pos`` or ``msd``.
    """
    # Give the path to the parser instead of a text-mode file object: the
    # parser then reads the raw bytes and applies the encoding declared in
    # the document, independently of the platform's default text encoding.
    xml = ET.parse(fp)
    rows = [
        f"{token.text}\t"
        f"{token.attrib['lemma']}\t"
        f"{token.attrib['pos']}\t"
        f"{msd_to_tsv(token.attrib['msd'], cats=cats)}"
        for token in xml.xpath("//w")
    ]
    return f"#[TAG]{label}\n" + "\n".join(rows)

## Generate full dataset

In [21]:
# Convert every XML file of each class into one TSV sample string.
# glob returns paths in arbitrary, filesystem-dependent order, so they are
# sorted to make the content of the raw dataset files reproducible.
positive = [
    file_to_string(path, "positive")
    for path in sorted(glob.glob("/home/thibault/dev/these-corpus/data/*.xml"))
]

negative = [
    file_to_string(path, "negative")
    for path in sorted(glob.glob("negative-examples/*.xml"))
]


def write_dataset(dataset: List[str], filepath: str, cats: Tuple[str, ...] = CATS, delimiter="\t"):
    """Write samples to ``filepath``, preceded by a header line.

    The header is ``[header]``, ``token``, ``lemma`` and ``pos`` separated by
    tabs, followed by the names in ``cats`` joined with ``delimiter``. The
    samples follow, separated from each other by two blank lines. An existing
    file is overwritten.

    :param dataset: Sample strings, written as-is.
    :param filepath: Destination file.
    :param cats: Category names listed in the header (defaults to ``CATS``).
    :param delimiter: Separator used between the category names only; the
        first four header fields are always tab-separated.
    :return: Always ``True``.
    """
    # Pin the encoding: the platform default differs between systems and may
    # be unable to encode the non-ASCII characters found in the tokens.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(f"[header]\ttoken\tlemma\tpos\t{delimiter.join(cats)}\n")
        f.write("\n\n\n".join(dataset))
    return True
    
# Create the output directory if needed, then dump each class, unsplit,
# into its own file.
os.makedirs("./dataset/raw/", exist_ok=True)

for samples, destination in (
    (positive, "dataset/raw/positive.txt"),
    (negative, "dataset/raw/negative.txt"),
):
    write_dataset(samples, destination)

True

## Generate train, dev, test

In [25]:
import random
import math

# Shuffle each class in place, positive first and then negative.
for samples in (positive, negative):
    random.shuffle(samples)

# Fractions of each class assigned to the train and dev splits.
train_ratio = 0.2
dev_ratio = 0.1

def cut(dataset: List[str], r_train: float, r_dev: float) -> Tuple[List[str], List[str], List[str]]:
    """Split ``dataset`` into consecutive train, dev and test slices.

    The train slice holds the first ``ceil(len(dataset) * r_train)`` items,
    the dev slice the next ``ceil(len(dataset) * r_dev)`` items, and the test
    slice everything that remains. Because both sizes are rounded up, a small
    dataset may leave the dev or test slice empty. The input is not modified
    and is not shuffled here.

    :param dataset: Items to split, in the order they should be consumed.
    :param r_train: Fraction of the items assigned to the train slice.
    :param r_dev: Fraction of the items assigned to the dev slice.
    :return: ``(train, dev, test)`` slices of ``dataset``.
    """
    length = len(dataset)
    # Keep the slice boundaries in their own names instead of overwriting
    # the ratio parameters with indices.
    train_end = math.ceil(length * r_train)
    dev_end = train_end + math.ceil(length * r_dev)
    return dataset[:train_end], dataset[train_end:dev_end], dataset[dev_end:]


# Split each class with the same ratios so every split keeps both classes.
cut_positive, cut_negative = (
    cut(samples, r_train=train_ratio, r_dev=dev_ratio)
    for samples in (positive, negative)
)

# Merge the matching slices of both classes: train with train, dev with dev,
# test with test.
train, dev, test = (
    pos_part + neg_part
    for pos_part, neg_part in zip(cut_positive, cut_negative)
)

# Mix positive and negative samples inside each split.
for split in (train, dev, test):
    random.shuffle(split)

os.makedirs("./dataset/split/", exist_ok=True)
for split, destination in (
    (train, "./dataset/split/train.txt"),
    (dev, "./dataset/split/dev.txt"),
    (test, "./dataset/split/test.txt"),
):
    write_dataset(split, destination)

True