Rôle du notebook: conversion des fichiers XMLs en dataset TSV

In [21]:
import lxml.etree as ET

CATS = ("Case", "Numb", "Gend", "Mood", "Tense", "Voice", "Person", "Deg")

def msd_to_tsv(attrib, cats=CATS):
    local_values = dict([
        tuple(elem.split("="))
        for elem in attrib.split("|")
        if elem.split("=")[0] in cats
    ])
    return "\t".join([
        local_values.get(cat, "-")
        for cat in cats
    ])

def file_to_string(fp: str, label: str, cats=CATS) -> str:
    """
    """
    with open(fp) as f:
        xml = ET.parse(f)
    return f"#{label}\n"+"\n".join([
        f"{token.text}\t"
        f"{token.attrib['lemma']}\t"
        f"{token.attrib['pos']}\t"
        f"{msd_to_tsv(token.attrib['msd'], cats=cats)}"
        for token in xml.xpath("//w")
    ])


file_to_string("/home/thibault/dev/these-corpus/data/555.xml", "sexual")

'#sexual\nqui\tqui1\tPROrel\tNom\tSing\tMasc\t-\t-\t-\t-\t-\nciet\tcieo\tVER\t-\tSing\t-\tInd\tPres\tAct\t3\t-\ninritans\tirrito\tVER\tNom\tSing\tCom\tPar\tPres\tAct\t-\t-\nloca\tlocus\tNOMcom\tAcc\tPlur\t-\t-\t-\t-\t-\t-\nturgida\tturgidus\tADJqua\tAcc\tPlur\tNeut\t-\t-\t-\t-\tPos\nsemine\tsemen1\tNOMcom\tAbl\tSing\t-\t-\t-\t-\t-\t-\nmulto\tmultus\tADJqua\tAbl\tSing\tMascNeut\t-\t-\t-\t-\tPos\n,\t,\tPUNC\t-\t-\t-\t-\t-\t-\t-\t-'

In [27]:
from collections import Counter, defaultdict
from typing import Dict

from allennlp.data.fields import TextField, LabelField, SequenceLabelField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary


In [24]:
# To create fields, simply pass the data to constructor.
# NOTE: Don't worry about the token_indexers too much for now. We have a whole
# chapter on why TextFields are set up this way, and how they work.
tokens = [Token("The"), Token("best"), Token("movie"), Token("ever"), Token("!")]
token_indexers: Dict[str, TokenIndexer] = {"tokens": SingleIdTokenIndexer()}
text_field = TextField(tokens, token_indexers=token_indexers)

label_field = LabelField("pos")

sequence_label_field = SequenceLabelField(
    ["DET", "ADJ", "NOUN", "ADV", "PUNKT"], text_field
)

# You can use print() fields to see their content
print(text_field)
print(label_field)
print(sequence_label_field)

# Many of the fields implement native python methods in intuitive ways
print(len(sequence_label_field))
print(label for label in sequence_label_field)

# Fields know how to create empty fields of the same type
print(text_field.empty_field())
print(label_field.empty_field())
print(sequence_label_field.empty_field())


# You can count vocabulary items in fields
counter: Dict[str, Dict[str, int]] = defaultdict(Counter)
text_field.count_vocab_items(counter)
print(counter)

label_field.count_vocab_items(counter)
print(counter)

sequence_label_field.count_vocab_items(counter)
print(counter)

# Create Vocabulary for indexing fields
vocab = Vocabulary(counter)

# Fields know how to turn themselves into tensors
text_field.index(vocab)
# NOTE: in practice, we will batch together instances and use the maximum padding
# lengths, instead of getting them from a single instance.
# You can print this if you want to see what the padding_lengths dictionary looks
# like, but it can sometimes be a bit cryptic.
padding_lengths = text_field.get_padding_lengths()
print(text_field.as_tensor(padding_lengths))

label_field.index(vocab)
print(label_field.as_tensor(label_field.get_padding_lengths()))

sequence_label_field.index(vocab)
padding_lengths = sequence_label_field.get_padding_lengths()
print(sequence_label_field.as_tensor(padding_lengths))

# Fields know how to batch tensors
tensor1 = label_field.as_tensor(label_field.get_padding_lengths())

label_field2 = LabelField("pos")
label_field2.index(vocab)
tensor2 = label_field2.as_tensor(label_field2.get_padding_lengths())

batched_tensors = label_field.batch_tensors([tensor1, tensor2])
print(batched_tensors)

TextField of length 5 with text: 
 		[The, best, movie, ever, !]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
LabelField with label: pos in namespace: 'labels'.
SequenceLabelField of length 5 with labels:
 		['DET', 'ADJ', 'NOUN', 'ADV', 'PUNKT']
 		in namespace: 'labels'.
5
<generator object <genexpr> at 0x7fb979fd6bf8>
TextField of length 0 with text: 
 		[]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
LabelField with label: -1 in namespace: 'labels'.
SequenceLabelField of length 0 with labels:
 		[]
 		in namespace: 'labels'.
defaultdict(<class 'collections.Counter'>, {'tokens': Counter({'The': 1, 'best': 1, 'movie': 1, 'ever': 1, '!': 1})})
defaultdict(<class 'collections.Counter'>, {'tokens': Counter({'The': 1, 'best': 1, 'movie': 1, 'ever': 1, '!': 1}), 'labels': Counter({'pos': 1})})
defaultdict(<class 'collections.Counter'>, {'tokens': Counter({'The': 1, 'best': 1, 'movie': 1, 'ever': 1, '!': 1}), 'labels': Counter({'pos': 1, 'DET': 1, 'ADJ': 1, 'NOUN'

In [29]:
CATS
cats = CATS
token_indexers: Dict[str, TokenIndexer] = {
    "tokens": SingleIdTokenIndexer(namespace="token_vocab"),
    "lemmas": SingleIdTokenIndexer(namespace="lemma_vocab"),
    "token_characters": TokenCharactersIndexer(namespace="character_vocab"),
    **{
        task.lower(): SingleIdTokenIndexer(namespace=f"{task.lower()}_vocab")
        for task in cats
    }
}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(
    ["This", "is", "some", "text", "."], namespace="token_vocab"
)
vocab.add_tokens_to_namespace(
    ["T", "h", "i", "s", " ", "o", "m", "e", "t", "x", "."], namespace="character_vocab"
)

Besoin de forker un Tokenizer sachant que `, feature_name="tag_"` indique d'où vient la donnée


[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]