In [1]:
import datasets
from datasets import load_dataset

In [6]:
# (dataset_name, dataset_config) pairs; a pair is unpacked into the two
# positional arguments of load_dataset() when a dataset is loaded.
datasets_to_parse = [
    ("coref-data/preco_indiscrim", "default"),
    ("coref-data/litbank_indiscrim", "split_0"),
    ("coref-data/arrau_indiscrim", "default"),
    ("coref-data/phrase_detectives_indiscrim", "default"),
    ("coref-data/mmc_indiscrim", "mmc_en"),
]

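# Currently left out of datasets_to_parse:
# ("coref-data/gum_indiscrim", "ontogum"), # just constituency
# ("coref-data/gum_indiscrim", "original"), # just constituency

# Reference snippets of the annotation fields:
#   sentence level: "misc": { "parse_tree": ... }
#   token level:    { "deprel": "prep", "head": 2, "id": 3, "text": "of", "upos": "IN", "xpos": "IN" }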

In [7]:
# Work on the first (dataset_name, dataset_config) pair of the list.
dataset_name, dataset_config = datasets_to_parse[0]
dataset = load_dataset(path=dataset_name, name=dataset_config)

In [9]:
import stanza

# English pipeline producing POS tags, lemmas, dependency and constituency
# parses. The input is passed as already-tokenized sentences, hence
# tokenize_pretokenized=True.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma,depparse,constituency',
    tokenize_pretokenized=True,
)

2024-02-12 21:20:22 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-02-12 21:20:23 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |

2024-02-12 21:20:23 INFO: Using device: cpu
2024-02-12 21:20:23 INFO: Loading: tokenize
2024-02-12 21:20:23 INFO: Loading: mwt
2024-02-12 21:20:23 INFO: Loading: pos
2024-02-12 21:20:24 INFO: Loading: lemma
2024-02-12 21:20:24 INFO: Loading: constituency
2024-02-12 21:20:24 INFO: Loading: depparse
2024-02-12 21:20:24 INFO: Done loading processors!


In [28]:
def add_parse_to_example(example):
    """Annotate one document in place with Stanza parses and return it.

    The token texts of every sentence are sent, pre-tokenized, through the
    module-level ``nlp`` pipeline. Each sentence then gets
    ``misc = {"parse_tree": str(<constituency tree>)}`` (replacing any previous
    ``misc`` value), and each token dict is updated with the entries of the
    matching Stanza word's ``to_dict()``.

    Args:
        example: Mapping with a ``"sentences"`` list. Every sentence holds a
            ``"tokens"`` list of dicts that carry at least ``"text"`` and
            ``"id"``.

    Returns:
        The same ``example`` object (mutated in place). Returning it lets the
        function be used for its side effect or as a map-style callable.

    Raises:
        AssertionError: If the pipeline output does not line up with the
            input: a different number of sentences or words, or a word whose
            text/id differs from the corresponding token.
    """
    sentences = example["sentences"]
    if not sentences:
        # Nothing to annotate; do not run the pipeline on an empty document.
        return example

    words = [[t["text"] for t in s["tokens"]] for s in sentences]
    doc = nlp(words)

    # zip() stops at the shorter sequence, so a length mismatch would silently
    # leave trailing sentences/tokens without annotations. Check it explicitly.
    assert len(doc.sentences) == len(sentences), (
        f"pipeline returned {len(doc.sentences)} sentences for {len(sentences)} input sentences"
    )

    for stanza_sentence, sentence in zip(doc.sentences, sentences):
        tokens = sentence["tokens"]
        assert len(stanza_sentence.words) == len(tokens), (
            f"pipeline returned {len(stanza_sentence.words)} words for {len(tokens)} input tokens"
        )

        sentence["misc"] = {"parse_tree": str(stanza_sentence.constituency)}
        for stanza_word, token in zip(stanza_sentence.words, tokens):
            assert stanza_word.text == token["text"] and stanza_word.id == token["id"], (
                f"word/token mismatch: {stanza_word.text!r} (id={stanza_word.id}) "
                f"vs {token['text']!r} (id={token['id']})"
            )
            # Copy every field of the parsed word onto the token.
            token.update(stanza_word.to_dict())

    return example

In [27]:
# Select the validation split of the loaded dataset.
validation_split = dataset["validation"]

# Convert the split to a Python list so that each example can be annotated
# in place by add_parse_to_example.
examples = validation_split.to_list()
for example in examples:
    # Writes the parse annotations directly into `example`.
    add_parse_to_example(example)

{'id': 1, 'speaker': None, 'text': 'Dear Sir or Madam.', 'tokens': [{'id': 1, 'text': 'Dear', 'lemma': 'dear', 'upos': 'ADJ', 'xpos': 'JJ', 'feats': 'Degree=Pos', 'head': 2, 'deprel': 'amod', 'misc': '', 'start_char': 0, 'end_char': 4}, {'id': 2, 'text': 'Sir', 'lemma': 'Sir', 'upos': 'PROPN', 'xpos': 'NNP', 'feats': 'Number=Sing', 'head': 0, 'deprel': 'root', 'misc': '', 'start_char': 5, 'end_char': 8}, {'id': 3, 'text': 'or', 'lemma': 'or', 'upos': 'CCONJ', 'xpos': 'CC', 'head': 4, 'deprel': 'cc', 'misc': '', 'start_char': 9, 'end_char': 11}, {'id': 4, 'text': 'Madam', 'lemma': 'Madam', 'upos': 'PROPN', 'xpos': 'NNP', 'feats': 'Number=Sing', 'head': 2, 'deprel': 'conj', 'misc': '', 'start_char': 12, 'end_char': 17}, {'id': 5, 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'xpos': '.', 'head': 2, 'deprel': 'punct', 'misc': '', 'start_char': 18, 'end_char': 19}], 'misc': {'parse_tree': '(ROOT (NP (JJ Dear) (NNP Sir) (CC or) (NNP Madam) (. .)))'}}
