# Extract NEs from TEIs

In [1]:
from spacytei.tei import TeiReader

In [2]:
file = 'data/goldstandard_kurz.xml'# path to you file

In [3]:
teidoc = TeiReader(file)

### map your tei encoding to NE-tags

In [4]:
NER_TAG_MAP = {
    "persRef": "PER",
    "placeRef": "LOC",
    "date": 'DATE'
}

### define the tags you used for NEs via xpath
* be aware that those xpaths are relativ to a parent node (defaults to tei:p)

In [5]:
ne_xpath = './/tei:persRef | .//tei:placeRef | .//tei:date'

In [6]:
ner_samples = teidoc.extract_ne_offsets(ne_xpath=ne_xpath, NER_TAG_MAP=NER_TAG_MAP)

In [7]:
ner_samples[:5]

[('Wieder zurückerhalten aus der bibliothek des herren kreishauptmanns, freyherrn von Stiebar zu Kröllendorf - in Wien den 14. Novembris 1823. Iohann Leopold Metzger freyherr von Metzburg, k. k hofrath',
  {'entities': [(69, 105, 'PER'),
    (94, 105, 'LOC'),
    (111, 115, 'LOC'),
    (120, 138, 'DATE'),
    (140, 184, 'PER')]}),
 ('Niederösterreichischer regierungs-, auch bankodeputazions rath, nieder- und oberösterreichischer landtstand 1718.',
  {'entities': [(108, 112, 'DATE')]}),
 ('D IV 1', {'entities': []}),
 ('Iohann Georg Mezger von Breysgaw',
  {'entities': [(0, 32, 'PER'), (24, 32, 'LOC')]}),
 ('Scydack: ein kocher luck: ein bogen Tschenzi: ein seiten, Meiran: handtheben miten am bogen. unleserliche Kommentare, die im Nachhinnein eingefügt wurden.',
  {'entities': []})]

## Extract NEs from TEIs (with sent-splitting)
* The samples above are by paragraph. In case of long(er) paragraphs, you could crate NE samples splitted by sents
* Sent splitting is done by a spacy model (default 'de_core_news_sm'), so make sure you have spacy and the model you'd like to use properly installed. 

In [8]:
ner_samples = teidoc.ne_offsets_by_sent(ne_xpath=ne_xpath, NER_TAG_MAP=NER_TAG_MAP)

In case the cell above threw an error, complaining about not finding a model, try to
* install the german model `!python -m spacy download de`
* and pass in the model name to `teidoc.ne_offsets_by_sent(ne_xpath=ne_xpath, NER_TAG_MAP=NER_TAG_MAP, model='de')`

In [9]:
# !python -m spacy download de

In [10]:
# ner_samples = teidoc.ne_offsets_by_sent(ne_xpath=ne_xpath, NER_TAG_MAP=NER_TAG_MAP, model='de')

In [24]:
ner_samples[:5]

[('Wieder zurückerhalten aus der bibliothek des herren kreishauptmanns, freyherrn von Stiebar zu Kröllendorf - in Wien den 14. Novembris',
  {'entities': [(69, 105, 'PER'), (94, 105, 'LOC'), (111, 115, 'LOC')]}),
 ('1823. Iohann Leopold Metzger freyherr von Metzburg, k. k hofrath',
  {'entities': [(6, 50, 'PER')]}),
 ('Niederösterreichischer regierungs-, auch bankodeputazions rath, nieder- und oberösterreichischer landtstand',
  {'entities': []}),
 ('1718.', {'entities': [(0, 4, 'DATE')]}),
 ('D IV 1', {'entities': []})]

## Extract NEs from TEIs in bulk
and save results to file

In [12]:
import os
import glob

In [13]:
from spacytei.tei_process import teis_to_traindata, teis_to_traindata_sents

In [14]:
tei_dir = './data/Dipko/XML_TEI' # define path to directory containing TEI's

In [15]:
files = glob.glob("{}/*.xml".format(tei_dir)) # store list of relative file names of TEI's

In [16]:
samples = teis_to_traindata(files, parent_node='.//tei:body//tei:p', ne_xpath=ne_xpath, NER_TAG_MAP=NER_TAG_MAP)

In [17]:
import pandas as pd

In [18]:
df = pd.DataFrame(samples, columns=['text', 'entities'])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412 entries, 0 to 411
Data columns (total 2 columns):
text        412 non-null object
entities    412 non-null object
dtypes: object(2)
memory usage: 6.5+ KB


In [20]:
df.to_csv('./out/samples_out.csv', index=False)

In [21]:
samples = teis_to_traindata_sents(files, parent_node='.//tei:body//tei:p', ne_xpath=ne_xpath, NER_TAG_MAP=NER_TAG_MAP)

In [22]:
df = pd.DataFrame(samples)
df.info()
df.to_csv('./out/samples_out_sents.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2067 entries, 0 to 2066
Data columns (total 2 columns):
0    2067 non-null object
1    2067 non-null object
dtypes: object(2)
memory usage: 32.4+ KB
