In [1]:
from tf.app import use
from tf.core.files import dirMake

# Load TF data

To run this experiment, first clone the BHSA repository:

```
mkdir -p ~/github/ETCBC
cd ~/github/ETCBC
git clone https://github.com/ETCBC/bhsa
```

This notebook is then in file

```
~/github/ETCBC/bhsa/programs/stam.ipynb
```

In [4]:
# Load the BHSA corpus from the local clone without naming a version, and
# hoist the Text-Fabric API names into this notebook's global namespace.
# NOTE(review): the next cell loads the corpus again with version="2021x",
# so this load looks superseded — confirm whether it is still needed.
A = use(
    "ETCBC/bhsa:clone",
    checkout="clone",
    hoist=globals(),
)

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [2]:
# Load version 2021x of the BHSA corpus from the local clone and hoist the
# Text-Fabric API names into this notebook's global namespace; later cells
# use the hoisted F and E objects.
A = use(
    "ETCBC/bhsa:clone",
    checkout="clone",
    version="2021x",
    hoist=globals(),
)

**Locating corpus resources ...**

  0.56s Dataset without sections in otext:no section functions in the T-API


Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


App config error(s) in clause:
	label: feature typ not loaded
	label: feature rela not loaded
App config error(s) in clause_atom:
	label: feature code not loaded
App config error(s) in half_verse:
	label: feature label not loaded
App config error(s) in lex:
	template: feature voc_lex_utf8 not loaded
	label: feature voc_lex_utf8 not loaded
	featuresBare: feature gloss not loaded
App config error(s) in phrase:
	label: feature typ not loaded
	label: feature function not loaded
App config error(s) in phrase_atom:
	label: feature typ not loaded
	label: feature rela not loaded
App config error(s) in sentence:
	label: feature number not loaded
App config error(s) in sentence_atom:
	label: feature number not loaded
App config error(s) in subphrase:
	label: feature number not loaded
App config error(s) in word:
	featuresBare: feature gloss not loaded
	features: feature pdp not loaded
	features: feature vs not loaded
	features: feature vt not loaded


# Produce full text with offsets of the words.

In [3]:
# Build the full text by concatenating, for every word node, its word form and
# its trailer. Along the way, record per word the (start, end) character
# offsets of that material within the full text.
# NOTE(review): the word form comes from `g_word_utf8` but the trailer from
# `trailer` (no `_utf8` suffix) — confirm that this pairing is intended.
pos = {}
curpos = 0
chunks = []

for w in F.otype.s("word"):
    wordText = F.g_word_utf8.v(w) + F.trailer.v(w)
    start = curpos
    curpos = start + len(wordText)
    pos[w] = (start, curpos)
    chunks.append(wordText)

text = "".join(chunks)


# Export to STAM

In [4]:
import stam

# Short aliases for the STAM offset and selector constructors used in the
# export cells below.
selectorClass = stam.Selector

stamOffset = stam.Offset.simple
stamTextSel = selectorClass.textselector
stamAnnoSel = selectorClass.annotationselector
stamCompSel = selectorClass.compositeselector
stamMultiSel = selectorClass.multiselector

## General

In [5]:
# Ids used for the annotation set and for the text resource in each STAM store.
setId, textId = "features", "hebrew_unicode"

# Shortcuts into the Text-Fabric API: the slot-lookup method of the oslots
# edge feature, and the name of the slot type.
eoslots = E.oslots.s
slotType = F.otype.slotType
otype = slotType

## Higher order

*  slots are translated to annotations with data `type = "word"` and target the text segment of that word;
*  non-slot nodes `n` are translated to annotations with
   *  data `type = F.otype.v(n)`
   *  target: composite selector of the annotations of the slots that `n` is linked to


In [6]:
# Build the "higher order" STAM store:
# - every slot becomes an annotation that targets its stretch of the text;
# - every non-slot node becomes an annotation whose target is a composite
#   selector over the annotations of the slots that the node is linked to.
storeId = "ETCBC/bhsa-nodes-ho"
print(f"store (corpus {storeId}) ...")

store = stam.AnnotationStore(id=storeId)
storeAnnotate = store.annotate
storeAnno = store.annotation

dataset = store.add_annotationset(setId)

print(f"text (format {textId}) ...")
textResource = store.add_resource(id=textId, text=text)

otypeKey = dataset.add_key("otype")

# Maps each TF node to the id of the annotation that represents it, so that
# annotations for non-slot nodes can refer to the annotations of their slots.
annoIdFromNode = {}

print(f"nodes (type {slotType}) ...")

otype = slotType
# The annotation data is identical for all slots: build it once, not per word.
typeData = dict(key=otypeKey, value=otype, set=dataset)

for w in F.otype.s(otype):
    anno = storeAnnotate(
        target=stamTextSel(textResource, stamOffset(*pos[w])), data=typeData
    )
    annoIdFromNode[w] = anno.id()

for otype in F.otype.all:
    # Slots have been handled above; they must exist before any other node type
    # because the other annotations point at the slot annotations.
    if otype == slotType:
        continue
    print(f"nodes (type {otype}) ...")
    typeData = dict(key=otypeKey, value=otype, set=dataset)
    for n in F.otype.s(otype):
        slotsSel = stamCompSel(
            *[stamAnnoSel(storeAnno(annoIdFromNode[slot])) for slot in eoslots(n)]
        )
        anno = storeAnnotate(target=slotsSel, data=typeData)
        annoIdFromNode[n] = anno.id()

store (corpus ETCBC/bhsa-nodes-ho) ...
text (format hebrew_unicode) ...
nodes (type word) ...
nodes (type book) ...
nodes (type chapter) ...
nodes (type lex) ...
nodes (type verse) ...
nodes (type half_verse) ...
nodes (type sentence) ...
nodes (type sentence_atom) ...
nodes (type clause) ...
nodes (type clause_atom) ...
nodes (type phrase) ...
nodes (type phrase_atom) ...
nodes (type subphrase) ...


# Serializing

Let's serialize the STAM dataset to disk, in JSON and CSV.

In [7]:
# Work directory for the higher-order STAM files, located under the app's
# `tempDir`; `dirMake` presumably creates it when it does not exist yet.
workDir = f"{A.tempDir}/stam/ho"
dirMake(workDir)

## JSON

In [8]:
# Write the higher-order store to a file with the .json extension in the
# work directory.
jsonPath = f"{workDir}/bhsa-nodes-ho.json"
store.set_filename(jsonPath)
store.save()

1.17 GB

TF equivalent: only `otype` and `oslots`: 13.5MB

## CSV

In [9]:
# Write the higher-order store again, now under a file name with the .csv
# extension in the work directory.
csvPath = f"{workDir}/bhsa-nodes-ho.csv"
store.set_filename(csvPath)
store.save()

346MB

## Text oriented

*  slots are translated to annotations with
   *  data `type = "word"` and
   *  target the multi selector of the text segments that correspond with all words;
*  non-slot nodes `n` are translated to annotations with
   *  data `type = F.otype.v(n)`
   *  target: the multi selector of the composite selector of the text segments of the slots that `n` is linked to

In [10]:
# Build the "text oriented" STAM store: one annotation per node type, whose
# target is a multi selector over all nodes of that type.
# NOTE: for non-slot types the multi selector contains composite selectors.
# The load cells further down show that STAM refuses to read such a store
# ("Complex selectors may not be nested").
storeId = "ETCBC/bhsa-nodes-to"
print(f"store (corpus {storeId}) ...")

store = stam.AnnotationStore(id=storeId)
storeAnnotate = store.annotate
storeAnno = store.annotation

dataset = store.add_annotationset(setId)

print(f"text (format {textId}) ...")
textResource = store.add_resource(id=textId, text=text)

otypeKey = dataset.add_key("otype")


def wordSel(w):
    """Return a text selector for the stretch of text recorded for word `w`."""
    return stamTextSel(textResource, stamOffset(*pos[w]))


for otype in F.otype.all:
    print(f"nodes (type {otype}) ...")
    typeData = dict(key=otypeKey, value=otype, set=dataset)
    if otype == slotType:
        # Slots: one text selector per word.
        members = [wordSel(w) for w in F.otype.s(otype)]
    else:
        # Other nodes: one composite selector per node, over its slots.
        members = [
            stamCompSel(*[wordSel(w) for w in eoslots(n)])
            for n in F.otype.s(otype)
        ]
    target = stamMultiSel(*members)
    anno = storeAnnotate(target=target, data=typeData)

store (corpus ETCBC/bhsa-nodes-to) ...
text (format hebrew_unicode) ...
nodes (type book) ...
nodes (type chapter) ...
nodes (type lex) ...
nodes (type verse) ...
nodes (type half_verse) ...
nodes (type sentence) ...
nodes (type sentence_atom) ...
nodes (type clause) ...
nodes (type clause_atom) ...
nodes (type phrase) ...
nodes (type phrase_atom) ...
nodes (type subphrase) ...
nodes (type word) ...


# Serializing

Let's serialize the STAM dataset to disk, in JSON and CSV.

In [11]:
# Work directory for the text-oriented STAM files, located under the app's
# `tempDir`; `dirMake` presumably creates it when it does not exist yet.
workDir = f"{A.tempDir}/stam/to"
dirMake(workDir)

## JSON

In [12]:
# Write the text-oriented store to a file with the .json extension in the
# work directory.
jsonPath = f"{workDir}/bhsa-nodes-to.json"
store.set_filename(jsonPath)
store.save()

2.61 GB

TF equivalent: only `otype` and `oslots`: 13.5MB

## CSV

In [13]:
# Write the text-oriented store again, now under a file name with the .csv
# extension in the work directory.
csvPath = f"{workDir}/bhsa-nodes-to.csv"
store.set_filename(csvPath)
store.save()

116MB

## Loading from disk

We load the BHSA from disk, both from JSON and from CSV.

## Higher Order

### JSON


When I did this, I restarted the kernel first.

In [1]:
import os
import stam

# Load the higher-order store from its JSON file.
# NOTE(review): this path is spelled out by hand; presumably it is the same
# directory as the `A.tempDir`-based work directory used when saving — confirm.
workDir = os.path.expanduser("~/github/ETCBC/bhsa/_temp/stam/ho")
jsonPath = f"{workDir}/bhsa-nodes-ho.json"
storeH = stam.AnnotationStore(file=jsonPath)

### CSV


When I did this, I restarted the kernel first.

In [1]:
import os
import stam

# Load the higher-order store from its CSV serialization.
# NOTE(review): the save cell set the file name to bhsa-nodes-ho.csv, whereas
# this reads bhsa-nodes-ho.store.stam.csv — presumably the name STAM gives to
# the store file of a CSV serialization; confirm.
workDir = os.path.expanduser("~/github/ETCBC/bhsa/_temp/stam/ho")
csvPath = f"{workDir}/bhsa-nodes-ho.store.stam.csv"
storeH = stam.AnnotationStore(file=csvPath)

## Text-oriented

### JSON


When I did this, I restarted the kernel first.

In [1]:
import os
import stam

# Try to load the text-oriented store from its JSON file.
# This raises PyStamError (WrongSelectorType), as the output of this cell
# shows: complex selectors may not be nested.
workDir = os.path.expanduser("~/github/ETCBC/bhsa/_temp/stam/to")
jsonPath = f"{workDir}/bhsa-nodes-to.json"
storeT = stam.AnnotationStore(file=jsonPath)

PyStamError: [StamError] WrongSelectorType: Selector is not of the right type here (Complex selectors may not be nested)

### CSV


When I did this, I restarted the kernel first.

In [2]:
import os
import stam

# Try to load the text-oriented store from its CSV serialization.
# This raises PyStamError (CsvError), as the output of this cell shows: a
# composite selector cannot be a subselector under a complex selector.
workDir = os.path.expanduser("~/github/ETCBC/bhsa/_temp/stam/to")
csvPath = f"{workDir}/bhsa-nodes-to.store.stam.csv"
storeT = stam.AnnotationStore(file=csvPath)

PyStamError: [StamError] CsvError: Parsing failed: SelectorType can't be a subselector under a complex selector: CompositeSelector ()

The text-oriented approach is not (yet) supported by STAM because complex selectors may not be nested. 

# Statistics

contender | load time (sec) | save time (sec) | mem usage (MB) | disk usage (MB)
--- | --- | --- | --- | ---
`STAM-ho` | | | 1840 |
`STAM-ho` JSON | 18.4 | 2.3 | | 1200
`STAM-ho` CSV | 9.0 | 1.3 | | 346
`STAM-to` | | | X |
`STAM-to` JSON | X | 4.9 | | 2600
`STAM-to` CSV | X | 1.1 | | 117
`TF` | | | 940 |
`TF text` | 50 | | | 14
`TF opt` | 2.2 | | | 51

