Skip to content

Commit

Permalink
Merge pull request #68 from djbpitt/tei
Browse files Browse the repository at this point in the history
group TEI <rdg> elements independently of trailing whitespace
  • Loading branch information
rhdekker committed Aug 19, 2018
2 parents 9840044 + 075df1b commit 5d50fbf
Showing 1 changed file with 16 additions and 14 deletions.
30 changes: 16 additions & 14 deletions collatex-pythonport/collatex/core_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
@author: Ronald Haentjens Dekker
"""
import re
from xml.etree import ElementTree as etree
from xml.dom.minidom import Document
from collections import defaultdict
Expand Down Expand Up @@ -42,7 +41,8 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n

# assume collation is collation (by now); no error trapping
if not astar:
algorithm = EditGraphAligner(collation, near_match=False, detect_transpositions=detect_transpositions, debug_scores=debug_scores, properties_filter=properties_filter)
algorithm = EditGraphAligner(collation, near_match=False, detect_transpositions=detect_transpositions,
debug_scores=debug_scores, properties_filter=properties_filter)
else:
algorithm = ExperimentalAstarAligner(collation, near_match=False, debug_scores=debug_scores)

Expand Down Expand Up @@ -116,17 +116,22 @@ def export_alignment_table_as_xml(table):
readings.append(result)
return "<root>" + "".join(readings) + "</root>"


def export_alignment_table_as_tei(table, indent=None):
d = Document()
root = d.createElementNS("http://interedition.eu/collatex/ns/1.0", "cx:apparatus") # fake namespace declarations
root.setAttribute("xmlns:cx","http://interedition.eu/collatex/ns/1.0")
root.setAttribute("xmlns:cx", "http://interedition.eu/collatex/ns/1.0")
root.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
d.appendChild(root)
for column in table.columns:
value_dict = defaultdict(list)
ws_flag = False
for key, value in sorted(column.tokens_per_witness.items()):
# key is reading, value is list of witnesses
value_dict["".join(str(item.token_data["t"]) for item in value)].append(key)
# value_dict key is reading, value is list of witnesses
t_readings = "".join(item.token_data["t"] for item in value)
if ws_flag == False and t_readings.endswith((" ", r"\u0009", r"\000a")): # space, tab, lf
ws_flag = True
value_dict[t_readings.strip()].append(key)

# REVIEW [RHD]: Isn't there a method on table that can be used instead of this len(next(iter() etc?
# otherwise I think there should be. Not sure what len(next(iter(etc))) represents.
Expand All @@ -142,21 +147,18 @@ def export_alignment_table_as_tei(table, indent=None):
root.appendChild(text_node)
else:
# variation is either more than one reading, or one reading plus nulls
ws_flag = False # add space after <app> if any <rdg> ends in whitespace
app = d.createElementNS("http://www.tei-c.org/ns/1.0", "app")
root.appendChild(app)
for key,value in value_dict.items():
# key is reading, value is list of witnesses
for key, value in value_dict.items():
# key is reading (with trailing whitespace stripped), value is list of witnesses
rdg = d.createElementNS("http://www.tei-c.org/ns/1.0", "rdg")
rdg.setAttribute("wit", " ".join(["#" + item for item in value_dict[key]]))
if ws_flag == False and key.endswith((" ", r"\u0009", r"\u000a")): # space, tab, linefeed
ws_flag = True
text_node = d.createTextNode(key.strip())
text_node = d.createTextNode(key)
rdg.appendChild(text_node)
app.appendChild(rdg)
if ws_flag:
text_node = d.createTextNode(" ")
root.appendChild(text_node)
if ws_flag:
text_node = d.createTextNode(" ")
root.appendChild(text_node)
if indent:
result = d.toprettyxml()
else:
Expand Down

0 comments on commit 5d50fbf

Please sign in to comment.