-
Notifications
You must be signed in to change notification settings - Fork 36
/
core_functions.py
166 lines (153 loc) · 7.7 KB
/
core_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Created on May 3, 2014
@author: Ronald Haentjens Dekker
"""
from xml.etree import ElementTree as etree
from xml.dom.minidom import Document
from collections import defaultdict
from collatex.core_classes import Collation, VariantGraph, join, AlignmentTable, VariantGraphRanking
from collatex.exceptions import SegmentationError
from collatex.experimental_astar_aligner import ExperimentalAstarAligner
import json
from collatex.edit_graph_aligner import EditGraphAligner
from collatex.display_module import display_alignment_table_as_html, visualize_table_vertically_with_colors
from collatex.display_module import display_variant_graph_as_svg
from collatex.display_module import display_alignment_table_as_csv
from collatex.near_matching import perform_near_match
# Valid options for output are:
# "table" for the alignment table (default)
# "graph" for the variant graph
# "json" for the alignment table exported as JSON
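# "csv", "tsv" for CSV and TSV output
# "svg", "svg_simple" for the variant graph rendered as SVG
# "html", "html2" for the alignment table rendered as HTML ("html2" is the vertical, colored layout)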
# "xml" for the alignment table as pseudo-TEI XML
# All columns are output as <app> elements, regardless of whether they have variation
# Each witness is in a separate <rdg> element with the siglum in a @wit attribute
# (i.e., witnesses with identical readings are nonetheless in separate <rdg> elements)
# "tei" for the alignment table as TEI XML parallel segmentation (but in no namespace)
# Wrapper element is always <cx:apparatus> in the CollateX namespace
# indent=True pretty-prints the output
# (for proofreading convenience only; does not observe proper white-space behavior)
def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False,
            detect_transpositions=False, debug_scores=False, properties_filter=None, indent=False):
    """Align the witnesses of a collation and return the result in the requested form.

    Args:
        collation: a Collation, or a dict whose "witnesses" entries are each
            handed to Collation.add_witness to build one.
        output: "table" (default) returns the AlignmentTable and "graph" the
            VariantGraph; "svg", "svg_simple", "json", "html", "html2", "xml",
            "tei", "csv" and "tsv" return whatever the matching
            export/display helper returns.
        layout: forwarded to AlignmentTable.
        segmentation: when true, join() is applied to the graph and the
            ranking is recomputed.
        near_match: when true, the ranking is replaced by the result of
            perform_near_match; requires segmentation to be false.
        astar: when true, ExperimentalAstarAligner is used instead of
            EditGraphAligner.
        detect_transpositions: forwarded to EditGraphAligner only.
        debug_scores: forwarded to whichever aligner is used.
        properties_filter: forwarded to EditGraphAligner only.
        indent: forwarded to the "tei" export only.

    Raises:
        SegmentationError: if near_match and segmentation are both true.
        Exception: if output is not one of the recognized values.
    """
    # A plain dict is treated as JSON input and turned into a real Collation;
    # anything else is assumed to be a Collation already (no error trapping).
    if isinstance(collation, dict):
        built = Collation()
        for witness_spec in collation["witnesses"]:
            built.add_witness(witness_spec)
        collation = built

    # The aligners always get near_match=False: near matching is applied
    # afterwards, on the ranking of the finished graph.
    if astar:
        aligner = ExperimentalAstarAligner(collation, near_match=False, debug_scores=debug_scores)
    else:
        aligner = EditGraphAligner(collation, near_match=False, detect_transpositions=detect_transpositions,
                                   debug_scores=debug_scores, properties_filter=properties_filter)

    # Build the variant graph and rank its vertices.
    graph = VariantGraph()
    aligner.collate(graph)
    ranking = VariantGraphRanking.of(graph)

    if near_match:
        # Near matching works on the unsegmented graph only.
        if segmentation:
            raise SegmentationError('segmentation must be set to False for near matching')
        ranking = perform_near_match(graph, ranking)
    elif segmentation:
        # Join parallel segments, then re-rank the modified graph.
        join(graph)
        ranking = VariantGraphRanking.of(graph)

    # Graph-based outputs do not need an alignment table.
    if output in ("svg", "svg_simple"):
        return display_variant_graph_as_svg(graph, output)
    if output == "graph":
        return graph

    # Every remaining output is derived from the alignment table.
    table = AlignmentTable(collation, graph, layout, ranking)
    if output == "table":
        return table
    if output == "json":
        return export_alignment_table_as_json(table)
    if output == "html":
        return display_alignment_table_as_html(table)
    if output == "html2":
        return visualize_table_vertically_with_colors(table, collation)
    if output == "xml":
        return export_alignment_table_as_xml(table)
    if output == "tei":
        return export_alignment_table_as_tei(table, indent)
    if output in ("csv", "tsv"):
        return display_alignment_table_as_csv(table, output)
    raise Exception("Unknown output type: " + output)
def export_alignment_table_as_json(table, indent=None, status=False):
    """Serialize an alignment table to a JSON string.

    The resulting object has a "witnesses" list (one row header per table row)
    and a parallel "table" list. Each entry of "table" holds one item per cell
    of that row: the list of the cell's token_data values, or null when the
    cell is empty/None. With status=True a "status" list holding each
    column's `variant` attribute is added.

    Args:
        table: object exposing `rows` (each with `header` and `cells`) and,
            when status is true, `columns` (each with `variant`).
        indent: forwarded to json.dumps.
        status: whether to include the per-column "status" list.

    Returns:
        The JSON text, with keys sorted and non-ASCII characters left unescaped.
    """
    witnesses = []
    serialized_rows = []
    for row in table.rows:
        witnesses.append(row.header)
        row_cells = []
        for cell in row.cells:
            # Falsy cells (None or empty) are exported as null.
            row_cells.append([token.token_data for token in cell] if cell else None)
        serialized_rows.append(row_cells)

    payload = {"table": serialized_rows, "witnesses": witnesses}
    if status:
        payload["status"] = [column.variant for column in table.columns]
    return json.dumps(payload, sort_keys=True, indent=indent, ensure_ascii=False)
def export_alignment_table_as_xml(table):
    """Serialize an alignment table as simple pseudo-TEI XML.

    Every column becomes an <app> element, whether or not it shows variation.
    Inside it, each witness present in the column gets its own <rdg> element
    (ordered by siglum) whose @wit is "#" + siglum and whose text is the
    concatenation of the "t" values of that witness's tokens.

    Args:
        table: object exposing `columns`, each with a `tokens_per_witness`
            mapping of siglum -> list of tokens carrying `token_data["t"]`.

    Returns:
        A string: the serialized <app> elements wrapped in a <root> element.
    """
    fragments = []
    for column in table.columns:
        app = etree.Element('app')
        for siglum, tokens in sorted(column.tokens_per_witness.items()):
            rdg = etree.SubElement(app, 'rdg', {'wit': "#" + siglum})
            rdg.text = "".join(str(token.token_data["t"]) for token in tokens)
        # encoding="unicode" makes tostring return str rather than bytes.
        fragments.append(etree.tostring(app, encoding="unicode"))
    return "".join(["<root>", *fragments, "</root>"])
def export_alignment_table_as_tei(table, indent=None):
    """Serialize an alignment table as TEI parallel-segmentation XML.

    The wrapper element is always <cx:apparatus>. For every column the
    readings of all witnesses are grouped: a column in which every witness of
    the table attests the same reading is written as plain text, any other
    column (several readings, or one reading plus missing witnesses) becomes
    an <app> with one <rdg> per distinct reading, whose @wit lists the sigla
    ("#A #B ...") sharing that reading.

    Readings are stripped of surrounding whitespace before grouping. If any
    reading in the column ended in a space, tab or line feed, a single space
    is written after the column so that adjacent words do not run together.

    Args:
        table: object exposing `rows` (one per witness) and `columns`, each
            column with a `tokens_per_witness` mapping of siglum -> list of
            tokens carrying a string `token_data["t"]`.
        indent: when truthy the document is pretty-printed (for proofreading
            convenience only; this does not observe proper white-space
            behavior).

    Returns:
        The XML document as a string, including the XML declaration.
    """
    cx_ns = "http://interedition.eu/collatex/ns/1.0"
    tei_ns = "http://www.tei-c.org/ns/1.0"

    d = Document()
    root = d.createElementNS(cx_ns, "cx:apparatus")
    # The namespace declarations are written explicitly as ordinary attributes.
    root.setAttribute("xmlns:cx", cx_ns)
    root.setAttribute("xmlns", tei_ns)
    d.appendChild(root)

    # One row per witness; witnesses absent from a column are simply missing
    # from tokens_per_witness, so this count detects such gaps.
    witness_count = len(table.rows)

    for column in table.columns:
        # Maps each (stripped) reading to the list of sigla that attest it.
        witnesses_by_reading = defaultdict(list)
        trailing_ws = False
        for siglum, tokens in sorted(column.tokens_per_witness.items()):
            reading = "".join(token.token_data["t"] for token in tokens)
            # Real tab / line-feed characters: raw-string escapes such as
            # r"\u0009" would only match a literal backslash sequence.
            if not trailing_ws and reading.endswith((" ", "\t", "\n")):
                trailing_ws = True
            witnesses_by_reading[reading.strip()].append(siglum)

        unanimous = (
            len(witnesses_by_reading) == 1
            and len(next(iter(witnesses_by_reading.values()))) == witness_count
        )
        if unanimous:
            # Every witness agrees: emit the shared reading as plain text.
            shared_reading = next(iter(witnesses_by_reading))
            root.appendChild(d.createTextNode(shared_reading))
        else:
            # Variation: more than one reading, or one reading plus gaps.
            app = d.createElementNS(tei_ns, "app")
            root.appendChild(app)
            for reading, sigla in witnesses_by_reading.items():
                rdg = d.createElementNS(tei_ns, "rdg")
                rdg.setAttribute("wit", " ".join("#" + siglum for siglum in sigla))
                rdg.appendChild(d.createTextNode(reading))
                app.appendChild(rdg)

        if trailing_ws:
            # Restore one separator for the whitespace removed by strip().
            root.appendChild(d.createTextNode(" "))

    return d.toprettyxml() if indent else d.toxml()