Merge pull request #376 from Anaphory/fix-phylo
Fix error in phylogenetics exporter
Anaphory committed Sep 22, 2022
2 parents 47a7c32 + cabc7e0 commit 3cad3ae
Showing 9 changed files with 102 additions and 49 deletions.
6 changes: 3 additions & 3 deletions .zenodo.json
@@ -8,8 +8,8 @@
}
],
"upload_type": "software",
"version": "1.0.6",
"publication_date": "2022-04-28",
"version": "1.0.7",
"publication_date": "2022-09-22",
"creators": [
{
"orcid": "0000-0002-8155-9089",
@@ -30,7 +30,7 @@
"related_identifiers": [
{
"scheme": "url",
"identifier": "https://github.com/Anaphory/lexedata/tree/v1.0.6",
"identifier": "https://github.com/Anaphory/lexedata/tree/v1.0.7",
"relation": "isSupplementTo"
},
{
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -11,8 +11,8 @@ authors:
orcid: "https://orcid.org/0000-0002-5693-975X"
title: "Lexedata"
subtitle: "Lexical Data Editing tools"
- version: 1.0.6
+ version: 1.0.7
license: GPL-3.0+
doi: 10.5281/zenodo.5541167
- date-released: 2022-04-28
+ date-released: 2022-09-22
url: "https://github.com/Anaphory/lexedata"
2 changes: 1 addition & 1 deletion src/lexedata/__init__.py
@@ -11,4 +11,4 @@

__all__ = ["util"]

__version__ = "1.0.6"
__version__ = "1.0.7"
19 changes: 14 additions & 5 deletions src/lexedata/edit/simplify_ids.py
@@ -11,7 +11,9 @@
import pycldf
from lexedata.util.simplify_ids import simplify_table_ids_and_references

- if __name__ == "__main__":
+
+ def parser():
+ """Construct the CLI argument parser for this script."""
parser = cli.parser(__package__ + "." + Path(__file__).stem, __doc__)
parser.add_argument(
"--transparent",
@@ -30,12 +32,17 @@
nargs="+",
help="Only fix the IDs of these tables.",
)
- args = parser.parse_args()
+ return parser
+
+
+ if __name__ == "__main__":
+ args = parser().parse_args()
logger = cli.setup_logging(args)

if args.uppercase:
- # TODO: implement this
- raise NotImplementedError
+ normalize = str.upper
+ else:
+ normalize = str.lower

ds = pycldf.Wordlist.from_metadata(args.metadata)

@@ -51,6 +58,8 @@

for table in tables:
logger.info(f"Handling table {table.url.string}…")
- simplify_table_ids_and_references(ds, table, args.transparent, logger)
+ simplify_table_ids_and_references(
+ ds, table, args.transparent, logger, additional_normalize=normalize
+ )

ds.write_metadata()
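Factoring the argument parser out into a module-level parser() function makes the CLI testable without running the script. A minimal usage sketch of the new --uppercase flag (mirroring the new test in test_various_parsers.py below; assumes this version of lexedata is installed):

from lexedata.edit.simplify_ids import parser

# Build the same parser the script uses and feed it argv-style options.
args = parser().parse_args(["--uppercase", "--tables", "CognatesetTable"])
assert args.uppercase
assert args.tables == ["CognatesetTable"]
# As in the script's __main__ block, the flag selects the ID normalizer.
normalize = str.upper if args.uppercase else str.lower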
15 changes: 10 additions & 5 deletions src/lexedata/exporter/cognates.py
@@ -258,11 +258,16 @@ def write_row_header(self, cogset, row_number: int):
raise NotImplementedError(
"You expect central conceps in your cognate set table, but you don't have any central concepts stored with your cognate sets"
)
- try:
- value = self.separators[db_name].join([str(v) for v in cogset[db_name]])
- except KeyError:
- # No separator
- value = cogset.get(db_name, "")
+ if cogset[db_name] is None:
+ value = ""
+ else:
+ try:
+ value = self.separators[db_name].join(
+ [str(v) for v in cogset[db_name]]
+ )
+ except KeyError:
+ # No separator
+ value = cogset.get(db_name, "")
cell = self.ws.cell(row=row_number, column=col, value=value)
# Transfer the cognateset comment to the first Excel cell.
if col == 1 and cogset.get("comment"):
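The exporter change guards against cognate-set columns whose value is None: previously such a value either leaked into the sheet or, when a separator was registered for the column, made the join raise an uncaught TypeError (only KeyError was handled). A stand-alone sketch of the fixed lookup logic, with an illustrative separators mapping rather than the exporter's real attributes:

separators = {"Source": ";"}  # illustrative separator mapping

def cell_value(cogset: dict, db_name: str) -> str:
    if cogset[db_name] is None:
        return ""  # write an empty cell instead of failing on None
    try:
        return separators[db_name].join(str(v) for v in cogset[db_name])
    except KeyError:
        # No separator registered for this column: take the value as-is.
        return cogset.get(db_name, "")

assert cell_value({"Source": None}, "Source") == ""
assert cell_value({"Source": ["smith2000", "lee1999"]}, "Source") == "smith2000;lee1999"
assert cell_value({"Comment": "borrowed"}, "Comment") == "borrowed"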
51 changes: 32 additions & 19 deletions src/lexedata/exporter/phylogenetics.py
@@ -692,7 +692,7 @@ def format_nexus(

sequences = [
"{} {} {}".format(lang, " " * (max_length - len(str(lang))), seq)
- for lang, seq in zip(languages, sequences)
+ for lang, seq in sorted(zip(languages, sequences))
]

if partitions:
@@ -756,7 +756,7 @@ def fill_beast(data_object: ET.Element, languages, sequences) -> None:
data_object.attrib["dataType"] = "integer"
data_object.attrib["spec"] = "Alignment"
data_object.text = "\n"
- for language, sequence in zip(languages, sequences):
+ for language, sequence in sorted(zip(languages, sequences)):
seq = "".join(sequence)
ET.SubElement(
data_object,
@@ -792,27 +792,40 @@ def compress_indices(indices: t.Set[int]) -> t.Iterator[slice]:
yield sl


- def add_partitions(data_object: ET.Element, partitions):
+ def add_partitions(data_object: ET.Element, partitions: t.Dict[str, t.Iterable[int]]):
+ """Add partitions after the <data> object
+ >>> xml = ET.fromstring("<beast><data id='alignment'/></beast>")
+ >>> data = xml.find(".//data")
+ >>> partitions = {"a": [1, 2, 3, 5], "b": [4, 6, 7]}
+ >>> add_partitions(data, partitions)
+ >>> print(ET.tostring(xml).decode("utf-8"))
+ <beast><data id="alignment"/><data id="concept:a" spec="FilteredAlignment" filter="1,2-4,6" data="@alignment" ascertained="true" excludefrom="0" excludeto="1"/><data id="concept:b" spec="FilteredAlignment" filter="1,5,7-8" data="@alignment" ascertained="true" excludefrom="0" excludeto="1"/></beast>
+ """
previous_alignment = data_object
for name, indices in partitions.items():
indices_set = compress_indices(set(indices))
indices_string = ",".join(
"{:d}-{:d}".format(s.start + 1, s.stop) for s in indices_set
"{:d}-{:d}".format(s.start + 1, s.stop)
if s.start + 1 != s.stop
else "{:d}".format(s.stop)
for s in indices_set
)
- previous_alignment.addnext(
- data_object.makeelement(
- "data",
- {
- "id": "concept:" + name,
- "spec": "FilteredAlignment",
- "filter": "1," + indices_string,
- "data": "@" + data_object.attrib["id"],
- "ascertained": "true",
- "excludefrom": "0",
- "excludeto": "1",
- },
- )
+ e = data_object.makeelement(
+ "data",
+ {
+ "id": "concept:" + name,
+ "spec": "FilteredAlignment",
+ "filter": "1," + indices_string,
+ "data": "@" + data_object.attrib["id"],
+ "ascertained": "true",
+ "excludefrom": "0",
+ "excludeto": "1",
+ },
+ )
+ previous_alignment.addnext(e)
+ previous_alignment = e


def parser():
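The rewritten loop is the heart of the fix: the old code called addnext() on the same element every iteration, so each FilteredAlignment was inserted directly after <data> and the partitions came out in reverse order; the new code remembers the last inserted element and appends after it. A minimal sketch of the difference, assuming lxml (which provides the addnext() API used here) is installed:

from lxml import etree as ET

xml = ET.fromstring("<beast><data id='alignment'/></beast>")
data = xml.find(".//data")

previous = data
for name in ["a", "b", "c"]:
    e = data.makeelement("data", {"id": "concept:" + name})
    previous.addnext(e)  # insert after the element added last ...
    previous = e         # ... and advance, so the order is preserved

print(ET.tostring(xml).decode("utf-8"))
# <beast><data id="alignment"/><data id="concept:a"/><data id="concept:b"/><data id="concept:c"/></beast>
# Without the `previous = e` line the siblings would come out as c, b, a.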
@@ -863,7 +876,7 @@ def parser():
parser.add_argument(
"--coding",
action=cli.enum_from_lower(CodingProcedure),
default="RootMeaning",
default=CodingProcedure.ROOTMEANING,
help="""Coding method: In the `RootMeaning` coding method, every character
describes the presence or absence of a particular root morpheme or
cognate class in the word(s) for a given meaning; In the
@@ -908,7 +921,7 @@ def parser():
if language in args.languages
}

logger.info(f"Imported languages {set(ds)}.")
logger.info(f"Exported languages {set(ds)}.")

# Step 2: Code the data
n_symbols, datatype = 2, "binary"
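The other change in the add_partitions hunk above, the conditional inside indices_string, makes single-column partitions render as one position in the FilteredAlignment filter attribute rather than as a degenerate range. A small sketch using compress_indices from this module (assuming the lexedata version from this commit is importable); the indices match the doctest above:

from lexedata.exporter.phylogenetics import compress_indices

# 0-based character indices 1-3 form a run, index 5 stands alone.
parts = [
    "{:d}-{:d}".format(s.start + 1, s.stop)
    if s.start + 1 != s.stop
    else "{:d}".format(s.stop)
    for s in compress_indices({1, 2, 3, 5})
]
print(",".join(parts))  # "2-4,6" -- the old formatting produced "2-4,6-6"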
19 changes: 16 additions & 3 deletions src/lexedata/util/simplify_ids.py
@@ -13,14 +13,20 @@
}


- def clean_mapping(rows: t.Mapping[str, t.Mapping[str, str]]) -> t.Mapping[str, str]:
+ def clean_mapping(
+ rows: t.Mapping[str, t.Mapping[str, str]],
+ additional_normalize: t.Callable[[str], str] = str.lower,
+ ) -> t.Mapping[str, str]:
"""Create unique normalized IDs.
>>> clean_mapping({"A": {}, "B": {}})
{'A': 'a', 'B': 'b'}
>>> clean_mapping({"A": {}, "a": {}})
{'A': 'a', 'a': 'a_x2'}
>>> clean_mapping({"A": {}, "a": {}}, str.upper)
{'A': 'A', 'a': 'A_x2'}
"""
avoid = {id.lower() for id in rows}

@@ -31,6 +37,7 @@ def clean_mapping(rows: t.Mapping[str, t.Mapping[str, s
base = string_to_id("_".join(row.values()))
else:
base = string_to_id(id)
+ base = additional_normalize(base)

if base in avoid and base not in mapping.values():
# I kept a spot for you!
@@ -198,6 +205,7 @@ def simplify_table_ids_and_references(
table: csvw.Table,
transparent: bool = True,
logger: cli.logging.Logger = cli.logger,
+ additional_normalize: t.Callable[[str], str] = str.lower,
) -> bool:
"""Simplify the IDs of the given table."""
ttype = ds.get_tabletype(table)
@@ -219,9 +227,14 @@

if transparent and ttype in ID_COMPONENTS:
cols = {prop: ds[ttype, prop].name for prop in ID_COMPONENTS[ttype]}
- mapping = clean_mapping(cache_table(ds, ttype, cols))
+ mapping = clean_mapping(
+ cache_table(ds, ttype, cols), additional_normalize=additional_normalize
+ )
else:
- mapping = clean_mapping(cache_table(ds, table.url.string, {}))
+ mapping = clean_mapping(
+ cache_table(ds, table.url.string, {}),
+ additional_normalize=additional_normalize,
+ )

update_ids(ds, table, mapping)
return True
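The new additional_normalize hook is what connects the edit script's --uppercase flag to the ID machinery: the CLI passes str.upper (default str.lower) through simplify_table_ids_and_references into clean_mapping. A short usage sketch restating the doctests above:

from lexedata.util.simplify_ids import clean_mapping

# Default: lowercase IDs, collisions disambiguated.
print(clean_mapping({"A": {}, "a": {}}))             # {'A': 'a', 'a': 'a_x2'}

# With --uppercase the edit script passes str.upper instead.
print(clean_mapping({"A": {}, "a": {}}, str.upper))  # {'A': 'A', 'a': 'A_x2'}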
22 changes: 11 additions & 11 deletions test/test_cognate_exporter.py
@@ -270,17 +270,17 @@ def test_adding_singleton_cognatesets_with_status(caplog):
# when accessing the row as a tuple the index is not 1-based as for excel sheets
status = [row[cogset_index].value for row in excel_writer.ws.iter_rows(min_row=2)]
assert status == [
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
"NEW",
"NEW",
"NEW",
13 changes: 13 additions & 0 deletions test/test_various_parsers.py
@@ -13,6 +13,7 @@
)
from lexedata.exporter.cognates import parser as cex_parser
from lexedata.importer.excel_long_format import parser as ilong_parser
+ from lexedata.edit.simplify_ids import parser as sid_parser


def test_setorfromfile_list():
@@ -58,6 +59,18 @@ def test_loglevel_parser():
assert parameters.loglevel == logging.ERROR


+ def test_sid_parser():
+ parameters = sid_parser().parse_args(
+ ["-v", "--tables", "ParameterTable", "CognatesetTable", "--uppercase"]
+ )
+ # Optional positional argument ("FormTable") after optional switch ("-q",
+ # but also "-V" which uses a builtin action) does not seem to work.
+ assert parameters.loglevel == logging.DEBUG
+ assert parameters.tables == ["ParameterTable", "CognatesetTable"]
+ assert parameters.uppercase
+ assert not parameters.transparent


def test_phylo_parser():
_, fname = tempfile.mkstemp(".csv")
with open(fname, "w", encoding="utf-8") as file:
