Merge pull request #376 from Anaphory/fix-phylo
Fix error in phylogenetics exporter
Anaphory committed Sep 22, 2022
2 parents 47a7c32 + cabc7e0 commit 3cad3ae
Showing 9 changed files with 102 additions and 49 deletions.
6 changes: 3 additions & 3 deletions .zenodo.json
@@ -8,8 +8,8 @@
}
],
"upload_type": "software",
"version": "1.0.6",
"publication_date": "2022-04-28",
"version": "1.0.7",
"publication_date": "2022-09-22",
"creators": [
{
"orcid": "0000-0002-8155-9089",
@@ -30,7 +30,7 @@
"related_identifiers": [
{
"scheme": "url",
"identifier": "https://github.com/Anaphory/lexedata/tree/v1.0.6",
"identifier": "https://github.com/Anaphory/lexedata/tree/v1.0.7",
"relation": "isSupplementTo"
},
{
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -11,8 +11,8 @@ authors:
orcid: "https://orcid.org/0000-0002-5693-975X"
title: "Lexedata"
subtitle: "Lexical Data Editing tools"
- version: 1.0.6
+ version: 1.0.7
license: GPL-3.0+
doi: 10.5281/zenodo.5541167
- date-released: 2022-04-28
+ date-released: 2022-09-22
url: "https://github.com/Anaphory/lexedata"
2 changes: 1 addition & 1 deletion src/lexedata/__init__.py
@@ -11,4 +11,4 @@

__all__ = ["util"]

__version__ = "1.0.6"
__version__ = "1.0.7"
19 changes: 14 additions & 5 deletions src/lexedata/edit/simplify_ids.py
@@ -11,7 +11,9 @@
import pycldf
from lexedata.util.simplify_ids import simplify_table_ids_and_references

- if __name__ == "__main__":
+
+ def parser():
+ """Construct the CLI argument parser for this script."""
parser = cli.parser(__package__ + "." + Path(__file__).stem, __doc__)
parser.add_argument(
"--transparent",
@@ -30,12 +32,17 @@
nargs="+",
help="Only fix the IDs of these tables.",
)
- args = parser.parse_args()
+ return parser
+
+
+ if __name__ == "__main__":
+ args = parser().parse_args()
logger = cli.setup_logging(args)

if args.uppercase:
- # TODO: implement this
- raise NotImplementedError
+ normalize = str.upper
+ else:
+ normalize = str.lower

ds = pycldf.Wordlist.from_metadata(args.metadata)

@@ -51,6 +58,8 @@

for table in tables:
logger.info(f"Handling table {table.url.string}…")
- simplify_table_ids_and_references(ds, table, args.transparent, logger)
+ simplify_table_ids_and_references(
+ ds, table, args.transparent, logger, additional_normalize=normalize
+ )

ds.write_metadata()
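Factoring the argument parser out into a module-level parser() function makes the CLI testable without running the script. A minimal usage sketch of the new --uppercase flag (mirroring the new test in test_various_parsers.py below; assumes this version of lexedata is installed):

from lexedata.edit.simplify_ids import parser

# Build the same parser the script uses and feed it argv-style options.
args = parser().parse_args(["--uppercase", "--tables", "CognatesetTable"])
assert args.uppercase
assert args.tables == ["CognatesetTable"]
# As in the script's __main__ block, the flag selects the ID normalizer.
normalize = str.upper if args.uppercase else str.lower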
15 changes: 10 additions & 5 deletions src/lexedata/exporter/cognates.py
@@ -258,11 +258,16 @@ def write_row_header(self, cogset, row_number: int):
raise NotImplementedError(
"You expect central conceps in your cognate set table, but you don't have any central concepts stored with your cognate sets"
)
- try:
- value = self.separators[db_name].join([str(v) for v in cogset[db_name]])
- except KeyError:
- # No separator
- value = cogset.get(db_name, "")
+ if cogset[db_name] is None:
+ value = ""
+ else:
+ try:
+ value = self.separators[db_name].join(
+ [str(v) for v in cogset[db_name]]
+ )
+ except KeyError:
+ # No separator
+ value = cogset.get(db_name, "")
cell = self.ws.cell(row=row_number, column=col, value=value)
# Transfer the cognateset comment to the first Excel cell.
if col == 1 and cogset.get("comment"):
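The exporter change guards against cognate-set columns whose value is None: previously such a value either leaked into the sheet or, when a separator was registered for the column, made the join raise an uncaught TypeError (only KeyError was handled). A stand-alone sketch of the fixed lookup logic, with an illustrative separators mapping rather than the exporter's real attributes:

separators = {"Source": ";"}  # illustrative separator mapping

def cell_value(cogset: dict, db_name: str) -> str:
    if cogset[db_name] is None:
        return ""  # write an empty cell instead of failing on None
    try:
        return separators[db_name].join(str(v) for v in cogset[db_name])
    except KeyError:
        # No separator registered for this column: take the value as-is.
        return cogset.get(db_name, "")

assert cell_value({"Source": None}, "Source") == ""
assert cell_value({"Source": ["smith2000", "lee1999"]}, "Source") == "smith2000;lee1999"
assert cell_value({"Comment": "borrowed"}, "Comment") == "borrowed"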
51 changes: 32 additions & 19 deletions src/lexedata/exporter/phylogenetics.py
@@ -692,7 +692,7 @@ def format_nexus(

sequences = [
"{} {} {}".format(lang, " " * (max_length - len(str(lang))), seq)
- for lang, seq in zip(languages, sequences)
+ for lang, seq in sorted(zip(languages, sequences))
]

if partitions:
@@ -756,7 +756,7 @@ def fill_beast(data_object: ET.Element, languages, sequences) -> None:
data_object.attrib["dataType"] = "integer"
data_object.attrib["spec"] = "Alignment"
data_object.text = "\n"
- for language, sequence in zip(languages, sequences):
+ for language, sequence in sorted(zip(languages, sequences)):
seq = "".join(sequence)
ET.SubElement(
data_object,
@@ -792,27 +792,40 @@ def compress_indices(indices: t.Set[int]) -> t.Iterator[slice]:
yield sl


- def add_partitions(data_object: ET.Element, partitions):
+ def add_partitions(data_object: ET.Element, partitions: t.Dict[str, t.Iterable[int]]):
+ """Add partitions after the <data> object
+ >>> xml = ET.fromstring("<beast><data id='alignment'/></beast>")
+ >>> data = xml.find(".//data")
+ >>> partitions = {"a": [1, 2, 3, 5], "b": [4, 6, 7]}
+ >>> add_partitions(data, partitions)
+ >>> print(ET.tostring(xml).decode("utf-8"))
+ <beast><data id="alignment"/><data id="concept:a" spec="FilteredAlignment" filter="1,2-4,6" data="@alignment" ascertained="true" excludefrom="0" excludeto="1"/><data id="concept:b" spec="FilteredAlignment" filter="1,5,7-8" data="@alignment" ascertained="true" excludefrom="0" excludeto="1"/></beast>
+ """
previous_alignment = data_object
for name, indices in partitions.items():
indices_set = compress_indices(set(indices))
indices_string = ",".join(
"{:d}-{:d}".format(s.start + 1, s.stop) for s in indices_set
"{:d}-{:d}".format(s.start + 1, s.stop)
if s.start + 1 != s.stop
else "{:d}".format(s.stop)
for s in indices_set
)
- previous_alignment.addnext(
- data_object.makeelement(
- "data",
- {
- "id": "concept:" + name,
- "spec": "FilteredAlignment",
- "filter": "1," + indices_string,
- "data": "@" + data_object.attrib["id"],
- "ascertained": "true",
- "excludefrom": "0",
- "excludeto": "1",
- },
- )
+ e = data_object.makeelement(
+ "data",
+ {
+ "id": "concept:" + name,
+ "spec": "FilteredAlignment",
+ "filter": "1," + indices_string,
+ "data": "@" + data_object.attrib["id"],
+ "ascertained": "true",
+ "excludefrom": "0",
+ "excludeto": "1",
+ },
+ )
+ previous_alignment.addnext(e)
+ previous_alignment = e


def parser():
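The rewritten loop is the heart of the fix: the old code called addnext() on the same element every iteration, so each FilteredAlignment was inserted directly after <data> and the partitions came out in reverse order; the new code remembers the last inserted element and appends after it. A minimal sketch of the difference, assuming lxml (which provides the addnext() API used here) is installed:

from lxml import etree as ET

xml = ET.fromstring("<beast><data id='alignment'/></beast>")
data = xml.find(".//data")

previous = data
for name in ["a", "b", "c"]:
    e = data.makeelement("data", {"id": "concept:" + name})
    previous.addnext(e)  # insert after the element added last ...
    previous = e         # ... and advance, so the order is preserved

print(ET.tostring(xml).decode("utf-8"))
# <beast><data id="alignment"/><data id="concept:a"/><data id="concept:b"/><data id="concept:c"/></beast>
# Without the `previous = e` line the siblings would come out as c, b, a.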
@@ -863,7 +876,7 @@ def parser():
parser.add_argument(
"--coding",
action=cli.enum_from_lower(CodingProcedure),
default="RootMeaning",
default=CodingProcedure.ROOTMEANING,
help="""Coding method: In the `RootMeaning` coding method, every character
describes the presence or absence of a particular root morpheme or
cognate class in the word(s) for a given meaning; In the
@@ -908,7 +921,7 @@ def parser():
if language in args.languages
}

logger.info(f"Imported languages {set(ds)}.")
logger.info(f"Exported languages {set(ds)}.")

# Step 2: Code the data
n_symbols, datatype = 2, "binary"
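The other change in the add_partitions hunk above, the conditional inside indices_string, makes single-column partitions render as one position in the FilteredAlignment filter attribute rather than as a degenerate range. A small sketch using compress_indices from this module (assuming the lexedata version from this commit is importable); the indices match the doctest above:

from lexedata.exporter.phylogenetics import compress_indices

# 0-based character indices 1-3 form a run, index 5 stands alone.
parts = [
    "{:d}-{:d}".format(s.start + 1, s.stop)
    if s.start + 1 != s.stop
    else "{:d}".format(s.stop)
    for s in compress_indices({1, 2, 3, 5})
]
print(",".join(parts))  # "2-4,6" -- the old formatting produced "2-4,6-6"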
19 changes: 16 additions & 3 deletions src/lexedata/util/simplify_ids.py
@@ -13,14 +13,20 @@
}


- def clean_mapping(rows: t.Mapping[str, t.Mapping[str, str]]) -> t.Mapping[str, str]:
+ def clean_mapping(
+ rows: t.Mapping[str, t.Mapping[str, str]],
+ additional_normalize: t.Callable[[str], str] = str.lower,
+ ) -> t.Mapping[str, str]:
"""Create unique normalized IDs.
>>> clean_mapping({"A": {}, "B": {}})
{'A': 'a', 'B': 'b'}
>>> clean_mapping({"A": {}, "a": {}})
{'A': 'a', 'a': 'a_x2'}
>>> clean_mapping({"A": {}, "a": {}}, str.upper)
{'A': 'A', 'a': 'A_x2'}
"""
avoid = {id.lower() for id in rows}

@@ -31,6 +37,7 @@ def clean_mapping(rows: t.Mapping[str, t.Mapping[str, s
base = string_to_id("_".join(row.values()))
else:
base = string_to_id(id)
+ base = additional_normalize(base)

if base in avoid and base not in mapping.values():
# I kept a spot for you!
@@ -198,6 +205,7 @@ def simplify_table_ids_and_references(
table: csvw.Table,
transparent: bool = True,
logger: cli.logging.Logger = cli.logger,
+ additional_normalize: t.Callable[[str], str] = str.lower,
) -> bool:
"""Simplify the IDs of the given table."""
ttype = ds.get_tabletype(table)
@@ -219,9 +227,14 @@

if transparent and ttype in ID_COMPONENTS:
cols = {prop: ds[ttype, prop].name for prop in ID_COMPONENTS[ttype]}
- mapping = clean_mapping(cache_table(ds, ttype, cols))
+ mapping = clean_mapping(
+ cache_table(ds, ttype, cols), additional_normalize=additional_normalize
+ )
else:
- mapping = clean_mapping(cache_table(ds, table.url.string, {}))
+ mapping = clean_mapping(
+ cache_table(ds, table.url.string, {}),
+ additional_normalize=additional_normalize,
+ )

update_ids(ds, table, mapping)
return True
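The new additional_normalize hook is what connects the edit script's --uppercase flag to the ID machinery: the CLI passes str.upper (default str.lower) through simplify_table_ids_and_references into clean_mapping. A short usage sketch restating the doctests above:

from lexedata.util.simplify_ids import clean_mapping

# Default: lowercase IDs, collisions disambiguated.
print(clean_mapping({"A": {}, "a": {}}))             # {'A': 'a', 'a': 'a_x2'}

# With --uppercase the edit script passes str.upper instead.
print(clean_mapping({"A": {}, "a": {}}, str.upper))  # {'A': 'A', 'a': 'A_x2'}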
22 changes: 11 additions & 11 deletions test/test_cognate_exporter.py
@@ -270,17 +270,17 @@ def test_adding_singleton_cognatesets_with_status(caplog):
# when accessing the row as a tuple the index is not 1-based as for excel sheets
status = [row[cogset_index].value for row in excel_writer.ws.iter_rows(min_row=2)]
assert status == [
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
- None,
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
"NEW",
"NEW",
"NEW",
13 changes: 13 additions & 0 deletions test/test_various_parsers.py
@@ -13,6 +13,7 @@
)
from lexedata.exporter.cognates import parser as cex_parser
from lexedata.importer.excel_long_format import parser as ilong_parser
+ from lexedata.edit.simplify_ids import parser as sid_parser


def test_setorfromfile_list():
@@ -58,6 +59,18 @@ def test_loglevel_parser():
assert parameters.loglevel == logging.ERROR


+ def test_sid_parser():
+ parameters = sid_parser().parse_args(
+ ["-v", "--tables", "ParameterTable", "CognatesetTable", "--uppercase"]
+ )
+ # Optional positional argument ("FormTable") after optional switch ("-q",
+ # but also "-V" which uses a builtin action) does not seem to work.
+ assert parameters.loglevel == logging.DEBUG
+ assert parameters.tables == ["ParameterTable", "CognatesetTable"]
+ assert parameters.uppercase
+ assert not parameters.transparent


def test_phylo_parser():
_, fname = tempfile.mkstemp(".csv")
with open(fname, "w", encoding="utf-8") as file:
