Merge pull request #197 from Anaphory/nan-forms
Deal with NAN forms
Anaphory committed Sep 22, 2021
2 parents a034a4b + 0a65c7f commit 1756f60
Showing 11 changed files with 710 additions and 92 deletions.
48 changes: 32 additions & 16 deletions src/lexedata/edit/add_segments.py
@@ -34,14 +34,22 @@
tokenizer = segments.Tokenizer()


@attr.s(auto_attribs=True)
class ReportEntry:
count: int = 0
comment: str = ""


@attr.s(auto_attribs=True)
class SegmentReport:
sounds: defaultdict = defaultdict(lambda: {"count": 0, "comment": ""})
sounds: t.MutableMapping[str, ReportEntry] = attr.ib(
factory=lambda: defaultdict(ReportEntry)
)

def __call__(self, name: str) -> t.Tuple[str, str, int, str]:
def __call__(self, name: str) -> t.List[t.Tuple[str, str, int, str]]:
res = []
for k, v in self.sounds.items():
res.append((name, k, v["count"], v["comment"]))
res.append((name, k, v.count, v.comment))
return res


@@ -137,8 +145,8 @@ def segment_form(
i -= 1
continue
if raw_tokens[i].source == "/":
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])]["comment"] = "illegal symbol"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "illegal symbol"
del raw_tokens[i]
logger.warning(
f"{context_for_warnings}Impossible sound '/' encountered in {formstring} – "
@@ -161,26 +169,32 @@ def segment_form(
i -= 1
continue
if grapheme.endswith("ⁿ") or grapheme.endswith("ᵐ") or grapheme.endswith("ᵑ"):
if i + 1 > len(raw_tokens) - 1 or raw_tokens[i + 1].preceding is not None:
if (
i + 1 > len(raw_tokens) - 1
or not hasattr(raw_tokens[i + 1], "preceding")
or raw_tokens[i + 1].preceding is not None
):
logger.warning(
f"{context_for_warnings}Unknown sound {raw_tokens[i]} encountered in {formstring}"
)
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])][
"comment"
] = "unknown pre-nasalization"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "unknown pre-nasalization"
i -= 1
continue
raw_tokens[i + 1] = bipa["pre-nasalized " + raw_tokens[i + 1].name]
raw_tokens[i] = bipa[grapheme[:-1]]
continue
if grapheme.endswith("ʰ"):
if i + 1 > len(raw_tokens) - 1 or raw_tokens[i + 1].preceding is not None:
if (
i + 1 > len(raw_tokens) - 1
or not hasattr(raw_tokens[i + 1], "preceding")
or raw_tokens[i + 1].preceding is not None
):
logger.warning(
f"{context_for_warnings}Unknown sound {raw_tokens[i]} encountered in {formstring}"
)
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])]["comment"] = "unknown pre-aspiration"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "unknown pre-aspiration"
i -= 1
continue
raw_tokens[i + 1] = bipa["pre-aspirated " + raw_tokens[i + 1].name]
Expand All @@ -189,8 +203,8 @@ def segment_form(
logger.warning(
f"{context_for_warnings}Unknown sound {raw_tokens[i]} encountered in {formstring}"
)
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])]["comment"] = "unknown sound"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "unknown sound"
i -= 1

return raw_tokens
@@ -227,7 +241,9 @@ def add_segments_to_dataset(
write_back.append(row)
continue
else:
if row[transcription]:
if row[transcription] is None or row[transcription] == "-":
row[dataset.column_names.forms.segments] = ""
elif row[transcription]:
form = row[transcription].strip()
for wrong, right in pre_replace.items():
if wrong in form:
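
A minimal, self-contained sketch of the pattern this file's hunks introduce: the report becomes a defaultdict of small attrs classes instead of raw {"count": ..., "comment": ...} dicts, giving each entry typed attribute access. Assumes the attrs package is installed; the recorded sound and comment are illustrative.

    import typing as t
    from collections import defaultdict

    import attr


    @attr.s(auto_attribs=True)
    class ReportEntry:
        count: int = 0
        comment: str = ""


    @attr.s(auto_attribs=True)
    class SegmentReport:
        sounds: t.MutableMapping[str, ReportEntry] = attr.ib(
            factory=lambda: defaultdict(ReportEntry)
        )

        def __call__(self, name: str) -> t.List[t.Tuple[str, str, int, str]]:
            # One (language, sound, count, comment) row per recorded sound.
            return [(name, k, v.count, v.comment) for k, v in self.sounds.items()]


    report = SegmentReport()
    report.sounds["/"].count += 1  # missing keys get a fresh ReportEntry
    report.sounds["/"].comment = "illegal symbol"
    print(report("mylang"))  # [('mylang', '/', 1, 'illegal symbol')]
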
51 changes: 30 additions & 21 deletions src/lexedata/edit/detect_cognates.py
@@ -15,6 +15,8 @@
import lingpy.compare.partial

import lexedata.cli as cli
import lexedata.types as types


clts_path = cldfcatalog.Config.from_file().get_clone("clts")
clts = cldfbench.catalogs.CLTS(clts_path)
@@ -55,6 +57,26 @@ def clean_segments(segment_string: t.List[str]) -> t.Iterable[pyclts.models.Symb
return segments[1:-1]


def filter_function_factory(
dataset: types.Wordlist,
) -> t.Callable[[t.Dict[str, t.Any]], bool]:
def filter(row: t.Dict[str, t.Any]) -> bool:
row["tokens"] = [
str(x)
for x in clean_segments(row[dataset.column_names.forms.segments.lower()])
]
row["tokens"] = ["+" if x == "_" else x for x in row["tokens"]]
# TODO: Find the official LingPy way to consider word boundaries to
# also be morpheme boundaries – just adding them in
# `partial_cluster(sep=...+'_')` did not work, and why isn't it the
# default anyway?
row["doculect"] = row[dataset.column_names.forms.languageReference.lower()]
row["concept"] = row[dataset.column_names.forms.parameterReference.lower()]
return row["segments"] and row["concept"]

return filter


def cognate_code_to_file(
metadata: Path,
ratio: float,
@@ -66,28 +88,14 @@ def cognate_code_to_file(
mode: str,
output_file: Path,
) -> None:
dataset = pycldf.Wordlist.from_metadata(args.metadata)
dataset = pycldf.Wordlist.from_metadata(metadata)
assert (
dataset.column_names.forms.segments is not None
), "Dataset must have a CLDF #segments column."

def filter(row: t.Dict[str, t.Any]) -> bool:
row["tokens"] = [
str(x)
for x in clean_segments(row[dataset.column_names.forms.segments.lower()])
]
row["tokens"] = ["+" if x == "_" else x for x in row["tokens"]]
# TODO: Find the official LingPy way to consider word boundaries to
# also be morpheme boundaries – just adding them in
# `partial_cluster(sep=...+'_')` did not work, and why isn't it the
# default anyway?
row["doculect"] = row[dataset.column_names.forms.languageReference.lower()]
row["concept"] = row[dataset.column_names.forms.parameterReference.lower()]
return row["segments"] and row["concept"]

lex = lingpy.compare.partial.Partial.from_cldf(
metadata,
filter=filter,
filter=filter_function_factory(dataset),
columns=["doculect", "concept", "tokens"],
model=lingpy.data.model.Model(soundclass),
check=True,
@@ -132,12 +140,12 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
# For some purposes it is useful to have monolithic cognate classes.
lex.cluster(
method="lexstat",
threshold=args.threshold,
threshold=threshold,
ref="cogid",
cluster_method=cluster_method,
verbose=True,
override=True,
gop=args.gop,
gop=gop,
mode=mode,
)
# But actually, in most cases partial cognates are much more useful.
@@ -154,7 +162,7 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
lex.output("tsv", filename="auto-clusters")
alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
alm.align(method="progressive")
alm.output("tsv", filename=output_file, ignore="all", prettify=False)
alm.output("tsv", filename=str(output_file), ignore="all", prettify=False)

try:
dataset.add_component("CognateTable")
@@ -166,7 +174,7 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
...

read_back = csv.DictReader(
open(output_file + ".tsv", encoding="utf-8"), delimiter="\t"
open(str(output_file) + ".tsv", encoding="utf-8"), delimiter="\t"
)
cognatesets = {}
judgements = []
@@ -176,7 +184,8 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
alignment = line["ALIGNMENT"].split(" + ")
slice_start = 0
for cs, alm in zip(partial, alignment):
cognatesets.setdefault(cs, {"ID": cs})
# TODO: @Gereon: is it alright to add the same content to Name and ID?
cognatesets.setdefault(cs, {"ID": cs, "Name": cs})
length = len(alm.split())
judgements.append(
{
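
The refactor above hoists the row filter out of cognate_code_to_file into a factory, so the closure binds the dataset's column names once and the returned predicate can be handed to LingPy's filter= argument (and tested on its own). A standalone sketch of that closure pattern, with illustrative column names standing in for the CLDF ones:

    import typing as t


    def filter_function_factory(
        segments_col: str, language_col: str, concept_col: str
    ) -> t.Callable[[t.Dict[str, t.Any]], bool]:
        def filter(row: t.Dict[str, t.Any]) -> bool:
            # Treat word boundaries ("_") as morpheme boundaries ("+") for LingPy.
            row["tokens"] = ["+" if x == "_" else str(x) for x in row[segments_col]]
            row["doculect"] = row[language_col]
            row["concept"] = row[concept_col]
            # Keep only rows that have both segments and a concept.
            return bool(row[segments_col] and row[concept_col])

        return filter


    keep = filter_function_factory("segments", "language_id", "parameter_id")
    row = {"segments": ["t", "_", "a"], "language_id": "l1", "parameter_id": "hand"}
    print(keep(row), row["tokens"])  # True ['t', '+', 'a']
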
10 changes: 7 additions & 3 deletions src/lexedata/exporter/edictor.py
@@ -163,6 +163,7 @@ def forms_to_tsv(
c_segment_slice = dataset["CognateTable", "segmentSlice"].name
c_alignment = dataset["CognateTable", "alignment"].name
except KeyError:
# TODO: why not use directly: cli.EXIT.NO_COGNATETABLE(message) ?
logger.critical(
"""Edictor export requires your dataset to have an explicit CognateTable containing the judgements,
with all of IDs, forms, cognatesets, segment slices and alignments.
@@ -173,12 +174,14 @@ def forms_to_tsv(
c_form_language = dataset["FormTable", "languageReference"].name
c_form_concept = dataset["FormTable", "parameterReference"].name
c_form_id = dataset["FormTable", "id"].name
c_form_form = dataset["FormTable", "form"].name
try:
c_form_segments = dataset["FormTable", "segments"].name
except KeyError:
# TODO: same: why not use cli:Exit....() directly?
logger.critical(
"""Edictor export requires your dataset to have segments in the FormTable.
Run `lexedata.edit.segment_using_clts` to automatically add segments based on your forms."""
Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
)
# TODO: Exit.NO_SEGMENTS is not an `int`, so the exit code of the
# python run is actually 1, not 4 as we wanted.
@@ -203,6 +206,8 @@ def forms_to_tsv(
# select forms and cognates given restriction of languages and concepts, cognatesets respectively
forms = {}
for form in dataset["FormTable"]:
if form[c_form_form] is None or form[c_form_form] == "-":
continue
if form[c_form_language] in languages:
if concepts.intersection(ensure_list(form[c_form_concept])):
# Normalize the form:
@@ -214,14 +219,13 @@ def forms_to_tsv(
form[c] = d.join(form[c])
except TypeError:
logger.warning(
f"No segments found for form {form[c_form_id]}. You can generate segments using `lexedata.enrich.segment_using_clts`."
f"No segments found for form {form[c_form_id]}. You can generate segments using `lexedata.edit.add_segments`."
)
# 2. No tabs, newlines in entries
for c, v in form.items():
if type(v) == str:
form[c] = form[c].replace("\t", "!t").replace("\n", "!n")
forms[form[c_form_id]] = form

cognateset_cache: t.Mapping[t.Optional[str], int]
if "CognatesetTable" in dataset:
cognateset_cache = {
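
The export now skips NaN forms, the central convention of this PR: a #form value of None or "-" means "no form / not applicable". Sketched as a standalone helper (in the diff the predicate is inlined, not a named lexedata function):

    import typing as t


    def is_nan_form(form: t.Optional[str]) -> bool:
        # The placeholder values this PR treats as "no form": an empty cell
        # (None) or the bare dash "-".
        return form is None or form == "-"


    rows = [
        {"ID": "f1", "Form": "tapa"},
        {"ID": "f2", "Form": "-"},
        {"ID": "f3", "Form": None},
    ]
    print([r["ID"] for r in rows if not is_nan_form(r["Form"])])  # ['f1']
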
8 changes: 6 additions & 2 deletions src/lexedata/importer/excel_interleaved.py
@@ -99,6 +99,8 @@ def import_interleaved(
)
)
for form, cogset in zip(forms, cogsets + [None]):
if form == "?" or cogset == "?":
continue
base_id = util.string_to_id(f"{language_name}_{concepts[c]}")
id = base_id
synonym = 1
@@ -129,13 +131,15 @@

ws = openpyxl.load_workbook(args.excel)

w = csv.writer(open(Path(args.directory) / "forms.csv", "w", encoding="utf-8"))
w = csv.writer(
open(Path(args.directory) / "forms.csv", "w", newline="", encoding="utf-8")
)
w.writerow(
["ID", "Language_ID", "Parameter_ID", "Form", "Comment", "Cognateset_ID"]
)

if not args.sheet:
args.sheet = ws.get_sheet_names()
args.sheet = [sheet for sheet in ws.sheetnames]

ids: t.Set[str] = set()
for sheetname in args.sheet:
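
Two fixes above are worth spelling out: openpyxl's get_sheet_names() is deprecated in favor of the sheetnames property, and Python's csv module needs the underlying file opened with newline="", otherwise the writer's own line endings get translated again on Windows and rows come out as \r\r\n. A minimal sketch of both:

    import csv
    from pathlib import Path

    import openpyxl

    wb = openpyxl.Workbook()
    print(wb.sheetnames)  # property, replacing the deprecated get_sheet_names()

    # newline="" lets the csv module control line endings itself.
    with (Path(".") / "forms.csv").open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["ID", "Language_ID", "Parameter_ID", "Form", "Comment", "Cognateset_ID"])
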
11 changes: 7 additions & 4 deletions src/lexedata/importer/excel_long_format.py
@@ -79,6 +79,8 @@ def import_data_from_sheet(

for row in row_iter:
data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
if "?" in data.values():
continue
if "value" in implicit:
data[implicit["value"]] = "\t".join(map(str, data.values()))
concept_entry = data.pop(concept_column[1])
@@ -315,7 +317,10 @@ def add_single_languages(
f"Importing all forms independent of concept"
)
concepts = KeyKeyDict()
concept_column = dataset["FormTable", "parameterReference"].name
if concept_name:
concept_column = concept_name
else:
concept_column = dataset["FormTable", "parameterReference"].name
# add Status_Column if not existing and status_update given
if status_update:
add_status_column_to_table(dataset=dataset, table_name="FormTable")
@@ -405,9 +410,7 @@ def add_single_languages(

if not args.sheet:
sheets = [
sheet
for sheet in args.excel.worksheets
if sheet.title not in args.exclude_sheet
sheet for sheet in args.excel if sheet.title not in args.exclude_sheet
]
logger.info("No sheets specified explicitly. Parsing sheets: %s", args.sheet)
else:
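
The importer now drops any row containing a bare "?" (an unknown datum) instead of importing it as a form. A self-contained sketch of that convention, with rows as plain dicts standing in for the parsed sheet data:

    import typing as t


    def keep_rows(rows: t.Iterable[t.Dict[str, str]]) -> t.List[t.Dict[str, str]]:
        kept = []
        for data in rows:
            if "?" in data.values():  # any cell that is exactly "?": skip the row
                continue
            kept.append(data)
        return kept


    print(keep_rows([{"Form": "tapa"}, {"Form": "?"}]))  # [{'Form': 'tapa'}]
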
22 changes: 18 additions & 4 deletions src/lexedata/importer/excel_matrix.py
@@ -382,14 +382,28 @@ def parse_cells(
continue

# Parse the cell, which results (potentially) in multiple forms
if properties.__table__ == "FormTable":
c_f_form = self.db.dataset[properties.__table__, "form"].name
for params in self.cell_parser.parse(
cell_with_forms,
this_lan,
f"{sheet.title}.{cell_with_forms.coordinate}",
):
self.handle_form(
params, row_object, cell_with_forms, this_lan, status_update
)
if properties.__table__ == "FormTable":
if params[c_f_form] == "?":
continue
else:
self.handle_form(
params,
row_object,
cell_with_forms,
this_lan,
status_update,
)
else:
self.handle_form(
params, row_object, cell_with_forms, this_lan, status_update
)
self.db.commit()

def handle_form(
@@ -832,7 +846,7 @@ def load_dataset(
parser.add_argument(
"--cogsets",
type=Path,
default="",
default=None,
help="Path to an optional second Excel file containing cogsets and cognate judgements",
)
parser.add_argument(
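
The --cogsets default changes from "" to None because argparse applies the type conversion to string defaults: with type=Path, default="" silently becomes Path(""), i.e. Path("."), a truthy value indistinguishable from a user-supplied path. A sketch of the difference:

    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cogsets",
        type=Path,
        default=None,
        help="Path to an optional second Excel file containing cogsets and cognate judgements",
    )
    args = parser.parse_args([])
    print(args.cogsets is None)  # True: "no file given" is now detectable
    print(Path(""))  # '.', which is what default="" would have produced
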
10 changes: 9 additions & 1 deletion src/lexedata/report/homophones.py
@@ -24,7 +24,13 @@ def list_homophones(
clics = nx.Graph()

c_id = dataset["ParameterTable", "id"].name
c_concepticon = dataset["ParameterTable", "concepticonReference"].name
try:
c_concepticon = dataset["ParameterTable", "concepticonReference"].name
except KeyError:
cli.Exit.INVALID_DATASET(
"This script requires a column concepticonReference in ParamterTable. "
"Please run add_concepticon.py"
)
concepticon = {}
for concept in dataset["ParameterTable"]:
concepticon[concept[c_id]] = concept[c_concepticon]
@@ -39,6 +45,8 @@
] = t.DefaultDict(lambda: t.DefaultDict(set))

for form in dataset["FormTable"]:
if form[f_form] == "-" or form[f_form] is None:
continue
homophones[form[f_lang]][form[f_form]].add((form[f_concept], form[f_id]))

for lang, forms in homophones.items():
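
The homophones report now fails gracefully when the dataset lacks a concepticonReference column. A runnable sketch of that lookup-or-exit pattern, with a plain mapping standing in for the pycldf dataset and sys.exit standing in for lexedata's cli.Exit.INVALID_DATASET:

    import sys
    import typing as t


    def concepticon_column(columns: t.Mapping[t.Tuple[str, str], str]) -> str:
        try:
            # Stand-in for dataset["ParameterTable", "concepticonReference"].name
            return columns[("ParameterTable", "concepticonReference")]
        except KeyError:
            sys.exit(
                "This script requires a column concepticonReference in ParameterTable. "
                "Please run add_concepticon.py"
            )


    print(concepticon_column({("ParameterTable", "concepticonReference"): "Concepticon_ID"}))
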
1 change: 1 addition & 0 deletions src/lexedata/util/excel.py
@@ -457,6 +457,7 @@ def parse_form(
if element.startswith(start):
break
else:
# TODO: an additional `if` catching '-' might be necessary here
# The only thing we expect outside delimiters is the variant
# separators, '~' and '%'.
if self.variant_separator and element in self.variant_separator: