Add check for potentially problematic consonant clusters (#274)

* Check for potentially problematic consonant clusters * Update tests * Skip invalid segments * Add summary, add segments, fix imports
lexibank · Apr 30, 2024 · cbbaa1f · cbbaa1f
1 parent 1a52624
commit cbbaa1f
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 0 deletions.
diff --git a/src/pylexibank/commands/consonant_clusters.py b/src/pylexibank/commands/consonant_clusters.py
@@ -0,0 +1,73 @@
+"""
+Check for (potentially) problematic consonant clusters of length >= 3 (adjustable via --length).
+"""
+
+import collections
+
+from cldfbench.cli_util import with_dataset, add_catalog_spec
+from clldutils.clilib import Table, add_format
+
+from pylexibank.cli_util import add_dataset_spec
+
+
+def register(parser):
+    """
+    Register the command-line arguments of the `consonant_clusters` command.
+
+    Adds the dataset spec, the "clts" catalog spec and the format option (default "pipe"),
+    followed by `-l/--length`, the minimum cluster length to check for (default 3).
+
+    :param parser: Argument parser the options are added to.
+    """
+    # Shared specs first, so arguments are registered in the same order as before.
+    add_dataset_spec(parser)
+    add_catalog_spec(parser, "clts")
+    add_format(parser, default="pipe")
+
+    length_option = dict(
+        type=int,
+        default=3,
+        help="Check for consonant clusters of this length or more",
+    )
+    parser.add_argument("-l", "--length", **length_option)
+
+
+def run(args):
+    """Run the command: pass `args` and `collect_consonant_clusters` to `with_dataset`."""
+    with_dataset(args, collect_consonant_clusters)
+
+
+def compute_consonant_cluster(word, sound_names):
+    """
+    Group the segments of a word into runs of consecutive non-vocalic sounds.
+
+    A sound whose name ends in "diphthong", "vowel", "tone", "�" or "marker" closes the
+    current run and is itself dropped; every other segment is appended to the current run.
+
+    :param word: Segments of one word or morpheme.
+    :param sound_names: Sound names aligned index-by-index with `word`; only the last
+        space-separated token of each name is inspected.
+    :return: List of non-empty runs, each a list of segments taken from `word`. Empty
+        input yields an empty list.
+    """
+    # NOTE(review): "�" presumably is what unknown sounds are named - confirm against CLTS.
+    breakers = ("diphthong", "vowel", "tone", "�", "marker")
+    # Always start with one open run. Looking at `sound_names[0]` to decide this raised
+    # IndexError on empty input and left no run to append to when the first sound was
+    # neither a consonant/cluster nor a breaker. Empty runs are filtered out below, so
+    # the result is unchanged for every input that worked before.
+    out = [[]]
+    for segment, name in zip(word, sound_names):
+        if name.split(" ")[-1] in breakers:
+            out.append([])
+        else:
+            out[-1].append(segment)
+    return [chunk for chunk in out if chunk]
+
+
+def collect_consonant_clusters(dataset, args):
+    """
+    Tabulate the consonant clusters in a dataset's forms that reach the requested length.
+
+    Reads `forms.csv` from the dataset's CLDF directory in order of form ID, splits each
+    row's segments at " + " and collects the consonant clusters of every resulting chunk.
+    Rows whose segments contain "<<" are logged as invalid and skipped. One table row is
+    written per cluster and language for clusters of at least `args.length` segments, and
+    a summary is logged as a warning once the table is closed.
+
+    :param dataset: Dataset object; only `dataset.cldf_dir.read_csv` is used here.
+    :param args: Parsed arguments; `args.log`, `args.clts.api.bipa` and `args.length` are
+        used here, and `args` itself is handed to `Table`.
+    """
+    # cluster (tuple of segments) -> language ID -> flat list [segments, form, segments, ...]
+    occurrences = collections.defaultdict(lambda: collections.defaultdict(list))
+
+    with Table(args, "Language_ID", "Length", "Cluster", "Words") as table:
+        forms = dataset.cldf_dir.read_csv("forms.csv", dicts=True)
+        for row in sorted(forms, key=lambda form: form["ID"]):
+            if "<<" in row["Segments"]:
+                args.log.warning("Invalid segments in {0} (ID: {1}).".format(row["Segments"], row["ID"]))
+                continue
+
+            for chunk in row["Segments"].split(" + "):
+                tokens = chunk.split()
+                names = [sound.name for sound in args.clts.api.bipa(tokens)]
+                for cluster in compute_consonant_cluster(tokens, names):
+                    occurrences[tuple(cluster)][row["Language_ID"]].extend(
+                        (row["Segments"], row["Form"])
+                    )
+
+        cases = 0
+        # Shortest clusters first; the sort is stable, so equal lengths keep insertion order.
+        for cluster in sorted(occurrences, key=len):
+            if len(cluster) < args.length:
+                continue
+            cases += 1
+            size = str(len(cluster))
+            label = " ".join(cluster)
+            for language, words in occurrences[cluster].items():
+                table.append([language, size, label, " // ".join(words)])
+
+    args.log.warning(f"Found {cases} potentially problematic consonant cluster(s) with length {args.length}.")
diff --git a/tests/repos/datasets/test_dataset_multi_profile_with_cldf/cldf/forms.csv b/tests/repos/datasets/test_dataset_multi_profile_with_cldf/cldf/forms.csv
@@ -8,3 +8,5 @@ lang1-param1-6,,lang1,param1,ayz,ayz,b x/y ɣː ʒ’,,abc,,,^ a y z $,p1
 lang1-param1-7,,lang1,param1,axg,axg,b x/y ɡʷ,,abc,,,^ a x g $,p1
 lang1-param1-7,,lang1,param1,axd,axdou,b x/y dʱʷ,,abc,,,^ a x d $,p1
 lang1-param1-8,,lang1,param1,axd,axdou,b x/y dʱʷ,,abc,,,^ a x d $,p1
+lang1-param1-9,,lang1,param1,axd,axdou,ɡ̤ː ɡ̤ː b,,abc,,,^ a x d $,p1
+lang1-param1-10,,lang1,param1,axd,axdou,ɡ̤ː ɡ̤ː b dʱʷ dʱʷ dʱʷ,,abc,,,^ a x d $,p1
diff --git a/tests/test_commands.py b/tests/test_commands.py
@@ -128,6 +128,18 @@ def test_check_phonotactics(dataset, capsys):
 | 1 | lang2-param2-2 | a~b-c | ab | | + a + + b + |"""
 
 
+def test_consonant_clusters(dataset, repos, caplog, capsys):
+    """Run `lexibank.consonant_clusters -l 4` on the multi-profile CLDF test dataset and check the summary warning and the printed table."""
+    d = repos / 'datasets' / 'test_dataset_multi_profile_with_cldf'
+    _main('lexibank.consonant_clusters -l 4 {0} --clts {1}'.format(str(d / 'tdmpcldf.py'), repos))
+    # The summary is emitted at WARNING level, so it is picked up from the captured log records.
+    warnings = [r.message for r in caplog.records if r.levelname == 'WARNING']
+    out, _ = capsys.readouterr()
+
+    assert any("Found 1 potentially problematic consonant cluster(s) with length 4" in w for w in warnings)
+    # With -l 4 only the six-segment cluster is expected in the table.
+    assert out.strip() == """| Language_ID | Length | Cluster | Words |
|:--------------|---------:|:----------------------|:-------------------------------|
| lang1 | 6 | ɡ̤ː ɡ̤ː b dʱʷ dʱʷ dʱʷ | ɡ̤ː ɡ̤ː b dʱʷ dʱʷ dʱʷ // axdou |"""
+
+
 def test_check_profile(dataset, repos, caplog, capsys):
     _main('lexibank.check_profile {0} --clts {1}'.format(str(dataset.dir / 'td.py'), repos))
     assert len(caplog.records) == 2