Skip to content

Commit

Permalink
Add check for potentially problematic consonant clusters (#274)
Browse files Browse the repository at this point in the history
* Check for potentially problematic consonant clusters

* Update tests

* Skip invalid segments

* Add summary, add segments, fix imports
  • Loading branch information
chrzyki committed Apr 30, 2024
1 parent 1a52624 commit cbbaa1f
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 0 deletions.
73 changes: 73 additions & 0 deletions src/pylexibank/commands/consonant_clusters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""
Check for (potentially) problematic consonant clusters >= 3.
"""

import collections

from cldfbench.cli_util import with_dataset, add_catalog_spec
from clldutils.clilib import Table, add_format

from pylexibank.cli_util import add_dataset_spec


def register(parser):
    """
    Register the command line interface of this command on `parser`.

    Adds the dataset spec, the "clts" catalog spec, the output format option
    (defaulting to "pipe") and the `-l/--length` option (an int, default 3).
    """
    add_dataset_spec(parser)
    add_catalog_spec(parser, "clts")
    add_format(parser, default="pipe")

    # Keyword settings of the minimum-length option, kept together for readability.
    length_option = dict(
        type=int,
        default=3,
        help="Check for consonant clusters of this length or more",
    )
    parser.add_argument("-l", "--length", **length_option)


def run(args):
    """
    Entry point of the command: hand `collect_consonant_clusters` to
    `with_dataset` together with the parsed `args`.
    """
    handler = collect_consonant_clusters
    with_dataset(args, handler)


def compute_consonant_cluster(word, sound_names):
    """
    Split a segmented word into its maximal runs of non-boundary segments.

    Only the last space-separated token of each sound name is inspected. A
    sound whose name ends in "diphthong", "vowel", "tone", "�" or "marker"
    closes the current run and is itself dropped; every other sound
    contributes the segment at the same index of `word` to the current run.

    :param word: Indexable sequence of segments.
    :param sound_names: Sound names (strings), index-aligned with `word`.
    :return: List of non-empty lists of segments, in order of occurrence. \
    Empty for empty input or when no run contains a segment.
    """
    boundaries = ("diphthong", "vowel", "tone", "�", "marker")

    # Always start with one open chunk. Empty chunks are filtered out below, so
    # this gives the same result for boundary-initial words, and additionally
    # handles empty input and words whose first sound is of any other class
    # (both of which would otherwise fail with an IndexError).
    out = [[]]
    for i, sound in enumerate(sound_names):
        if sound.split(" ")[-1] in boundaries:
            out.append([])
        else:
            out[-1].append(word[i])
    return [chunk for chunk in out if chunk]


def collect_consonant_clusters(dataset, args):
    """
    Tabulate consonant clusters of at least `args.length` segments.

    Reads the rows of "forms.csv" from `dataset.cldf_dir` (sorted by "ID"),
    splits each row's "Segments" into morphemes on " + ", looks up the sound
    names via `args.clts.api.bipa` and collects the clusters returned by
    `compute_consonant_cluster`. Rows whose "Segments" contain "<<" are
    reported with a warning and skipped. Every cluster with at least
    `args.length` segments is written to the output table, one line per
    language, and a summary warning with the number of such clusters is logged.

    :param dataset: Dataset object providing `cldf_dir.read_csv`.
    :param args: Parsed command line arguments; `log`, `clts` and `length` are used \
    here, and the object is also passed on to `Table`.
    """
    # cluster (tuple of segments) -> Language_ID -> [Segments, Form, Segments, Form, ...]
    by_cluster = collections.defaultdict(lambda: collections.defaultdict(list))

    with Table(args, "Language_ID", "Length", "Cluster", "Words") as table:
        rows = sorted(
            dataset.cldf_dir.read_csv("forms.csv", dicts=True),
            key=lambda r: r["ID"],
        )
        for row in rows:
            if "<<" in row["Segments"]:
                args.log.warning(
                    "Invalid segments in {0} (ID: {1}).".format(row["Segments"], row["ID"])
                )
                continue

            for chunk in row["Segments"].split(" + "):
                morpheme = chunk.split()
                if not morpheme:
                    # A form without segments, or an empty morpheme between
                    # separators, has no sounds to analyse; skip it instead of
                    # passing empty sequences on.
                    continue

                sounds = [s.name for s in args.clts.api.bipa(morpheme)]
                for cluster in compute_consonant_cluster(morpheme, sounds):
                    by_cluster[tuple(cluster)][row["Language_ID"]] += [
                        row["Segments"],
                        row["Form"],
                    ]

        cases = 0
        # Shortest clusters first; the sort is stable, so clusters of equal
        # length keep the order in which they were first encountered.
        for cluster in sorted(by_cluster, key=len):
            if len(cluster) < args.length:
                continue
            cases += 1
            for language, words in by_cluster[cluster].items():
                table.append(
                    [language, str(len(cluster)), " ".join(cluster), " // ".join(words)]
                )

    args.log.warning(f"Found {cases} potentially problematic consonant cluster(s) with length {args.length}.")
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ lang1-param1-6,,lang1,param1,ayz,ayz,b x/y ɣː ʒ’,,abc,,,^ a y z $,p1
lang1-param1-7,,lang1,param1,axg,axg,b x/y ɡʷ,,abc,,,^ a x g $,p1
lang1-param1-7,,lang1,param1,axd,axdou,b x/y dʱʷ,,abc,,,^ a x d $,p1
lang1-param1-8,,lang1,param1,axd,axdou,b x/y dʱʷ,,abc,,,^ a x d $,p1
lang1-param1-9,,lang1,param1,axd,axdou,ɡ̤ː ɡ̤ː b,,abc,,,^ a x d $,p1
lang1-param1-10,,lang1,param1,axd,axdou,ɡ̤ː ɡ̤ː b dʱʷ dʱʷ dʱʷ,,abc,,,^ a x d $,p1
12 changes: 12 additions & 0 deletions tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,18 @@ def test_check_phonotactics(dataset, capsys):
| 1 | lang2-param2-2 | a~b-c | ab | | + a + + b + |"""


def test_consonant_clusters(dataset, repos, caplog, capsys):
    """
    Run `lexibank.consonant_clusters -l 4` on the multi-profile test dataset
    and check both the logged summary warning and the printed table.
    """
    dataset_dir = repos / 'datasets' / 'test_dataset_multi_profile_with_cldf'
    module_path = dataset_dir / 'tdmpcldf.py'
    _main(f'lexibank.consonant_clusters -l 4 {module_path} --clts {repos}')

    warning_messages = [
        record.message for record in caplog.records if record.levelname == 'WARNING'
    ]
    out, _ = capsys.readouterr()

    summary = "Found 1 potentially problematic consonant cluster(s) with length 4"
    expected_table = """| Language_ID | Length | Cluster | Words |
|:--------------|---------:|:----------------------|:-------------------------------|
| lang1 | 6 | ɡ̤ː ɡ̤ː b dʱʷ dʱʷ dʱʷ | ɡ̤ː ɡ̤ː b dʱʷ dʱʷ dʱʷ // axdou |"""

    assert any(summary in message for message in warning_messages)
    assert out.strip() == expected_table


def test_check_profile(dataset, repos, caplog, capsys):
_main('lexibank.check_profile {0} --clts {1}'.format(str(dataset.dir / 'td.py'), repos))
assert len(caplog.records) == 2
Expand Down

0 comments on commit cbbaa1f

Please sign in to comment.