Merge pull request #197 from Anaphory/nan-forms
Deal with NAN forms
Anaphory committed Sep 22, 2021
2 parents a034a4b + 0a65c7f commit 1756f60
Showing 11 changed files with 710 additions and 92 deletions.
48 changes: 32 additions & 16 deletions src/lexedata/edit/add_segments.py
@@ -34,14 +34,22 @@
tokenizer = segments.Tokenizer()


@attr.s(auto_attribs=True)
class ReportEntry:
count: int = 0
comment: str = ""


@attr.s(auto_attribs=True)
class SegmentReport:
sounds: defaultdict = defaultdict(lambda: {"count": 0, "comment": ""})
sounds: t.MutableMapping[str, ReportEntry] = attr.ib(
factory=lambda: defaultdict(ReportEntry)
)

def __call__(self, name: str) -> t.Tuple[str, str, int, str]:
def __call__(self, name: str) -> t.List[t.Tuple[str, str, int, str]]:
res = []
for k, v in self.sounds.items():
res.append((name, k, v["count"], v["comment"]))
res.append((name, k, v.count, v.comment))
return res


@@ -137,8 +145,8 @@ def segment_form(
i -= 1
continue
if raw_tokens[i].source == "/":
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])]["comment"] = "illegal symbol"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "illegal symbol"
del raw_tokens[i]
logger.warning(
f"{context_for_warnings}Impossible sound '/' encountered in {formstring} – "
@@ -161,26 +169,32 @@ def segment_form(
i -= 1
continue
if grapheme.endswith("ⁿ") or grapheme.endswith("ᵐ") or grapheme.endswith("ᵑ"):
if i + 1 > len(raw_tokens) - 1 or raw_tokens[i + 1].preceding is not None:
if (
i + 1 > len(raw_tokens) - 1
or not hasattr(raw_tokens[i + 1], "preceding")
or raw_tokens[i + 1].preceding is not None
):
logger.warning(
f"{context_for_warnings}Unknown sound {raw_tokens[i]} encountered in {formstring}"
)
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])][
"comment"
] = "unknown pre-nasalization"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "unknown pre-nasalization"
i -= 1
continue
raw_tokens[i + 1] = bipa["pre-nasalized " + raw_tokens[i + 1].name]
raw_tokens[i] = bipa[grapheme[:-1]]
continue
if grapheme.endswith("ʰ"):
if i + 1 > len(raw_tokens) - 1 or raw_tokens[i + 1].preceding is not None:
if (
i + 1 > len(raw_tokens) - 1
or not hasattr(raw_tokens[i + 1], "preceding")
or raw_tokens[i + 1].preceding is not None
):
logger.warning(
f"{context_for_warnings}Unknown sound {raw_tokens[i]} encountered in {formstring}"
)
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])]["comment"] = "unknown pre-aspiration"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "unknown pre-aspiration"
i -= 1
continue
raw_tokens[i + 1] = bipa["pre-aspirated " + raw_tokens[i + 1].name]
Expand All @@ -189,8 +203,8 @@ def segment_form(
logger.warning(
f"{context_for_warnings}Unknown sound {raw_tokens[i]} encountered in {formstring}"
)
report.sounds[str(raw_tokens[i])]["count"] += 1
report.sounds[str(raw_tokens[i])]["comment"] = "unknown sound"
report.sounds[str(raw_tokens[i])].count += 1
report.sounds[str(raw_tokens[i])].comment = "unknown sound"
i -= 1

return raw_tokens
@@ -227,7 +241,9 @@ def add_segments_to_dataset(
write_back.append(row)
continue
else:
if row[transcription]:
if row[transcription] is None or row[transcription] == "-":
row[dataset.column_names.forms.segments] = ""
elif row[transcription]:
form = row[transcription].strip()
for wrong, right in pre_replace.items():
if wrong in form:
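
A minimal, self-contained sketch of the pattern this file's hunks introduce: the report becomes a defaultdict of small attrs classes instead of raw {"count": ..., "comment": ...} dicts, giving each entry typed attribute access. Assumes the attrs package is installed; the recorded sound and comment are illustrative.

    import typing as t
    from collections import defaultdict

    import attr


    @attr.s(auto_attribs=True)
    class ReportEntry:
        count: int = 0
        comment: str = ""


    @attr.s(auto_attribs=True)
    class SegmentReport:
        sounds: t.MutableMapping[str, ReportEntry] = attr.ib(
            factory=lambda: defaultdict(ReportEntry)
        )

        def __call__(self, name: str) -> t.List[t.Tuple[str, str, int, str]]:
            # One (language, sound, count, comment) row per recorded sound.
            return [(name, k, v.count, v.comment) for k, v in self.sounds.items()]


    report = SegmentReport()
    report.sounds["/"].count += 1  # missing keys get a fresh ReportEntry
    report.sounds["/"].comment = "illegal symbol"
    print(report("mylang"))  # [('mylang', '/', 1, 'illegal symbol')]
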
51 changes: 30 additions & 21 deletions src/lexedata/edit/detect_cognates.py
@@ -15,6 +15,8 @@
import lingpy.compare.partial

import lexedata.cli as cli
import lexedata.types as types


clts_path = cldfcatalog.Config.from_file().get_clone("clts")
clts = cldfbench.catalogs.CLTS(clts_path)
@@ -55,6 +57,26 @@ def clean_segments(segment_string: t.List[str]) -> t.Iterable[pyclts.models.Symb
return segments[1:-1]


def filter_function_factory(
dataset: types.Wordlist,
) -> t.Callable[[t.Dict[str, t.Any]], bool]:
def filter(row: t.Dict[str, t.Any]) -> bool:
row["tokens"] = [
str(x)
for x in clean_segments(row[dataset.column_names.forms.segments.lower()])
]
row["tokens"] = ["+" if x == "_" else x for x in row["tokens"]]
# TODO: Find the official LingPy way to consider word boundaries to
# also be morpheme boundaries – just adding them in
# `partial_cluster(sep=...+'_')` did not work, and why isn't it the
# default anyway?
row["doculect"] = row[dataset.column_names.forms.languageReference.lower()]
row["concept"] = row[dataset.column_names.forms.parameterReference.lower()]
return row["segments"] and row["concept"]

return filter


def cognate_code_to_file(
metadata: Path,
ratio: float,
@@ -66,28 +88,14 @@ def cognate_code_to_file(
mode: str,
output_file: Path,
) -> None:
dataset = pycldf.Wordlist.from_metadata(args.metadata)
dataset = pycldf.Wordlist.from_metadata(metadata)
assert (
dataset.column_names.forms.segments is not None
), "Dataset must have a CLDF #segments column."

def filter(row: t.Dict[str, t.Any]) -> bool:
row["tokens"] = [
str(x)
for x in clean_segments(row[dataset.column_names.forms.segments.lower()])
]
row["tokens"] = ["+" if x == "_" else x for x in row["tokens"]]
# TODO: Find the official LingPy way to consider word boundaries to
# also be morpheme boundaries – just adding them in
# `partial_cluster(sep=...+'_')` did not work, and why isn't it the
# default anyway?
row["doculect"] = row[dataset.column_names.forms.languageReference.lower()]
row["concept"] = row[dataset.column_names.forms.parameterReference.lower()]
return row["segments"] and row["concept"]

lex = lingpy.compare.partial.Partial.from_cldf(
metadata,
filter=filter,
filter=filter_function_factory(dataset),
columns=["doculect", "concept", "tokens"],
model=lingpy.data.model.Model(soundclass),
check=True,
@@ -132,12 +140,12 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
# For some purposes it is useful to have monolithic cognate classes.
lex.cluster(
method="lexstat",
threshold=args.threshold,
threshold=threshold,
ref="cogid",
cluster_method=cluster_method,
verbose=True,
override=True,
gop=args.gop,
gop=gop,
mode=mode,
)
# But actually, in most cases partial cognates are much more useful.
@@ -154,7 +162,7 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
lex.output("tsv", filename="auto-clusters")
alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
alm.align(method="progressive")
alm.output("tsv", filename=output_file, ignore="all", prettify=False)
alm.output("tsv", filename=str(output_file), ignore="all", prettify=False)

try:
dataset.add_component("CognateTable")
@@ -166,7 +174,7 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
...

read_back = csv.DictReader(
open(output_file + ".tsv", encoding="utf-8"), delimiter="\t"
open(str(output_file) + ".tsv", encoding="utf-8"), delimiter="\t"
)
cognatesets = {}
judgements = []
@@ -176,7 +184,8 @@ def filter(row: t.Dict[str, t.Any]) -> bool:
alignment = line["ALIGNMENT"].split(" + ")
slice_start = 0
for cs, alm in zip(partial, alignment):
cognatesets.setdefault(cs, {"ID": cs})
# TODO: @Gereon: is it alright to add the same content to Name and ID?
cognatesets.setdefault(cs, {"ID": cs, "Name": cs})
length = len(alm.split())
judgements.append(
{
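
The refactor above hoists the row filter out of cognate_code_to_file into a factory, so the closure binds the dataset's column names once and the returned predicate can be handed to LingPy's filter= argument (and tested on its own). A standalone sketch of that closure pattern, with illustrative column names standing in for the CLDF ones:

    import typing as t


    def filter_function_factory(
        segments_col: str, language_col: str, concept_col: str
    ) -> t.Callable[[t.Dict[str, t.Any]], bool]:
        def filter(row: t.Dict[str, t.Any]) -> bool:
            # Treat word boundaries ("_") as morpheme boundaries ("+") for LingPy.
            row["tokens"] = ["+" if x == "_" else str(x) for x in row[segments_col]]
            row["doculect"] = row[language_col]
            row["concept"] = row[concept_col]
            # Keep only rows that have both segments and a concept.
            return bool(row[segments_col] and row[concept_col])

        return filter


    keep = filter_function_factory("segments", "language_id", "parameter_id")
    row = {"segments": ["t", "_", "a"], "language_id": "l1", "parameter_id": "hand"}
    print(keep(row), row["tokens"])  # True ['t', '+', 'a']
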
10 changes: 7 additions & 3 deletions src/lexedata/exporter/edictor.py
@@ -163,6 +163,7 @@ def forms_to_tsv(
c_segment_slice = dataset["CognateTable", "segmentSlice"].name
c_alignment = dataset["CognateTable", "alignment"].name
except KeyError:
# TODO: why not use directly: cli.EXIT.NO_COGNATETABLE(message) ?
logger.critical(
"""Edictor export requires your dataset to have an explicit CognateTable containing the judgements,
with all of IDs, forms, cognatesets, segment slices and alignments.
@@ -173,12 +174,14 @@ def forms_to_tsv(
c_form_language = dataset["FormTable", "languageReference"].name
c_form_concept = dataset["FormTable", "parameterReference"].name
c_form_id = dataset["FormTable", "id"].name
c_form_form = dataset["FormTable", "form"].name
try:
c_form_segments = dataset["FormTable", "segments"].name
except KeyError:
# TODO: same: why not use cli:Exit....() directly?
logger.critical(
"""Edictor export requires your dataset to have segments in the FormTable.
Run `lexedata.edit.segment_using_clts` to automatically add segments based on your forms."""
Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
)
# TODO: Exit.NO_SEGMENTS is not an `int`, so the exit code of the
# python run is actually 1, not 4 as we wanted.
@@ -203,6 +206,8 @@ def forms_to_tsv(
# select forms and cognates given restriction of languages and concepts, cognatesets respectively
forms = {}
for form in dataset["FormTable"]:
if form[c_form_form] is None or form[c_form_form] == "-":
continue
if form[c_form_language] in languages:
if concepts.intersection(ensure_list(form[c_form_concept])):
# Normalize the form:
@@ -214,14 +219,13 @@ def forms_to_tsv(
form[c] = d.join(form[c])
except TypeError:
logger.warning(
f"No segments found for form {form[c_form_id]}. You can generate segments using `lexedata.enrich.segment_using_clts`."
f"No segments found for form {form[c_form_id]}. You can generate segments using `lexedata.edit.add_segments`."
)
# 2. No tabs, newlines in entries
for c, v in form.items():
if type(v) == str:
form[c] = form[c].replace("\t", "!t").replace("\n", "!n")
forms[form[c_form_id]] = form

cognateset_cache: t.Mapping[t.Optional[str], int]
if "CognatesetTable" in dataset:
cognateset_cache = {
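
The export now skips NaN forms, the central convention of this PR: a #form value of None or "-" means "no form / not applicable". Sketched as a standalone helper (in the diff the predicate is inlined, not a named lexedata function):

    import typing as t


    def is_nan_form(form: t.Optional[str]) -> bool:
        # The placeholder values this PR treats as "no form": an empty cell
        # (None) or the bare dash "-".
        return form is None or form == "-"


    rows = [
        {"ID": "f1", "Form": "tapa"},
        {"ID": "f2", "Form": "-"},
        {"ID": "f3", "Form": None},
    ]
    print([r["ID"] for r in rows if not is_nan_form(r["Form"])])  # ['f1']
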
8 changes: 6 additions & 2 deletions src/lexedata/importer/excel_interleaved.py
@@ -99,6 +99,8 @@ def import_interleaved(
)
)
for form, cogset in zip(forms, cogsets + [None]):
if form == "?" or cogset == "?":
continue
base_id = util.string_to_id(f"{language_name}_{concepts[c]}")
id = base_id
synonym = 1
@@ -129,13 +131,15 @@

ws = openpyxl.load_workbook(args.excel)

w = csv.writer(open(Path(args.directory) / "forms.csv", "w", encoding="utf-8"))
w = csv.writer(
open(Path(args.directory) / "forms.csv", "w", newline="", encoding="utf-8")
)
w.writerow(
["ID", "Language_ID", "Parameter_ID", "Form", "Comment", "Cognateset_ID"]
)

if not args.sheet:
args.sheet = ws.get_sheet_names()
args.sheet = [sheet for sheet in ws.sheetnames]

ids: t.Set[str] = set()
for sheetname in args.sheet:
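
Two fixes above are worth spelling out: openpyxl's get_sheet_names() is deprecated in favor of the sheetnames property, and Python's csv module needs the underlying file opened with newline="", otherwise the writer's own line endings get translated again on Windows and rows come out as \r\r\n. A minimal sketch of both:

    import csv
    from pathlib import Path

    import openpyxl

    wb = openpyxl.Workbook()
    print(wb.sheetnames)  # property, replacing the deprecated get_sheet_names()

    # newline="" lets the csv module control line endings itself.
    with (Path(".") / "forms.csv").open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["ID", "Language_ID", "Parameter_ID", "Form", "Comment", "Cognateset_ID"])
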
11 changes: 7 additions & 4 deletions src/lexedata/importer/excel_long_format.py
@@ -79,6 +79,8 @@ def import_data_from_sheet(

for row in row_iter:
data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
if "?" in data.values():
continue
if "value" in implicit:
data[implicit["value"]] = "\t".join(map(str, data.values()))
concept_entry = data.pop(concept_column[1])
@@ -315,7 +317,10 @@ def add_single_languages(
f"Importing all forms independent of concept"
)
concepts = KeyKeyDict()
concept_column = dataset["FormTable", "parameterReference"].name
if concept_name:
concept_column = concept_name
else:
concept_column = dataset["FormTable", "parameterReference"].name
# add Status_Column if not existing and status_update given
if status_update:
add_status_column_to_table(dataset=dataset, table_name="FormTable")
@@ -405,9 +410,7 @@ def add_single_languages(

if not args.sheet:
sheets = [
sheet
for sheet in args.excel.worksheets
if sheet.title not in args.exclude_sheet
sheet for sheet in args.excel if sheet.title not in args.exclude_sheet
]
logger.info("No sheets specified explicitly. Parsing sheets: %s", args.sheet)
else:
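
The importer now drops any row containing a bare "?" (an unknown datum) instead of importing it as a form. A self-contained sketch of that convention, with rows as plain dicts standing in for the parsed sheet data:

    import typing as t


    def keep_rows(rows: t.Iterable[t.Dict[str, str]]) -> t.List[t.Dict[str, str]]:
        kept = []
        for data in rows:
            if "?" in data.values():  # any cell that is exactly "?": skip the row
                continue
            kept.append(data)
        return kept


    print(keep_rows([{"Form": "tapa"}, {"Form": "?"}]))  # [{'Form': 'tapa'}]
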
22 changes: 18 additions & 4 deletions src/lexedata/importer/excel_matrix.py
@@ -382,14 +382,28 @@ def parse_cells(
continue

# Parse the cell, which results (potentially) in multiple forms
if properties.__table__ == "FormTable":
c_f_form = self.db.dataset[properties.__table__, "form"].name
for params in self.cell_parser.parse(
cell_with_forms,
this_lan,
f"{sheet.title}.{cell_with_forms.coordinate}",
):
self.handle_form(
params, row_object, cell_with_forms, this_lan, status_update
)
if properties.__table__ == "FormTable":
if params[c_f_form] == "?":
continue
else:
self.handle_form(
params,
row_object,
cell_with_forms,
this_lan,
status_update,
)
else:
self.handle_form(
params, row_object, cell_with_forms, this_lan, status_update
)
self.db.commit()

def handle_form(
@@ -832,7 +846,7 @@ def load_dataset(
parser.add_argument(
"--cogsets",
type=Path,
default="",
default=None,
help="Path to an optional second Excel file containing cogsets and cognate judgements",
)
parser.add_argument(
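
The --cogsets default changes from "" to None because argparse applies the type conversion to string defaults: with type=Path, default="" silently becomes Path(""), i.e. Path("."), a truthy value indistinguishable from a user-supplied path. A sketch of the difference:

    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cogsets",
        type=Path,
        default=None,
        help="Path to an optional second Excel file containing cogsets and cognate judgements",
    )
    args = parser.parse_args([])
    print(args.cogsets is None)  # True: "no file given" is now detectable
    print(Path(""))  # '.', which is what default="" would have produced
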
10 changes: 9 additions & 1 deletion src/lexedata/report/homophones.py
@@ -24,7 +24,13 @@ def list_homophones(
clics = nx.Graph()

c_id = dataset["ParameterTable", "id"].name
c_concepticon = dataset["ParameterTable", "concepticonReference"].name
try:
c_concepticon = dataset["ParameterTable", "concepticonReference"].name
except KeyError:
cli.Exit.INVALID_DATASET(
"This script requires a column concepticonReference in ParamterTable. "
"Please run add_concepticon.py"
)
concepticon = {}
for concept in dataset["ParameterTable"]:
concepticon[concept[c_id]] = concept[c_concepticon]
@@ -39,6 +45,8 @@
] = t.DefaultDict(lambda: t.DefaultDict(set))

for form in dataset["FormTable"]:
if form[f_form] == "-" or form[f_form] is None:
continue
homophones[form[f_lang]][form[f_form]].add((form[f_concept], form[f_id]))

for lang, forms in homophones.items():
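
The homophones report now fails gracefully when the dataset lacks a concepticonReference column. A runnable sketch of that lookup-or-exit pattern, with a plain mapping standing in for the pycldf dataset and sys.exit standing in for lexedata's cli.Exit.INVALID_DATASET:

    import sys
    import typing as t


    def concepticon_column(columns: t.Mapping[t.Tuple[str, str], str]) -> str:
        try:
            # Stand-in for dataset["ParameterTable", "concepticonReference"].name
            return columns[("ParameterTable", "concepticonReference")]
        except KeyError:
            sys.exit(
                "This script requires a column concepticonReference in ParameterTable. "
                "Please run add_concepticon.py"
            )


    print(concepticon_column({("ParameterTable", "concepticonReference"): "Concepticon_ID"}))
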
1 change: 1 addition & 0 deletions src/lexedata/util/excel.py
@@ -457,6 +457,7 @@ def parse_form(
if element.startswith(start):
break
else:
# TODO: an additional `if` catching '-' might be necessary here
# The only thing we expect outside delimiters is the variant
# separators, '~' and '%'.
if self.variant_separator and element in self.variant_separator: