Skip to content

Commit

Permalink
refactor(config): update binary_features to multinary_features
Browse files Browse the repository at this point in the history
  • Loading branch information
wajdikhattel authored and lukehsiao committed Jul 1, 2020
1 parent e0a1ca7 commit 21a5a1e
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 21 deletions.
2 changes: 1 addition & 1 deletion docs/user/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The default ``.fonduer-config.yaml`` configuration file is shown below::
max: 2
get_col_ngrams:
max: 2
binary_features:
multinary_features:
min_row_diff:
absolute: False
min_col_diff:
Expand Down
2 changes: 1 addition & 1 deletion docs/user/features.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ The different featurization parameters are explained in this section::
get_col_ngrams:
max: 2
# multinary feature settings
binary_features:
multinary_features:
# minimal difference in rows to check
min_row_diff:
absolute: False
Expand Down
40 changes: 23 additions & 17 deletions src/fonduer/features/feature_libs/tabular_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
DEF_VALUE = 1

unary_tablelib_feats: Dict[str, Set] = {}
multary_strlib_feats: Dict[str, Set] = {}
multinary_tablelib_feats: Dict[str, Set] = {}

settings = get_config()

Expand Down Expand Up @@ -62,12 +62,12 @@ def extract_tabular_features(
for f, v in unary_tablelib_feats[span.stable_id]:
yield candidate.id, FEAT_PRE + prefix + f, v

if candidate.id not in multary_strlib_feats:
multary_strlib_feats[candidate.id] = set()
for f, v in _tablelib_multary_features(spans):
multary_strlib_feats[candidate.id].add((f, v))
if candidate.id not in multinary_tablelib_feats:
multinary_tablelib_feats[candidate.id] = set()
for f, v in _tablelib_multinary_features(spans):
multinary_tablelib_feats[candidate.id].add((f, v))

for f, v in multary_strlib_feats[candidate.id]:
for f, v in multinary_tablelib_feats[candidate.id]:
yield candidate.id, FEAT_PRE + f, v


Expand Down Expand Up @@ -130,11 +130,11 @@ def _tablelib_unary_features(span: SpanMention) -> Iterator[Tuple[str, int]]:
# yield "COL_INFERRED_%s_[%s]" % (attrib.upper(), ngram), DEF_VALUE


def _tablelib_multary_features(
def _tablelib_multinary_features(
spans: Tuple[SpanMention, ...]
) -> Iterator[Tuple[str, int]]:
"""Table-/structure-related features for multiple spans."""
binary_features = settings["featurization"]["tabular"]["binary_features"]
multinary_features = settings["featurization"]["tabular"]["multinary_features"]
span_sentences = [span.sentence for span in spans]
if all([sentence.is_tabular() for sentence in span_sentences]):
span_tables = [sentence.table for sentence in span_sentences]
Expand All @@ -143,26 +143,32 @@ def _tablelib_multary_features(
if all([span.sentence.cell is not None for span in spans]):
row_diff = min_row_diff(
span_sentences,
absolute=binary_features["min_row_diff"]["absolute"],
absolute=multinary_features["min_row_diff"]["absolute"],
)
col_diff = min_col_diff(
span_sentences,
absolute=binary_features["min_col_diff"]["absolute"],
absolute=multinary_features["min_col_diff"]["absolute"],
)
yield f"SAME_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
yield f"SAME_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
yield (
f"SAME_TABLE_MANHATTAN_DIST_[{abs(row_diff) + abs(col_diff)}]"
), DEF_VALUE
span_cells = [sentence.cell for sentence in span_sentences]
if [span_cells[1:] == span_cells[:-1]]:
if span_cells[1:] == span_cells[:-1]:
yield "SAME_CELL", DEF_VALUE
word_diff = spans[0].get_word_start_index() - min(
span.get_word_start_index() for span in spans
word_diff = sum(
[
s1.get_word_start_index() - s2.get_word_start_index()
for s1, s2 in zip(spans[:-1], spans[1:])
]
)
yield (f"WORD_DIFF_[{word_diff}]"), DEF_VALUE
char_diff = spans[0].char_start - min(
span.char_start for span in spans
char_diff = sum(
[
s1.char_start - s2.char_start
for s1, s2 in zip(spans[:-1], spans[1:])
]
)
yield (f"CHAR_DIFF_[{char_diff}]"), DEF_VALUE
if [span_sentences[1:] == span_sentences[:-1]]:
Expand All @@ -172,11 +178,11 @@ def _tablelib_multary_features(
yield "DIFF_TABLE", DEF_VALUE
row_diff = min_row_diff(
span_sentences,
absolute=binary_features["min_row_diff"]["absolute"],
absolute=multinary_features["min_row_diff"]["absolute"],
)
col_diff = min_col_diff(
span_sentences,
absolute=binary_features["min_col_diff"]["absolute"],
absolute=multinary_features["min_col_diff"]["absolute"],
)
yield f"DIFF_TABLE_ROW_DIFF_[{row_diff}]", DEF_VALUE
yield f"DIFF_TABLE_COL_DIFF_[{col_diff}]", DEF_VALUE
Expand Down
2 changes: 1 addition & 1 deletion src/fonduer/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"get_row_ngrams": {"max": 2},
"get_col_ngrams": {"max": 2},
},
"binary_features": {
"multinary_features": {
"min_row_diff": {"absolute": False},
"min_col_diff": {"absolute": False},
},
Expand Down
2 changes: 1 addition & 1 deletion tests/utils/.fonduer-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ featurization:
max: 2
get_col_ngrams:
max: 2
binary_features:
multinary_features:
min_row_diff:
absolute: False
min_col_diff:
Expand Down

0 comments on commit 21a5a1e

Please sign in to comment.