-
Notifications
You must be signed in to change notification settings - Fork 29
Add fact parser validation, tests, and bulk fact loading with tests #100
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
972b263
e498f4a
0e2e395
cf592e2
0908471
5a78cea
83a2452
ee0ea04
f1395dc
0e3db89
a402321
d1cb309
b903281
cad2d95
3eecf32
05b3748
0c79988
b1ea06c
01a20a4
42d54e2
f4fbcb0
3ba07ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,40 +1,163 @@ | ||
| import pyreason.scripts.numba_wrapper.numba_types.interval_type as interval | ||
| import re | ||
|
|
||
|
|
||
| # Input validation work was implemented with the help of Claude Sonnet 4.5. | ||
| def parse_fact(fact_text): | ||
| # Validate input is not empty or whitespace only | ||
| if not fact_text or not fact_text.strip(): | ||
| raise ValueError("Fact text cannot be empty or whitespace only") | ||
|
|
||
| f = fact_text.replace(' ', '') | ||
|
|
||
| # Check for multiple colons | ||
| colon_count = f.count(':') | ||
| if colon_count > 1: | ||
| raise ValueError(f"Fact text contains multiple colons ({colon_count}), expected at most 1") | ||
|
|
||
| # Check for double negation | ||
| if f.startswith('~~'): | ||
| raise ValueError("Double negation is not allowed") | ||
|
|
||
| # Separate into predicate-component and bound. If there is no bound it means it's true | ||
| negate_interval = False | ||
| if ':' in f: | ||
| pred_comp, bound = f.split(':') | ||
| parts = f.split(':') | ||
| if len(parts) != 2: | ||
| raise ValueError("Invalid fact format: expected at most one colon separator") | ||
| pred_comp, bound = parts | ||
|
|
||
| # Check for negation with explicit bound | ||
| if pred_comp.startswith('~'): | ||
| pred_comp = pred_comp[1:] | ||
| if bound.lower() == 'true': | ||
| bound = 'False' | ||
| elif bound.lower() == 'false': | ||
| bound = 'True' | ||
| else: | ||
| negate_interval = True | ||
| else: | ||
| pred_comp = f | ||
| if pred_comp[0] == '~': | ||
| if pred_comp.startswith('~'): | ||
| bound = 'False' | ||
| pred_comp = pred_comp[1:] | ||
| else: | ||
| bound = 'True' | ||
|
|
||
| # Check if bound is a boolean or a list of floats | ||
| bound = bound.lower() | ||
| if bound == 'true': | ||
| bound = interval.closed(1, 1) | ||
| elif bound == 'false': | ||
| bound = interval.closed(0, 0) | ||
| else: | ||
| bound = [float(b) for b in bound[1:-1].split(',')] | ||
| bound = interval.closed(*bound) | ||
| # Validate predicate-component is not empty | ||
| if not pred_comp: | ||
| raise ValueError("Predicate-component cannot be empty") | ||
|
|
||
| # Validate parentheses exist and are properly formed | ||
| if '(' not in pred_comp: | ||
| raise ValueError("Missing opening parenthesis in fact") | ||
| if ')' not in pred_comp: | ||
| raise ValueError("Missing closing parenthesis in fact") | ||
|
|
||
| # Check for nested or multiple parentheses | ||
| open_count = pred_comp.count('(') | ||
| close_count = pred_comp.count(')') | ||
| if open_count != 1 or close_count != 1: | ||
| raise ValueError(f"Invalid parentheses: found {open_count} '(' and {close_count} ')', expected exactly 1 of each") | ||
|
|
||
| # Check parentheses are in correct order | ||
| open_idx = pred_comp.find('(') | ||
| close_idx = pred_comp.find(')') | ||
| if open_idx >= close_idx: | ||
| raise ValueError("Invalid parentheses order: '(' must come before ')'") | ||
|
|
||
| # Check closing parenthesis is at the end | ||
| if close_idx != len(pred_comp) - 1: | ||
| raise ValueError("Closing parenthesis must be at the end of predicate-component") | ||
|
|
||
| # Split the predicate and component | ||
| idx = pred_comp.find('(') | ||
| pred = pred_comp[:idx] | ||
| component = pred_comp[idx + 1:-1] | ||
|
|
||
| # Validate predicate is not empty | ||
| if not pred: | ||
| raise ValueError("Predicate cannot be empty") | ||
|
|
||
| # Validate predicate contains only valid characters (alphanumeric and underscore) | ||
| # Predicates must start with a letter or underscore (like Python identifiers) | ||
| if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', pred): | ||
| if pred[0].isdigit(): | ||
| raise ValueError(f"Predicate '{pred}' cannot start with a digit. Must start with a letter or underscore") | ||
| else: | ||
| raise ValueError(f"Predicate '{pred}' contains invalid characters. Only letters, digits, and underscores allowed, must start with letter or underscore") | ||
|
|
||
| # Validate component is not empty | ||
| if not component: | ||
| raise ValueError("Component cannot be empty") | ||
|
|
||
| # Check for invalid characters in component | ||
| if '(' in component or ')' in component: | ||
| raise ValueError("Component cannot contain parentheses") | ||
| if ':' in component: | ||
| raise ValueError("Component cannot contain colons") | ||
|
|
||
| # Check if it is a node or edge fact | ||
| if ',' in component: | ||
| fact_type = 'edge' | ||
| component = tuple(component.split(',')) | ||
| components = component.split(',') | ||
|
|
||
| # Validate exactly 2 components for edges | ||
| if len(components) != 2: | ||
| raise ValueError(f"Edge facts must have exactly 2 components, found {len(components)}") | ||
|
|
||
| # Validate no empty components | ||
| for i, comp in enumerate(components): | ||
| if not comp: | ||
| raise ValueError(f"Component {i+1} in edge fact cannot be empty") | ||
|
|
||
| component = tuple(components) | ||
| else: | ||
| fact_type = 'node' | ||
|
|
||
| # Check if bound is a boolean or a list of floats | ||
| if bound.lower() == 'true': | ||
| bound = interval.closed(1, 1) | ||
| elif bound.lower() == 'false': | ||
| bound = interval.closed(0, 0) | ||
| else: | ||
| # Validate interval format | ||
| if not bound.startswith('['): | ||
| raise ValueError(f"Invalid bound format: expected '[' at start of interval, got '{bound[0] if bound else 'empty'}'") | ||
| if not bound.endswith(']'): | ||
| raise ValueError(f"Invalid bound format: expected ']' at end of interval, got '{bound[-1] if bound else 'empty'}'") | ||
|
|
||
| # Extract values between brackets | ||
| interval_content = bound[1:-1] | ||
| if not interval_content: | ||
| raise ValueError("Interval cannot be empty") | ||
|
|
||
| # Parse float values | ||
| parts = interval_content.split(',') | ||
| if len(parts) != 2: | ||
| raise ValueError(f"Interval must have exactly 2 values, found {len(parts)}") | ||
|
|
||
| try: | ||
| bound_values = [float(b) for b in parts] | ||
| except ValueError as e: | ||
| raise ValueError(f"Invalid interval values: {e}") | ||
|
|
||
| lower, upper = bound_values | ||
| # Validate bounds are in valid range [0, 1] | ||
| if lower < 0 or lower > 1: | ||
| raise ValueError(f"Interval lower bound {lower} is out of valid range [0, 1]") | ||
| if upper < 0 or upper > 1: | ||
| raise ValueError(f"Interval upper bound {upper} is out of valid range [0, 1]") | ||
|
|
||
| # Validate lower <= upper | ||
| if lower > upper: | ||
| raise ValueError(f"Interval lower bound {lower} cannot be greater than upper bound {upper}") | ||
|
|
||
| # We calculate ~[l,u] = [1-u, 1-l] | ||
| # Round to eliminate floating point precision errors (e.g., 1 - 0.8 = 0.19999999...) | ||
| if negate_interval: | ||
| lower, upper = round(1 - upper, 10), round(1 - lower, 10) | ||
|
|
||
| bound = interval.closed(lower, upper) | ||
|
|
||
| return pred, component, bound, fact_type | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| fact_text,name,start_time,end_time,static | ||
| Viewed(Zach),seen-fact-zach,0,3,False | ||
| Viewed(Justin),seen-fact-justin,0,3,true | ||
| Viewed(Michelle),seen-fact-michelle,1,3,FALSE | ||
| Viewed(Amy),seen-fact-amy,2,3,0 | ||
| "HaveAccess(Zach,TextMessage)",access-zach,0,5,True | ||
| "HaveAccess(Justin,TextMessage)",access-justin,0,5,1 | ||
| "HaveAccess(Michelle,TextMessage)",access-michelle,0,5,yes | ||
| "HaveAccess(Amy,TextMessage)",access-amy,0,5,no | ||
| "Processed(Node1):[0.5,0.8]",interval-node,0,10,False | ||
| "Knows(Person1,Person2):[0.7,0.9]",interval-edge,0,10,True | ||
| Viewed(Valid),valid-fact,0,3,False | ||
| ,empty-fact-text,0,3,False | ||
| InvalidSyntax,bad-syntax,0,3,False | ||
| "Viewed(OutOfRange):[2.5,3.0]",out-of-range,0,3,False | ||
| Viewed(BadStartTime),bad-start,abc,5,False | ||
| Viewed(BadEndTime),bad-end,0,xyz,False | ||
| Viewed(BadStaticValue),bad-static,0,5,invalid | ||
| Viewed(EmptyOptionals),,,, |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,121 @@ | ||
| [ | ||
| { | ||
| "fact_text": "Viewed(Zach)", | ||
| "name": "seen-fact-zach", | ||
| "start_time": 0, | ||
| "end_time": 3, | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(Justin)", | ||
| "name": "seen-fact-justin", | ||
| "start_time": 0, | ||
| "end_time": 3, | ||
| "static": true | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(Michelle)", | ||
| "name": "seen-fact-michelle", | ||
| "start_time": 1, | ||
| "end_time": 3, | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(Amy)", | ||
| "name": "seen-fact-amy", | ||
| "start_time": 2, | ||
| "end_time": 3, | ||
| "static": 0 | ||
| }, | ||
| { | ||
| "fact_text": "HaveAccess(Zach,TextMessage)", | ||
| "name": "access-zach", | ||
| "start_time": 0, | ||
| "end_time": 5, | ||
| "static": true | ||
| }, | ||
| { | ||
| "fact_text": "HaveAccess(Justin,TextMessage)", | ||
| "name": "access-justin", | ||
| "start_time": 0, | ||
| "end_time": 5, | ||
| "static": 1 | ||
| }, | ||
| { | ||
| "fact_text": "HaveAccess(Michelle,TextMessage)", | ||
| "name": "access-michelle", | ||
| "start_time": 0, | ||
| "end_time": 5, | ||
| "static": "yes" | ||
| }, | ||
| { | ||
| "fact_text": "HaveAccess(Amy,TextMessage)", | ||
| "name": "access-amy", | ||
| "start_time": 0, | ||
| "end_time": 5, | ||
| "static": "no" | ||
| }, | ||
| { | ||
| "fact_text": "Processed(Node1):[0.5,0.8]", | ||
| "name": "interval-node" | ||
| }, | ||
| { | ||
| "fact_text": "Knows(Person1,Person2):[0.7,0.9]", | ||
| "name": "interval-edge", | ||
| "start_time": 0, | ||
| "end_time": 10, | ||
| "static": true | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(Valid)", | ||
| "name": "valid-fact", | ||
| "start_time": 0, | ||
| "end_time": 3, | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "", | ||
| "name": "empty-fact-text", | ||
| "start_time": 0, | ||
| "end_time": 3, | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "InvalidSyntax", | ||
| "name": "bad-syntax", | ||
| "start_time": 0, | ||
| "end_time": 3, | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(OutOfRange):[2.5,3.0]", | ||
| "name": "out-of-range", | ||
| "start_time": 0, | ||
| "end_time": 3, | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(BadStartTime)", | ||
| "name": "bad-start", | ||
| "start_time": "abc", | ||
| "end_time": 5, | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(BadEndTime)", | ||
| "name": "bad-end", | ||
| "start_time": 0, | ||
| "end_time": "xyz", | ||
| "static": false | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(BadStaticValue)", | ||
| "name": "bad-static", | ||
| "start_time": 0, | ||
| "end_time": 5, | ||
| "static": "invalid" | ||
| }, | ||
| { | ||
| "fact_text": "Viewed(EmptyOptionals)" | ||
| } | ||
| ] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add cases where bounds are specified. Edge cases where bounds are wrong - non-numeric or outside [0,1] or lower > upper.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These checks are a part of the fact parser updates in the PR here. The load_from_json() validates that the csv inputs are integer values - see my response to the comment you left above. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| Viewed(Alice),fact-alice,0,5,False | ||
| Viewed(Bob),fact-bob,1,5,False | ||
| "Connected(Alice,Bob):[0.7,0.9]",connection-fact,0,10,True | ||
| ,empty-fact,0,5,False | ||
| InvalidNoParens,bad-fact,0,5,True | ||
| Viewed(Charlie),good-fact,bad-time,5,False |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@dyumanaditya Looks good to me, can you check this once?