From 3d89c033e68e93ffeee81cb4b8c214cdc97354a1 Mon Sep 17 00:00:00 2001
From: Oscar Dowson
Date: Mon, 15 Sep 2025 11:46:16 +1200
Subject: [PATCH 1/4] [FileFormats.LP] allow newline in term and improve
 keyword identification

---
 src/FileFormats/LP/read.jl | 123 +++++++++++++++++++++++++++----------
 test/FileFormats/LP/LP.jl  |  59 ++++++++++++++++++
 2 files changed, 149 insertions(+), 33 deletions(-)

diff --git a/src/FileFormats/LP/read.jl b/src/FileFormats/LP/read.jl
index 0291d9e87c..6326c69264 100644
--- a/src/FileFormats/LP/read.jl
+++ b/src/FileFormats/LP/read.jl
@@ -280,13 +280,15 @@ A struct that is used to manage state when lexing. It stores:
   error message to the user on a parse error
 * `peek_char`: the next `Char` in the `io`
 * `peek_tokens`: the list of upcoming tokens that we have already peeked
+* `current_token`: the most recent token that we have `read`
 """
 mutable struct _LexerState{O<:IO}
     io::O
     line::Int
     peek_char::Union{Nothing,Char}
     peek_tokens::Vector{_Token}
-    _LexerState(io::IO) = new{typeof(io)}(io, 1, nothing, _Token[])
+    current_token::Union{Nothing,_Token}
+    _LexerState(io::IO) = new{typeof(io)}(io, 1, nothing, _Token[], nothing)
 end
 
 """
@@ -351,6 +353,7 @@ function Base.read(state::_LexerState, ::Type{_Token})
         )
     end
     popfirst!(state.peek_tokens)
+    state.current_token = token
     return token
 end
 
@@ -371,6 +374,16 @@ end
 
 _is_number(c::Char) = isdigit(c) || c in ('.', 'e', 'E', '+', '-')
 
+_nothing_or_newline(::Nothing) = true
+_nothing_or_newline(t::_Token) = t.kind == _TOKEN_NEWLINE
+
+function _prior_token(state::_LexerState)
+    if length(state.peek_tokens) <= 1
+        return state.current_token
+    end
+    return state.peek_tokens[end-1]
+end
+
 function Base.peek(state::_LexerState, ::Type{_Token}, n::Int = 1)
     @assert n >= 1
     while length(state.peek_tokens) < n
@@ -379,22 +392,47 @@ function Base.peek(state::_LexerState, ::Type{_Token}, n::Int = 1)
             return nothing
         end
         push!(state.peek_tokens, token)
-        if _compare_case_insenstive(token, "subject")
+        if token.kind != _TOKEN_IDENTIFIER
+            continue
+        end
+        # Here we have a _TOKEN_IDENTIFIER. If it is preceded by a
+        # _TOKEN_NEWLINE, it may be a _TOKEN_KEYWORD.
+        if !_nothing_or_newline(_prior_token(state))
+            continue # It can't be a keyword
+        end
+        # It might be a _TOKEN_KEYWORD.
+        kw = _case_insenstive_identifier_to_keyword(token.value)
+        if kw !== nothing
+            # The token matches a single word keyword. All keywords are followed
+            # by a new line, or an EOF.
             t = _peek_inner(state)
-            if _compare_case_insenstive(t, "to")
-                state.peek_tokens[end] =
-                    _Token(_TOKEN_KEYWORD, "CONSTRAINTS", token.pos)
-            else
-                push!(state.peek_tokens, t)
+            if _nothing_or_newline(t)
+                state.peek_tokens[end] = _Token(_TOKEN_KEYWORD, kw, token.pos)
             end
-        elseif _compare_case_insenstive(token, "such")
-            t = _peek_inner(state)
-            if _compare_case_insenstive(t, "that")
-                state.peek_tokens[end] =
-                    _Token(_TOKEN_KEYWORD, "CONSTRAINTS", token.pos)
-            else
+            if t !== nothing
                 push!(state.peek_tokens, t)
             end
+            continue
+        end
+        for (a, b) in ["subject" => "to", "such" => "that"]
+            if _compare_case_insenstive(token, a)
+                # This _might_ be `subject to`, or it might just be a variable
+                # named `subject`, like `obj:\n subject\n`.
+                t = _peek_inner(state)
+                if t !== nothing
+                    t2 = _peek_inner(state)
+                    if _compare_case_insenstive(t, b) && _nothing_or_newline(t2)
+                        state.peek_tokens[end] =
+                            _Token(_TOKEN_KEYWORD, "CONSTRAINTS", token.pos)
+                    else
+                        push!(state.peek_tokens, t)
+                    end
+                    if t2 !== nothing
+                        push!(state.peek_tokens, t2)
+                    end
+                end
+                continue
+            end
         end
     end
     return state.peek_tokens[n]
@@ -426,11 +464,7 @@ function _peek_inner(state::_LexerState)
             write(buf, c)
             _ = read(state, Char)
         end
-        val = String(take!(buf))
-        if (kw = _case_insenstive_identifier_to_keyword(val)) !== nothing
-            return _Token(_TOKEN_KEYWORD, kw, pos)
-        end
-        return _Token(_TOKEN_IDENTIFIER, val, pos)
+        return _Token(_TOKEN_IDENTIFIER, String(take!(buf)), pos)
     elseif (op = get(_OPERATORS, c, nothing)) !== nothing
         _ = read(state, Char) # Skip c
         if c == '-' && peek(state, Char) == '>'
@@ -473,6 +507,19 @@ function _skip_newlines(state::_LexerState)
     return
 end
 
+function _next_non_newline(state::_LexerState)
+    n = 1
+    while true
+        t = peek(state, _Token, n)
+        if t === nothing
+            return nothing
+        elseif t.kind != _TOKEN_NEWLINE
+            return t
+        end
+        n += 1
+    end
+end
+
 # IDENTIFIER := "string"
 #
 # There _are_ rules to what an identifier can be. We handle these when lexing.
@@ -605,14 +652,10 @@ function _parse_quad_expression(
             )
         end
     end
-    while _next_token_is(state, _TOKEN_NEWLINE)
-        if _next_token_is(state, _TOKEN_KEYWORD, 2)
-            break
-        end
-        _ = read(state, _Token, _TOKEN_NEWLINE)
-    end
-    if _next_token_is(state, _TOKEN_DIVISION)
-        _ = read(state, _Token) # /
+    t = _next_non_newline(state)
+    if t.kind == _TOKEN_DIVISION
+        _skip_newlines(state)
+        _ = read(state, _Token, _TOKEN_DIVISION) # /
         # Must be /2
         n = read(state, _Token, _TOKEN_NUMBER)
         if n.value != "2"
@@ -634,10 +677,11 @@ end
 end
 
 # TERM :=
-#   "+" TERM
+#   [\n*] TERM
+#   | "+" TERM
 #   | "-" TERM
-#   | NUMBER
 #   | IDENTIFIER
+#   | NUMBER
 #   | NUMBER IDENTIFIER
 #   | NUMBER "*" IDENTIFIER
 #   | QUADRATIC_EXPRESSION
@@ -670,12 +714,25 @@ function _parse_term(
         _ = read(state, _Token, _TOKEN_MULTIPLICATION)
         x = _parse_variable(state, cache)
         return MOI.ScalarAffineTerm(coef, x)
-    elseif _next_token_is(state, _TOKEN_NEWLINE) ||
-           _next_token_is(state, _TOKEN_ADDITION) ||
-           _next_token_is(state, _TOKEN_SUBTRACTION)
-        # NUMBER
-        return coef
+    elseif _next_token_is(state, _TOKEN_NEWLINE)
+        # This could either be NUMBER \nEND-OF-TERM, or it could be a term
+        # split by a new line, like `2\nx`.
+        t = _next_non_newline(state)
+        if t.kind == _TOKEN_MULTIPLICATION
+            # NUMBER \n * [\n] IDENTIFIER
+            _skip_newlines(state)
+            _ = read(state, _Token, _TOKEN_MULTIPLICATION)
+            _skip_newlines(state)
+            x = _parse_variable(state, cache)
+            return MOI.ScalarAffineTerm(coef, x)
+        elseif t.kind == _TOKEN_IDENTIFIER
+            # NUMBER \n IDENTIFIER
+            x = _parse_variable(state, cache)
+            return MOI.ScalarAffineTerm(coef, x)
+        end
+        # NUMBER
+        return coef
     elseif _next_token_is(state, _TOKEN_OPEN_BRACKET)
         # QUADRATIC_EXPRESSION
         return _parse_quad_expression(state, cache, prefix)
diff --git a/test/FileFormats/LP/LP.jl b/test/FileFormats/LP/LP.jl
index 1dc33db5f9..a0b9a0b619 100644
--- a/test/FileFormats/LP/LP.jl
+++ b/test/FileFormats/LP/LP.jl
@@ -1554,6 +1554,65 @@ function test_new_line_edge_case_fails()
     return
 end
 
+function test_parse_keyword_edge_cases_identifier_is_keyword()
+    for name in ["max", "min", "st", "such", "bounds", "obj", "free"]
+        io = IOBuffer("""
+        maximize
+        obj: $name
+        subject to
+        $name <= 1
+        bounds
+        $name free
+        end
+        """)
+        seekstart(io)
+        model = LP.Model()
+        MOI.read!(io, model)
+        x = only(MOI.get(model, MOI.ListOfVariableIndices()))
+        @test MOI.get(model, MOI.VariableName(), x) == name
+    end
+    return
+end
+
+function test_parse_keyword_subject_to_errors()
+    for line in ["subject", "subject too", "subject to a:"]
+        io = IOBuffer("""
+        maximize
+        obj: x
+        $line
+        x <= 1
+        bounds
+        x free
+        end
+        """)
+        seekstart(io)
+        model = LP.Model()
+        @test_throws LP.ParseError MOI.read!(io, model)
+    end
+    return
+end
+
+function test_parse_keyword_subject_to_errors()
+    for obj in ["2 x", "\n2 x", "2\nx", "2*\nx", "2\n*x", "2\n\n*\n\n\nx\n"]
+        io = IOBuffer("""
+        maximize
+        obj: $obj
+        subject to
+        bounds
+        x free
+        end
+        """)
+        seekstart(io)
+        model = LP.Model()
+        MOI.read!(io, model)
+        x = MOI.get(model, MOI.VariableIndex, "x")
+        f = 2.0 * x
+        g = MOI.get(model, MOI.ObjectiveFunction{typeof(f)}())
+        @test isapprox(f, g)
+    end
+    return
+end
+
 end # module
 
 TestLP.runtests()
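Note on PATCH 1: the single-word keyword rule it implements can be sketched in
isolation. The following is a simplified, hypothetical illustration only; the
token representation, the `KEYWORDS` set, and the `is_keyword_at` helper are
stand-ins, not the API used in read.jl:

    # An identifier is only promoted to a keyword when it starts a line (it is
    # preceded by a newline or by the start of the file) and also ends one (it
    # is followed by a newline or by the end of the file).
    const KEYWORDS = Set(["maximize", "minimize", "st", "bounds", "free", "end"])

    # `tokens` is a vector of strings in which "\n" stands for a newline token.
    function is_keyword_at(tokens::Vector{String}, i::Int)
        lowercase(tokens[i]) in KEYWORDS || return false
        starts_line = i == 1 || tokens[i-1] == "\n"
        ends_line = i == length(tokens) || tokens[i+1] == "\n"
        return starts_line && ends_line
    end

    toks = ["obj:", "bounds", "\n", "bounds", "\n", "x", "free", "\n"]
    @assert !is_keyword_at(toks, 2)  # mid-line: a variable named `bounds`
    @assert is_keyword_at(toks, 4)   # on its own line: the BOUNDS section

Under this rule an LP file may use `bounds`, `st`, `free`, and so on as
variable names, which is what the new
`test_parse_keyword_edge_cases_identifier_is_keyword` above exercises.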
From 002fd04d497799a819f99b6dab1f0bdb2c825535 Mon Sep 17 00:00:00 2001
From: Oscar Dowson
Date: Mon, 15 Sep 2025 12:08:11 +1200
Subject: [PATCH 2/4] Update

---
 test/FileFormats/LP/LP.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/FileFormats/LP/LP.jl b/test/FileFormats/LP/LP.jl
index a0b9a0b619..dc1db045a5 100644
--- a/test/FileFormats/LP/LP.jl
+++ b/test/FileFormats/LP/LP.jl
@@ -1592,7 +1592,7 @@ function test_parse_keyword_subject_to_errors()
     return
 end
 
-function test_parse_keyword_subject_to_errors()
+function test_parse_newline_in_objective_expression()
     for obj in ["2 x", "\n2 x", "2\nx", "2*\nx", "2\n*x", "2\n\n*\n\n\nx\n"]
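Before the keyword rework in PATCH 3 below, it is worth seeing the intended
behavior end-to-end. A usage sketch, assuming only that MathOptInterface is
installed; it mirrors `test_parse_keyword_edge_cases_identifier_is_keyword`
above with `name = "such"`:

    import MathOptInterface as MOI
    const LP = MOI.FileFormats.LP

    # `such` is only a keyword when it starts a line and is followed by
    # `that`; everywhere else it is an ordinary variable name.
    io = IOBuffer("""
    maximize
    obj: such
    subject to
    such <= 1
    bounds
    such free
    end
    """)
    seekstart(io)
    model = LP.Model()
    MOI.read!(io, model)
    x = only(MOI.get(model, MOI.ListOfVariableIndices()))
    @assert MOI.get(model, MOI.VariableName(), x) == "such"

PATCH 3 reworks exactly this lookahead: `subject` or `such` is only promoted
to the CONSTRAINTS keyword when it is followed by `to` or `that` and then by a
newline or EOF.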
From 0cbd79ee81ba085c4f6dedacf9e37ce44523718b Mon Sep 17 00:00:00 2001
From: Oscar Dowson
Date: Mon, 15 Sep 2025 12:18:43 +1200
Subject: [PATCH 3/4] Update

---
 src/FileFormats/LP/read.jl | 53 +++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/FileFormats/LP/read.jl b/src/FileFormats/LP/read.jl
index 6326c69264..96b852974d 100644
--- a/src/FileFormats/LP/read.jl
+++ b/src/FileFormats/LP/read.jl
@@ -395,13 +395,13 @@ function Base.peek(state::_LexerState, ::Type{_Token}, n::Int = 1)
         if token.kind != _TOKEN_IDENTIFIER
             continue
         end
-        # Here we have a _TOKEN_IDENTIFIER. If it is preceded by a
-        # _TOKEN_NEWLINE, it may be a _TOKEN_KEYWORD.
+        # Here we have a _TOKEN_IDENTIFIER. But if it is not preceded by a
+        # _TOKEN_NEWLINE, it cannot be a _TOKEN_KEYWORD.
         if !_nothing_or_newline(_prior_token(state))
-            continue # It can't be a keyword
+            continue
         end
         # It might be a _TOKEN_KEYWORD.
-        kw = _case_insenstive_identifier_to_keyword(token.value)
+        (kw = _case_insenstive_identifier_to_keyword(token.value))
         if kw !== nothing
             # The token matches a single word keyword. All keywords are followed
             # by a new line, or an EOF.
@@ -414,25 +414,36 @@ function Base.peek(state::_LexerState, ::Type{_Token}, n::Int = 1)
             end
             continue
         end
-        for (a, b) in ["subject" => "to", "such" => "that"]
-            if _compare_case_insenstive(token, a)
-                # This _might_ be `subject to`, or it might just be a variable
-                # named `subject`, like `obj:\n subject\n`.
-                t = _peek_inner(state)
-                if t !== nothing
-                    t2 = _peek_inner(state)
-                    if _compare_case_insenstive(t, b) && _nothing_or_newline(t2)
-                        state.peek_tokens[end] =
-                            _Token(_TOKEN_KEYWORD, "CONSTRAINTS", token.pos)
-                    else
-                        push!(state.peek_tokens, t)
-                    end
-                    if t2 !== nothing
-                        push!(state.peek_tokens, t2)
-                    end
-                end
+        # There are two keywords that contain whitespace: `subject to` and
+        # `such that`.
+        for (a, b) in ("subject" => "to", "such" => "that")
+            if !_compare_case_insenstive(token, a)
                 continue
             end
+            # This _might_ be `subject to`, or it might just be a variable
+            # named `subject`, like `obj:\n subject\n`.
+            token_b = _peek_inner(state)
+            if token_b === nothing
+                # The next token is EOF. Nothing to do here.
+                break
+            elseif !_compare_case_insenstive(token_b, b)
+                # The second token doesn't match. Store `token_b` and break.
+                push!(state.peek_tokens, token_b)
+                break
+            end
+            # We have something that matches (a, b), but a _TOKEN_KEYWORD needs
+            # to be followed by a new line.
+            token_nl = _peek_inner(state)
+            if _nothing_or_newline(token_nl)
+                state.peek_tokens[end] =
+                    _Token(_TOKEN_KEYWORD, "CONSTRAINTS", token.pos)
+            else
+                push!(state.peek_tokens, token_b)
+            end
+            if token_nl !== nothing
+                push!(state.peek_tokens, token_nl)
+            end
+            break
         end
     end
     return state.peek_tokens[n]

From 0d1ba918cd0bba411a5f53141d178f6977d39bf5 Mon Sep 17 00:00:00 2001
From: Oscar Dowson
Date: Mon, 15 Sep 2025 13:13:19 +1200
Subject: [PATCH 4/4] Update

---
 src/FileFormats/LP/read.jl |  7 +++++--
 test/FileFormats/LP/LP.jl  | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/FileFormats/LP/read.jl b/src/FileFormats/LP/read.jl
index 96b852974d..f3101ca8d8 100644
--- a/src/FileFormats/LP/read.jl
+++ b/src/FileFormats/LP/read.jl
@@ -664,7 +664,7 @@ function _parse_quad_expression(
         end
     end
     t = _next_non_newline(state)
-    if t.kind == _TOKEN_DIVISION
+    if t !== nothing && t.kind == _TOKEN_DIVISION
         _skip_newlines(state)
         _ = read(state, _Token, _TOKEN_DIVISION) # /
@@ -729,7 +729,10 @@ function _parse_term(
         # This could either be NUMBER \nEND-OF-TERM, or it could be a term
         # split by a new line, like `2\nx`.
t = _next_non_newline(state) - if t.kind == _TOKEN_MULTIPLICATION + if t === nothing + # NUMBER + return coef + elseif t.kind == _TOKEN_MULTIPLICATION # NUMBER \n * [\n] IDENTIFIER _skip_newlines(state) _ = read(state, _Token, _TOKEN_MULTIPLICATION) diff --git a/test/FileFormats/LP/LP.jl b/test/FileFormats/LP/LP.jl index dc1db045a5..3d1a788958 100644 --- a/test/FileFormats/LP/LP.jl +++ b/test/FileFormats/LP/LP.jl @@ -1613,6 +1613,40 @@ function test_parse_newline_in_objective_expression() return end +function test_parse_subject_eof() + io = IOBuffer("maximize\nobj:\nsubject") + seekstart(io) + model = LP.Model() + MOI.read!(io, model) + x = MOI.get(model, MOI.VariableIndex, "subject") + @test x isa MOI.VariableIndex + return +end + +function test_parse_expr_eof() + io = IOBuffer("maximize\nobj: x + 2\n") + seekstart(io) + model = LP.Model() + MOI.read!(io, model) + x = MOI.get(model, MOI.VariableIndex, "x") + f = 1.0 * x + 2.0 + g = MOI.get(model, MOI.ObjectiveFunction{typeof(f)}()) + @test isapprox(f, g) + return +end + +function test_parse_quadratic_expr_eof() + io = IOBuffer("maximize\nobj: [x * x]\n") + seekstart(io) + model = LP.Model() + MOI.read!(io, model) + x = MOI.get(model, MOI.VariableIndex, "x") + f = 1.0 * x * x + g = MOI.get(model, MOI.ObjectiveFunction{typeof(f)}()) + @test isapprox(f, g) + return +end + end # module TestLP.runtests()
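Finally, a usage sketch of the newline-in-term behavior that this series
enables, mirroring `test_parse_newline_in_objective_expression` above. It
assumes only that MathOptInterface is installed; everything else is taken
from the tests:

    import MathOptInterface as MOI
    const LP = MOI.FileFormats.LP

    # `2\nx` is a single term split across a newline: the objective is 2x.
    io = IOBuffer("maximize\nobj: 2\nx\nsubject to\nbounds\nx free\nend\n")
    seekstart(io)
    model = LP.Model()
    MOI.read!(io, model)
    x = MOI.get(model, MOI.VariableIndex, "x")
    f = 2.0 * x
    g = MOI.get(model, MOI.ObjectiveFunction{typeof(f)}())
    @assert isapprox(f, g)

Previously a newline always ended a term, so `2\nx` did not parse as the
single term `2 x`.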