Skip to content
This repository has been archived by the owner on Feb 26, 2024. It is now read-only.

Commit

Permalink
Merge pull request #4 from HParker/convert-byte-offset-to-char-offset
Browse files Browse the repository at this point in the history
assure token encoding is correct and locations are in characters, not bytes
  • Loading branch information
kddnewton committed Dec 1, 2023
2 parents bdef470 + 0586e26 commit bd00146
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 23 deletions.
10 changes: 8 additions & 2 deletions lib/parser/prism.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def try_declare_numparam(node)
private

# Builds the parser-gem AST by walking the Prism program node with a
# Compiler. The shared offset_cache is passed so every node location is
# reported in character offsets rather than byte offsets.
def build_ast(program)
  program.accept(Compiler.new(self, offset_cache))
end

def build_comments(comments)
Expand All @@ -89,7 +89,13 @@ def build_comments(comments)
end

# Converts the raw Prism token list into the token tuples the parser gem
# expects, passing the offset_cache so the Lexer can translate byte
# offsets into character offsets.
def build_tokens(tokens)
  Lexer.new(source_buffer, tokens.map(&:first), offset_cache).to_a
end

def offset_cache
@offset_cache ||= Hash.new do |h, k|
h[k] = @source_buffer.source.byteslice(0, k).length
end
end
end
end
Expand Down
15 changes: 8 additions & 7 deletions lib/parser/prism/compiler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
module Parser
class Prism
class Compiler < ::Prism::Compiler
attr_reader :parser, :builder, :source_buffer
attr_reader :parser, :builder, :source_buffer, :offset_cache
attr_reader :locals, :in_destructure, :in_pattern

def initialize(parser, locals: nil, in_destructure: false, in_pattern: false)
def initialize(parser, offset_cache, locals: nil, in_destructure: false, in_pattern: false)
@parser = parser
@builder = parser.builder
@source_buffer = parser.source_buffer
@offset_cache = offset_cache

@locals = locals
@in_destructure = in_destructure
Expand Down Expand Up @@ -1560,7 +1561,7 @@ def visit_yield_node(node)
private

# Creates a child compiler that shares this compiler's parser and
# offset_cache but may override the scoping flags (used when descending
# into blocks, destructuring targets, and patterns).
def copy_compiler(locals: self.locals, in_destructure: self.in_destructure, in_pattern: self.in_pattern)
  Compiler.new(parser, offset_cache, locals: locals, in_destructure: in_destructure, in_pattern: in_pattern)
end

# Blocks can have a special set of parameters that automatically expand
Expand All @@ -1579,12 +1580,12 @@ def procarg0?(parameters)

# Constructs a new source range from the given location's start and end
# byte offsets, translated into character offsets via offset_cache.
# Returns nil when the location is nil.
def srange(location)
  Source::Range.new(source_buffer, offset_cache[location.start_offset], offset_cache[location.end_offset]) if location
end

# Constructs a new source range from the given start and end byte
# offsets, translated into character offsets via offset_cache.
def srange_offsets(start_offset, end_offset)
  Source::Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
end

# Constructs a new source range by finding the given tokens between the
# Constructs a new source range by finding the first of the given tokens
# that appears between the start and end byte offsets. Returns the token
# together with a character-offset source range, or nil when no token is
# found.
def srange_find(start_offset, end_offset, tokens)
  tokens.find do |token|
    next unless (index = source_buffer.source.byteslice(start_offset...end_offset).index(token))
    offset = start_offset + index
    return [token, Source::Range.new(source_buffer, offset_cache[offset], offset_cache[offset + token.length])]
  end
end

# Transforms a Prism location into the [value, range] token pair the
# parser gem expects, with the range expressed in character offsets.
# Returns nil when the location is nil.
def token(location)
  [location.slice, Source::Range.new(source_buffer, offset_cache[location.start_offset], offset_cache[location.end_offset])] if location
end

# Visit a block node on a call.
Expand Down
29 changes: 15 additions & 14 deletions lib/parser/prism/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ class Lexer

private_constant :TYPES

# The source buffer, the prelexed Prism tokens, and the shared
# byte-offset -> character-offset cache.
attr_reader :buffer, :lexed, :offset_cache

# @param buffer       the parser-gem source buffer being lexed
# @param lexed        the list of tokens produced by Prism's lexer
# @param offset_cache the shared byte-offset -> character-offset cache
def initialize(buffer, lexed, offset_cache)
  @buffer = buffer
  @lexed = lexed
  @offset_cache = offset_cache
end

def to_a
Expand All @@ -195,24 +196,24 @@ def to_a

type = TYPES.fetch(token.type)
value = token.value
location = Source::Range.new(buffer, token.location.start_offset, token.location.end_offset)
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])

case type
when :tCHARACTER
value.delete_prefix!("?")
when :tCOMMENT
if token.type == :EMBDOC_BEGIN
until (next_token = lexed[index]) && next_token.type == :EMBDOC_END
value += next_token.location.slice
value += next_token.value
index += 1
end

value += lexed[index].location.slice
location = Source::Range.new(buffer, token.location.start_offset, lexed[index].location.end_offset)
value += next_token.value
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index].location.end_offset])
index += 1
else
value.chomp!
location = Source::Range.new(buffer, token.location.start_offset, token.location.end_offset - 1)
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
end
when :tNL
value = nil
Expand All @@ -223,7 +224,7 @@ def to_a
value = Complex(0, value.end_with?("r") ? Rational(value.chomp("r")) : value)
when :tINTEGER
if value.start_with?("+")
tokens << [:tUNARY_NUM, ["+", Source::Range.new(buffer, token.location.start_offset, token.location.start_offset + 1)]]
tokens << [:tUNARY_NUM, ["+", Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
location = Source::Range.new(buffer, token.location.start_offset + 1, token.location.end_offset)
end

Expand All @@ -245,13 +246,13 @@ def to_a
next_location = token.location.join(next_token.location)
type = :tSTRING
value = ""
location = Source::Range.new(buffer, next_location.start_offset, next_location.end_offset)
location = Source::Range.new(buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 1
elsif ["\"", "'"].include?(value) && (next_token = lexed[index]) && next_token.type == :STRING_CONTENT && (next_next_token = lexed[index + 1]) && next_next_token.type == :STRING_END
next_location = token.location.join(next_next_token.location)
type = :tSTRING
value = next_token.location.slice
location = Source::Range.new(buffer, next_location.start_offset, next_location.end_offset)
value = next_token.value
location = Source::Range.new(buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 2
elsif value.start_with?("<<")
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
Expand All @@ -262,13 +263,13 @@ def to_a
when :tSTRING_END
if token.type == :REGEXP_END
value = value[0]
location = Source::Range.new(buffer, token.location.start_offset, token.location.start_offset + 1)
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
end
when :tSYMBEG
if (next_token = lexed[index]) && next_token.type != :STRING_CONTENT
next_location = token.location.join(next_token.location)
type = :tSYMBOL
value = next_token.location.slice
value = next_token.value
location = Source::Range.new(buffer, next_location.start_offset, next_location.end_offset)
index += 1
end
Expand All @@ -281,7 +282,7 @@ def to_a
tokens << [type, [value, location]]

if token.type == :REGEXP_END
tokens << [:tREGEXP_OPT, [token.location.slice[1..], Source::Range.new(buffer, token.location.start_offset + 1, token.location.end_offset)]]
tokens << [:tREGEXP_OPT, [token.value[1..], Source::Range.new(buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
end
end

Expand Down
24 changes: 24 additions & 0 deletions test/fixtures/multibyte.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Fixture exercising multibyte (emoji) source text, so token and node
# locations must be reported in characters rather than bytes.
# credits to https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt

a = "😍"
b = "πŸ‘©πŸ½"
c = "πŸ‘¨β€πŸ¦° πŸ‘¨πŸΏβ€πŸ¦° πŸ‘¨β€πŸ¦± πŸ‘¨πŸΏβ€πŸ¦± πŸ¦ΉπŸΏβ€β™‚οΈ"
d = "πŸ‘Ύ πŸ™‡ πŸ’ πŸ™… πŸ™† πŸ™‹ πŸ™Ž πŸ™"
e = "🐡 πŸ™ˆ πŸ™‰ πŸ™Š"
f = "❀️ πŸ’” πŸ’Œ πŸ’• πŸ’ž πŸ’“ πŸ’— πŸ’– πŸ’˜ πŸ’ πŸ’Ÿ πŸ’œ πŸ’› πŸ’š πŸ’™"
g = "βœ‹πŸΏ πŸ’ͺ🏿 πŸ‘πŸΏ πŸ™ŒπŸΏ πŸ‘πŸΏ πŸ™πŸΏ"
h = "πŸ‘¨β€πŸ‘©β€πŸ‘¦ πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦ πŸ‘¨β€πŸ‘¨β€πŸ‘¦ πŸ‘©β€πŸ‘©β€πŸ‘§ πŸ‘¨β€πŸ‘¦ πŸ‘¨β€πŸ‘§β€πŸ‘¦ πŸ‘©β€πŸ‘¦ πŸ‘©β€πŸ‘§β€πŸ‘¦"
i = "🚾 πŸ†’ πŸ†“ πŸ†• πŸ†– πŸ†— πŸ†™ 🏧"
j = "0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ πŸ”Ÿ"


# The same strings again, as regular expression literals.
ra = /😍/
rb = /πŸ‘©πŸ½/
rc = /πŸ‘¨β€πŸ¦° πŸ‘¨πŸΏβ€πŸ¦° πŸ‘¨β€πŸ¦± πŸ‘¨πŸΏβ€πŸ¦± πŸ¦ΉπŸΏβ€β™‚οΈ/
rd = /πŸ‘Ύ πŸ™‡ πŸ’ πŸ™… πŸ™† πŸ™‹ πŸ™Ž πŸ™/
re = /🐡 πŸ™ˆ πŸ™‰ πŸ™Š/
rf = /❀️ πŸ’” πŸ’Œ πŸ’• πŸ’ž πŸ’“ πŸ’— πŸ’– πŸ’˜ πŸ’ πŸ’Ÿ πŸ’œ πŸ’› πŸ’š πŸ’™/
rg = /βœ‹πŸΏ πŸ’ͺ🏿 πŸ‘πŸΏ πŸ™ŒπŸΏ πŸ‘πŸΏ πŸ™πŸΏ/
rh = /πŸ‘¨β€πŸ‘©β€πŸ‘¦ πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦ πŸ‘¨β€πŸ‘¨β€πŸ‘¦ πŸ‘©β€πŸ‘©β€πŸ‘§ πŸ‘¨β€πŸ‘¦ πŸ‘¨β€πŸ‘§β€πŸ‘¦ πŸ‘©β€πŸ‘¦ πŸ‘©β€πŸ‘§β€πŸ‘¦/
ri = /🚾 πŸ†’ πŸ†“ πŸ†• πŸ†– πŸ†— πŸ†™ 🏧/
rj = /0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ πŸ”Ÿ/

0 comments on commit bd00146

Please sign in to comment.