Skip to content
This repository has been archived by the owner on Feb 26, 2024. It is now read-only.

Commit

Permalink
Merge pull request #4 from HParker/convert-byte-offset-to-char-offset
Browse files Browse the repository at this point in the history
assure token encoding is correct and locations are in characters, not bytes
  • Loading branch information
kddnewton committed Dec 1, 2023
2 parents bdef470 + 0586e26 commit bd00146
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 23 deletions.
10 changes: 8 additions & 2 deletions lib/parser/prism.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def try_declare_numparam(node)
private

# Builds the parser-gem AST by walking the Prism program node with a
# Compiler. The shared offset_cache is passed so every node location is
# reported in character offsets rather than byte offsets.
def build_ast(program)
  program.accept(Compiler.new(self, offset_cache))
end

def build_comments(comments)
Expand All @@ -89,7 +89,13 @@ def build_comments(comments)
end

# Converts the raw Prism token list into the token tuples the parser gem
# expects, passing the offset_cache so the Lexer can translate byte
# offsets into character offsets.
def build_tokens(tokens)
  Lexer.new(source_buffer, tokens.map(&:first), offset_cache).to_a
end

def offset_cache
@offset_cache ||= Hash.new do |h, k|
h[k] = @source_buffer.source.byteslice(0, k).length
end
end
end
end
Expand Down
15 changes: 8 additions & 7 deletions lib/parser/prism/compiler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
module Parser
class Prism
class Compiler < ::Prism::Compiler
attr_reader :parser, :builder, :source_buffer
attr_reader :parser, :builder, :source_buffer, :offset_cache
attr_reader :locals, :in_destructure, :in_pattern

def initialize(parser, locals: nil, in_destructure: false, in_pattern: false)
def initialize(parser, offset_cache, locals: nil, in_destructure: false, in_pattern: false)
@parser = parser
@builder = parser.builder
@source_buffer = parser.source_buffer
@offset_cache = offset_cache

@locals = locals
@in_destructure = in_destructure
Expand Down Expand Up @@ -1560,7 +1561,7 @@ def visit_yield_node(node)
private

# Creates a child compiler that shares this compiler's parser and
# offset_cache but may override the scoping flags (used when descending
# into blocks, destructuring targets, and patterns).
def copy_compiler(locals: self.locals, in_destructure: self.in_destructure, in_pattern: self.in_pattern)
  Compiler.new(parser, offset_cache, locals: locals, in_destructure: in_destructure, in_pattern: in_pattern)
end

# Blocks can have a special set of parameters that automatically expand
Expand All @@ -1579,12 +1580,12 @@ def procarg0?(parameters)

# Constructs a new source range from the given location's start and end
# byte offsets, translated into character offsets via offset_cache.
# Returns nil when the location is nil.
def srange(location)
  Source::Range.new(source_buffer, offset_cache[location.start_offset], offset_cache[location.end_offset]) if location
end

# Constructs a new source range from the given start and end byte
# offsets, translated into character offsets via offset_cache.
def srange_offsets(start_offset, end_offset)
  Source::Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
end

# Constructs a new source range by finding the given tokens between the
# Constructs a new source range by finding the first of the given tokens
# that appears between the start and end byte offsets. Returns the token
# together with a character-offset source range, or nil when no token is
# found.
def srange_find(start_offset, end_offset, tokens)
  tokens.find do |token|
    next unless (index = source_buffer.source.byteslice(start_offset...end_offset).index(token))
    offset = start_offset + index
    return [token, Source::Range.new(source_buffer, offset_cache[offset], offset_cache[offset + token.length])]
  end
end

# Transforms a Prism location into the [value, range] token pair the
# parser gem expects, with the range expressed in character offsets.
# Returns nil when the location is nil.
def token(location)
  [location.slice, Source::Range.new(source_buffer, offset_cache[location.start_offset], offset_cache[location.end_offset])] if location
end

# Visit a block node on a call.
Expand Down
29 changes: 15 additions & 14 deletions lib/parser/prism/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ class Lexer

private_constant :TYPES

# The source buffer, the prelexed Prism tokens, and the shared
# byte-offset -> character-offset cache.
attr_reader :buffer, :lexed, :offset_cache

# @param buffer       the parser-gem source buffer being lexed
# @param lexed        the list of tokens produced by Prism's lexer
# @param offset_cache the shared byte-offset -> character-offset cache
def initialize(buffer, lexed, offset_cache)
  @buffer = buffer
  @lexed = lexed
  @offset_cache = offset_cache
end

def to_a
Expand All @@ -195,24 +196,24 @@ def to_a

type = TYPES.fetch(token.type)
value = token.value
location = Source::Range.new(buffer, token.location.start_offset, token.location.end_offset)
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])

case type
when :tCHARACTER
value.delete_prefix!("?")
when :tCOMMENT
if token.type == :EMBDOC_BEGIN
until (next_token = lexed[index]) && next_token.type == :EMBDOC_END
value += next_token.location.slice
value += next_token.value
index += 1
end

value += lexed[index].location.slice
location = Source::Range.new(buffer, token.location.start_offset, lexed[index].location.end_offset)
value += next_token.value
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index].location.end_offset])
index += 1
else
value.chomp!
location = Source::Range.new(buffer, token.location.start_offset, token.location.end_offset - 1)
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
end
when :tNL
value = nil
Expand All @@ -223,7 +224,7 @@ def to_a
value = Complex(0, value.end_with?("r") ? Rational(value.chomp("r")) : value)
when :tINTEGER
if value.start_with?("+")
tokens << [:tUNARY_NUM, ["+", Source::Range.new(buffer, token.location.start_offset, token.location.start_offset + 1)]]
tokens << [:tUNARY_NUM, ["+", Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
location = Source::Range.new(buffer, token.location.start_offset + 1, token.location.end_offset)
end

Expand All @@ -245,13 +246,13 @@ def to_a
next_location = token.location.join(next_token.location)
type = :tSTRING
value = ""
location = Source::Range.new(buffer, next_location.start_offset, next_location.end_offset)
location = Source::Range.new(buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 1
elsif ["\"", "'"].include?(value) && (next_token = lexed[index]) && next_token.type == :STRING_CONTENT && (next_next_token = lexed[index + 1]) && next_next_token.type == :STRING_END
next_location = token.location.join(next_next_token.location)
type = :tSTRING
value = next_token.location.slice
location = Source::Range.new(buffer, next_location.start_offset, next_location.end_offset)
value = next_token.value
location = Source::Range.new(buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 2
elsif value.start_with?("<<")
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
Expand All @@ -262,13 +263,13 @@ def to_a
when :tSTRING_END
if token.type == :REGEXP_END
value = value[0]
location = Source::Range.new(buffer, token.location.start_offset, token.location.start_offset + 1)
location = Source::Range.new(buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
end
when :tSYMBEG
if (next_token = lexed[index]) && next_token.type != :STRING_CONTENT
next_location = token.location.join(next_token.location)
type = :tSYMBOL
value = next_token.location.slice
value = next_token.value
location = Source::Range.new(buffer, next_location.start_offset, next_location.end_offset)
index += 1
end
Expand All @@ -281,7 +282,7 @@ def to_a
tokens << [type, [value, location]]

if token.type == :REGEXP_END
tokens << [:tREGEXP_OPT, [token.location.slice[1..], Source::Range.new(buffer, token.location.start_offset + 1, token.location.end_offset)]]
tokens << [:tREGEXP_OPT, [token.value[1..], Source::Range.new(buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
end
end

Expand Down
24 changes: 24 additions & 0 deletions test/fixtures/multibyte.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Fixture exercising multibyte (emoji) source text, so token and node
# locations must be reported in characters rather than bytes.
# credits to https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt

a = "😍"
b = "πŸ‘©πŸ½"
c = "πŸ‘¨β€πŸ¦° πŸ‘¨πŸΏβ€πŸ¦° πŸ‘¨β€πŸ¦± πŸ‘¨πŸΏβ€πŸ¦± πŸ¦ΉπŸΏβ€β™‚οΈ"
d = "πŸ‘Ύ πŸ™‡ πŸ’ πŸ™… πŸ™† πŸ™‹ πŸ™Ž πŸ™"
e = "🐡 πŸ™ˆ πŸ™‰ πŸ™Š"
f = "❀️ πŸ’” πŸ’Œ πŸ’• πŸ’ž πŸ’“ πŸ’— πŸ’– πŸ’˜ πŸ’ πŸ’Ÿ πŸ’œ πŸ’› πŸ’š πŸ’™"
g = "βœ‹πŸΏ πŸ’ͺ🏿 πŸ‘πŸΏ πŸ™ŒπŸΏ πŸ‘πŸΏ πŸ™πŸΏ"
h = "πŸ‘¨β€πŸ‘©β€πŸ‘¦ πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦ πŸ‘¨β€πŸ‘¨β€πŸ‘¦ πŸ‘©β€πŸ‘©β€πŸ‘§ πŸ‘¨β€πŸ‘¦ πŸ‘¨β€πŸ‘§β€πŸ‘¦ πŸ‘©β€πŸ‘¦ πŸ‘©β€πŸ‘§β€πŸ‘¦"
i = "🚾 πŸ†’ πŸ†“ πŸ†• πŸ†– πŸ†— πŸ†™ 🏧"
j = "0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ πŸ”Ÿ"


# The same strings again, as regular expression literals.
ra = /😍/
rb = /πŸ‘©πŸ½/
rc = /πŸ‘¨β€πŸ¦° πŸ‘¨πŸΏβ€πŸ¦° πŸ‘¨β€πŸ¦± πŸ‘¨πŸΏβ€πŸ¦± πŸ¦ΉπŸΏβ€β™‚οΈ/
rd = /πŸ‘Ύ πŸ™‡ πŸ’ πŸ™… πŸ™† πŸ™‹ πŸ™Ž πŸ™/
re = /🐡 πŸ™ˆ πŸ™‰ πŸ™Š/
rf = /❀️ πŸ’” πŸ’Œ πŸ’• πŸ’ž πŸ’“ πŸ’— πŸ’– πŸ’˜ πŸ’ πŸ’Ÿ πŸ’œ πŸ’› πŸ’š πŸ’™/
rg = /βœ‹πŸΏ πŸ’ͺ🏿 πŸ‘πŸΏ πŸ™ŒπŸΏ πŸ‘πŸΏ πŸ™πŸΏ/
rh = /πŸ‘¨β€πŸ‘©β€πŸ‘¦ πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦ πŸ‘¨β€πŸ‘¨β€πŸ‘¦ πŸ‘©β€πŸ‘©β€πŸ‘§ πŸ‘¨β€πŸ‘¦ πŸ‘¨β€πŸ‘§β€πŸ‘¦ πŸ‘©β€πŸ‘¦ πŸ‘©β€πŸ‘§β€πŸ‘¦/
ri = /🚾 πŸ†’ πŸ†“ πŸ†• πŸ†– πŸ†— πŸ†™ 🏧/
rj = /0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ πŸ”Ÿ/

0 comments on commit bd00146

Please sign in to comment.