Merge pull request eclipse-rdf4j#843 from jamesrdf/issues/eclipse-rdf4j#62-long-unicode

Fix eclipse-rdf4j#62: Decode both short and long unicode SPARQL escape. Signed-off-by: Heshan Jayasinghe <shanujse@gmail.com>
heshanjse · Jun 9, 2017 · 893c4f2 · 893c4f2
2 parents b9bb20f + bbc816e
commit 893c4f2
Show file tree

Hide file tree

Showing 6 changed files with 272 additions and 64 deletions.
diff --git a/...ueryparser/sparql/src/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/CharStream.java b/...ueryparser/sparql/src/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/CharStream.java
@@ -0,0 +1,115 @@
+/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 5.0 */
+/* JavaCCOptions:STATIC=false,SUPPORT_CLASS_VISIBILITY_PUBLIC=true */
+package org.eclipse.rdf4j.query.parser.sparql.ast;
+
+/**
+ * This interface describes a character stream that maintains line and
+ * column number positions of the characters.  It also has the capability
+ * to back up the stream to some extent.  An implementation of this
+ * interface is used in the TokenManager implementation generated by
+ * JavaCCParser.
+ *
+ * All the methods except {@code backup} can be implemented in any fashion.
+ * {@code backup} must be implemented correctly for the lexer to operate
+ * correctly.  The rest of the methods only report information such as line
+ * number, column number and the String that constitutes a token, and are not
+ * used by the lexer.  Hence their implementation won't affect the generated
+ * lexer's operation.
+ */
+
+public
+interface CharStream {
+
+  /**
+   * Returns the next character from the selected input.  How the input
+   * is selected is the responsibility of the implementing class.
+   * @throws java.io.IOException any I/O failure raised by the implementation
+   */
+  char readChar() throws java.io.IOException;
+
+  @Deprecated
+  /**
+   * Returns the column position of the character last read.
+   * @deprecated see {@link #getEndColumn()}
+   * @see #getEndColumn
+   */
+  int getColumn();
+
+  @Deprecated
+  /**
+   * Returns the line number of the character last read.
+   * @deprecated see {@link #getEndLine()}
+   * @see #getEndLine
+   */
+  int getLine();
+
+  /**
+   * Returns the column number of the last character of the current token
+   * (being matched after the last call to {@link #BeginToken()}).
+   */
+  int getEndColumn();
+
+  /**
+   * Returns the line number of the last character of the current token
+   * (being matched after the last call to {@link #BeginToken()}).
+   */
+  int getEndLine();
+
+  /**
+   * Returns the column number of the first character of the current token
+   * (being matched after the last call to {@link #BeginToken()}).
+   */
+  int getBeginColumn();
+
+  /**
+   * Returns the line number of the first character of the current token
+   * (being matched after the last call to {@link #BeginToken()}).
+   */
+  int getBeginLine();
+
+  /**
+   * Backs up the input stream by {@code amount} steps. The lexer calls this
+   * method if it has already read some characters but could not use them to
+   * match a (longer) token. They will be used again as the prefix of the next
+   * token, and it is the implementation's responsibility to do this right.
+   */
+  void backup(int amount);
+
+  /**
+   * Returns the next character, which marks the beginning of the next token.
+   * All characters must remain in the buffer between two successive calls
+   * to this method so that {@link #backup(int)} can be implemented correctly.
+   */
+  char BeginToken() throws java.io.IOException;
+
+  /**
+   * Returns a string made up of the characters from the marked token beginning
+   * to the current buffer position. Implementations may return anything they
+   * want to. For example, for efficiency, one might decide to just return
+   * null, which is a valid implementation.
+   */
+  String GetImage();
+
+  /**
+   * Returns an array of characters that make up the suffix of length
+   * {@code len} of the currently matched token. This is used to build up the
+   * matched string for use in actions in the case of MORE. A simple but
+   * inefficient implementation is as follows:
+   *
+   *   {
+   *      String t = GetImage();
+   *      return t.substring(t.length() - len, t.length()).toCharArray();
+   *   }
+   */
+  char[] GetSuffix(int len);
+
+  /**
+   * The lexer calls this method to indicate that it is done with the stream,
+   * so implementations can free any resources held by this class.
+   * The body of this method can be left empty without affecting the
+   * lexer's operation.
+   */
+  void Done();
+
+}
+/* JavaCC - OriginalChecksum=d5d02d7f2852c9b712f39bed41ca22b5 (do not edit this line) */
diff --git a/...ser/sparql/src/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/SyntaxTreeBuilder.java b/...ser/sparql/src/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/SyntaxTreeBuilder.java
@@ -29,11 +29,10 @@ public class SyntaxTreeBuilder/*@bgen(jjtree)*/implements SyntaxTreeBuilderTreeC
         public static ASTQueryContainer parseQuery(String query)
                 throws TokenMgrError, ParseException
         {
-                SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new StringReader(query) );
+                SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new UnicodeEscapeStream(new StringReader(query), 1) );
 
                 // Set size of tab to 1 to force tokenmanager to report correct column
                 // index for substring splitting of service graph pattern.
-                stb.jj_input_stream.setTabSize(1);
 
                 ASTQueryContainer container = stb.QueryContainer();
                 container.setSourceString(query);
@@ -51,11 +50,10 @@ public static ASTQueryContainer parseQuery(String query)
         public static ASTUpdateSequence parseUpdateSequence(String sequence)
                 throws TokenMgrError, ParseException
         {
-                SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new StringReader(sequence) );
+                SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new UnicodeEscapeStream(new StringReader(sequence), 1) );
 
                 // Set size of tab to 1 to force tokenmanager to report correct column
                 // index for substring splitting of service graph pattern.
-                stb.jj_input_stream.setTabSize(1);
 
                 ASTUpdateSequence seq = stb.UpdateSequence();
                 seq.setSourceString(sequence);
@@ -8159,6 +8157,11 @@ private boolean jj_2_7(int xla) {
     finally { jj_save(6, xla); }
   }
 
+  private boolean jj_3R_67() {
+    if (jj_scan_token(LBRACK)) return true;
+    return false;
+  }
+
   private boolean jj_3R_61() {
     Token xsp;
     xsp = jj_scanpos;
@@ -8648,14 +8651,8 @@ private boolean jj_3R_84() {
     return false;
   }
 
-  private boolean jj_3R_67() {
-    if (jj_scan_token(LBRACK)) return true;
-    return false;
-  }
-
   /** Generated Token Manager. */
   public SyntaxTreeBuilderTokenManager token_source;
-  JavaCharStream jj_input_stream;
   /** Current token. */
   public Token token;
   /** Next token. */
@@ -8701,41 +8698,9 @@ private static void jj_la1_init_5() {
   private boolean jj_rescan = false;
   private int jj_gc = 0;
 
-  /** Constructor with InputStream. */
-  public SyntaxTreeBuilder(java.io.InputStream stream) {
-     this(stream, null);
-  }
-  /** Constructor with InputStream and supplied encoding */
-  public SyntaxTreeBuilder(java.io.InputStream stream, String encoding) {
-    try { jj_input_stream = new JavaCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
-    token_source = new SyntaxTreeBuilderTokenManager(jj_input_stream);
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 174; i++) jj_la1[i] = -1;
-    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
-  }
-
-  /** Reinitialise. */
-  public void ReInit(java.io.InputStream stream) {
-     ReInit(stream, null);
-  }
-  /** Reinitialise. */
-  public void ReInit(java.io.InputStream stream, String encoding) {
-    try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
-    token_source.ReInit(jj_input_stream);
-    token = new Token();
-    jj_ntk = -1;
-    jjtree.reset();
-    jj_gen = 0;
-    for (int i = 0; i < 174; i++) jj_la1[i] = -1;
-    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
-  }
-
-  /** Constructor. */
-  public SyntaxTreeBuilder(java.io.Reader stream) {
-    jj_input_stream = new JavaCharStream(stream, 1, 1);
-    token_source = new SyntaxTreeBuilderTokenManager(jj_input_stream);
+  /** Constructor with user supplied CharStream. */
+  public SyntaxTreeBuilder(CharStream stream) {
+    token_source = new SyntaxTreeBuilderTokenManager(stream);
     token = new Token();
     jj_ntk = -1;
     jj_gen = 0;
@@ -8744,9 +8709,8 @@ public SyntaxTreeBuilder(java.io.Reader stream) {
   }
 
   /** Reinitialise. */
-  public void ReInit(java.io.Reader stream) {
-    jj_input_stream.ReInit(stream, 1, 1);
-    token_source.ReInit(jj_input_stream);
+  public void ReInit(CharStream stream) {
+    token_source.ReInit(stream);
     token = new Token();
     jj_ntk = -1;
     jjtree.reset();
@@ -8865,18 +8829,21 @@ private void jj_add_error_token(int kind, int pos) {
       for (int i = 0; i < jj_endpos; i++) {
         jj_expentry[i] = jj_lasttokens[i];
       }
-      jj_entries_loop: for (java.util.Iterator<?> it = jj_expentries.iterator(); it.hasNext();) {
+      boolean exists = false;
+      for (java.util.Iterator<?> it = jj_expentries.iterator(); it.hasNext();) {
+        exists = true;
         int[] oldentry = (int[])(it.next());
         if (oldentry.length == jj_expentry.length) {
           for (int i = 0; i < jj_expentry.length; i++) {
             if (oldentry[i] != jj_expentry[i]) {
-              continue jj_entries_loop;
+              exists = false;
+              break;
             }
           }
-          jj_expentries.add(jj_expentry);
-          break jj_entries_loop;
+          if (exists) break;
         }
       }
+      if (!exists) jj_expentries.add(jj_expentry);
       if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
     }
   }

diff --git a/...rc/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/SyntaxTreeBuilderTokenManager.java b/...rc/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/SyntaxTreeBuilderTokenManager.java
@@ -3315,25 +3315,23 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
 static final long[] jjtoSpecial = {
    0x8L, 0x0L, 0x0L, 
 };
-protected JavaCharStream input_stream;
+protected CharStream input_stream;
 private final int[] jjrounds = new int[157];
 private final int[] jjstateSet = new int[314];
 protected char curChar;
 /** Constructor. */
-public SyntaxTreeBuilderTokenManager(JavaCharStream stream){
-   if (JavaCharStream.staticFlag)
-      throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer.");
+public SyntaxTreeBuilderTokenManager(CharStream stream){
    input_stream = stream;
 }
 
 /** Constructor. */
-public SyntaxTreeBuilderTokenManager(JavaCharStream stream, int lexState){
+public SyntaxTreeBuilderTokenManager(CharStream stream, int lexState){
    this(stream);
    SwitchTo(lexState);
 }
 
 /** Reinitialise parser. */
-public void ReInit(JavaCharStream stream)
+public void ReInit(CharStream stream)
 {
    jjmatchedPos = jjnewStateCnt = 0;
    curLexState = defaultLexState;
@@ -3349,7 +3347,7 @@ private void ReInitRounds()
 }
 
 /** Reinitialise parser. */
-public void ReInit(JavaCharStream stream, int lexState)
+public void ReInit(CharStream stream, int lexState)
 {
    ReInit(stream);
    SwitchTo(lexState);