diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs index 4b57649a954..7660c23f6ba 100644 --- a/.settings/org.eclipse.jdt.core.prefs +++ b/.settings/org.eclipse.jdt.core.prefs @@ -1,12 +1,12 @@ -#Mon Jan 02 00:48:54 CST 2006 +#Mon Sep 25 23:02:20 CDT 2006 eclipse.preferences.version=1 -org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=disabled -org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.4 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve -org.eclipse.jdt.core.compiler.compliance=1.4 +org.eclipse.jdt.core.compiler.compliance=1.5 org.eclipse.jdt.core.compiler.debug.lineNumber=generate org.eclipse.jdt.core.compiler.debug.localVariable=generate org.eclipse.jdt.core.compiler.debug.sourceFile=generate org.eclipse.jdt.core.compiler.problem.assertIdentifier=error -org.eclipse.jdt.core.compiler.problem.enumIdentifier=warning -org.eclipse.jdt.core.compiler.source=1.4 +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.source=1.5 diff --git a/.settings/org.eclipse.jdt.ui.prefs b/.settings/org.eclipse.jdt.ui.prefs index cc204d65d6b..a061f461dcf 100644 --- a/.settings/org.eclipse.jdt.ui.prefs +++ b/.settings/org.eclipse.jdt.ui.prefs @@ -1,3 +1,3 @@ -#Thu Sep 29 15:25:20 CDT 2005 +#Mon Sep 25 23:02:20 CDT 2006 eclipse.preferences.version=1 -internal.default.compliance=user +internal.default.compliance=default diff --git a/src/org/jruby/util/string/UcharIterator.java b/src/org/jruby/util/string/UcharIterator.java new file mode 100644 index 00000000000..10b5da6e839 --- /dev/null +++ b/src/org/jruby/util/string/UcharIterator.java @@ -0,0 +1,78 @@ +package org.jruby.util.string; + +/** + * UcharIterator - an Iterator on Unicode characters in a UTF-8 byte array. + * + *

A conventional Iterator, except that remove() is not supported and there's an extra + * nextChar() method that returns a naked int as opposed to a wrapped Integer. + * 

+ * + * @author Tim Bray + * @see org.jruby.util.string.Ustr + */ + +public class UcharIterator implements java.util.Iterator, java.io.Serializable { + private Ustr u; + private int next; + + /** + * Creates a new UcharIterator starting at an offset in a buffer. + * + * @param s the byte array containing UTF-8-encoded Unicode characters. + * @param offset how far into the array to start iterating. + */ + public UcharIterator(byte[] s, int offset) { + u = new Ustr(s, offset); + u.prepareNext(); + next = u.nextChar(); + } + + /** + * Tests whether there are any more characters in the buffer. + * + * @return true or false depending on whether there are more characters. + */ + public boolean hasNext() { + return (next != 0); + } + + /** + * Retrieve the next Unicode character from a UTF-8 byte buffer, wrapped + * in an Integer object. Throws NoSuchElementException if hasNext + * would return false. + * + * @return the next Unicode character as a java.lang.Integer + * @throws NoSuchElementException + */ + public Object next() { + if (next == 0) + throw new java.util.NoSuchElementException("Ran off end of array"); + Integer i = new Integer(next); + next = u.nextChar(); + return i; + } + + /** + * Retrieve the next Unicode character from a UTF-8 byte buffer and return + * it as an int. Once the null-termination is hit, returns 0 as many times + * as you want to call it. + * + * @return the next Unicode character as an int, 0 on end-of-string. + */ + public int nextChar() { + int i = next; + if (i != 0) + next = u.nextChar(); + return i; + } + + /** + * Throws an UnsupportedOperationException. 
+ * + * @throws UnsupportedOperationException + */ + public void remove() { + throw new UnsupportedOperationException("UcharIterator doesn't remove"); + } + +} diff --git a/src/org/jruby/util/string/Ustr.java b/src/org/jruby/util/string/Ustr.java new file mode 100644 index 00000000000..6786a1103d5 --- /dev/null +++ b/src/org/jruby/util/string/Ustr.java @@ -0,0 +1,1442 @@ +package org.jruby.util.string; +import java.io.Serializable; +import java.util.Hashtable; + +/** + * Ustr - rhymes with Wooster. + * Implements a string, with three design goals: + * + *
    + *
  1. Correct implementation of Unicode semantics.
  2. + *
  3. Support for as many of java's String and StringBuffer methods as + * is reasonable.
  4. + *
  5. Support for the familiar null-terminated-string primitives + * of the C programming language: strcpy() and friends.
+ * + *

A Ustr is a fairly thin wrapper around a byte[] array, which + * contains null-terminated UTF8-encoded text.

+ * + *

Note that in the context of a Ustr, "index" always means how + * many Unicode characters you are into the Ustr's text, while "offset" + * always means how many bytes you are into its UTF8 encoded form.

+ * + *

Similarly, "char" and "String" always refer to the Java constructs, + * while "character" always means a Unicode character, always identified + * by a Java int.

+ * + *

If any of the Ustr methods are passed an integer alleged to represent + * a Unicode character whose value is not a valid code point, i.e. is either + * negative or greater than 0x10ffff, the method will throw a UstrException, + * which extends RuntimeException and is thus not checked at compile-time.

+ * + *

For any method that copies characters and might overrun a buffer, a + * "safe" version is provided, starting with an extra s, e.g. + * sstrcpy and sstrcat. These versions always + * arrange that the copied string not overrun the provided buffer, which + * will be properly null-terminated.

+ * + * @author Tim Bray + * @see org.jruby.util.string.UstrException + */ +public class Ustr + implements Comparable, Serializable { + + // the number of bytes of UTF8, indexed by the value of the first byte + private static final byte[] encLength = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, -1, -1, -1, -1, -1, -1, -1, -1 + }; + + + private static Hashtable interns = new Hashtable(); + + /** + * A null-terminated byte array containing the string in UTF-8 form. All + * Ustr operations count on null-termination. The byte array may + * be much bigger than the contained string + */ + public byte[] s; // UTF-encoded text + + /** + * Where in the array s the string starts. You can + * have lots of different Ustrs co-existing in a single byte array. + */ + public int base = 0; // of the start of the string + + /** + * To keep track of a single character position within the string; + * this is used by the nextChar and appendChar + * methods. + */ + public int offset = 0; // for iterating, relative to base + + /** + * Creates an empty Ustr with no buffer + */ + public Ustr() { + base = offset = 0; + } + /** + * Creates an empty Ustr, with a null termination at the front. 
+ * + * @param length length of the buffer, in bytes + */ + public Ustr(int length) { + s = new byte[length]; + base = offset = 0; + s[0] = 0; + } + /** + * Wraps a Ustr around a buffer. Does not do null termination, so you + * can pass in a buffer already containing a string. + * + * @param bytes the buffer + */ + public Ustr(byte[] bytes) { + s = bytes; + base = offset = 0; + } + /** + * Wraps a Ustr around a position in a buffer. Does not do null + * termination, so you can pass in a buffer already containing a string. + * + * @param bytes the buffer + * @param start where in the buffer the strings starts + */ + public Ustr(byte[] bytes, int start) { + s = bytes; + base = offset = start; + } + /** + * Makes a Ustr which is a copy of another Ustr + * + * @param from the Ustr to copy + */ + public Ustr(Ustr from) { + s = new byte[from.strlen() + 1]; + base = offset = 0; + strcpy(from); + } + /** + * Makes a Ustr from a char[] array. The Ustr is null-terminated, but + * no space is allocated beyond what's needed. + * + * @param chars the char array + */ + public Ustr(char [] chars) { + + int size = 0; + for (char utf16 : chars) { + // this works because surrogate characters will be counted as 2 + // each, and anything in the astral planes takes 4 bytes. + size += bytesInChar(utf16); + } + s = new byte[size + 1]; + base = 0; + prepareAppend(); + int i = 0; + while (i < chars.length) { + int val = Character.codePointAt(chars, i); + if (val > 0xffff) + i += 2; + else + i++; + appendChar(val); + } + s[s.length - 1] = 0; + } + + /** + * Makes a Ustr from an int[] array, where each int is the value of + * a Unicode character. Throws a UstrException if one of the ints + * is not a Unicode codepoint (negative or >0x10ffff). 
+ * + * @param ints the int array + * @throws UstrException + * + */ + public Ustr(int [] ints) { + int bufsiz = 0; + + for (int i : ints) { + if (i < 0) + throw new UstrException("Negative character value"); + if (i > 0x10ffff) + throw new UstrException("Character out of Unicode range"); + + bufsiz += bytesInChar(i); + + } + s = new byte[bufsiz + 1]; + base = offset = 0; + + for (int i : ints) + appendChar(i); + } + + /** + * Makes a Ustr from an object, based on its toString(). + * Most commonly used with a String argument. The Ustr is null-terminated, + * but no space is allocated beyond what's needed. Throws a UstrException + * if the environment doesn't support the UTF8 encoding. + * + * @param o the Object + * @throws UstrException + */ + public Ustr(Object o) { + byte[] inbytes; + + base = offset = 0; + try { + inbytes = o.toString().getBytes("UTF8"); + } catch (java.io.UnsupportedEncodingException e) { + throw new UstrException("UTF8 not supported!?!?"); + } + + // because we need one more byte than getBytes provides + s = new byte[inbytes.length + 1]; + for (int i = 0; i < inbytes.length; i++) + s[i] = inbytes[i]; + + s[inbytes.length] = 0; + } + /** + * Makes a Ustr from an object, based on its toString(), + * leaving room for growth. Most commonly used with a String argument. + * The Ustr is null-terminated. + * + * @param space How large a buffer to allocate + * @param o The object + */ + public Ustr(int space, Object o) { + s = new byte[space]; + base = offset = 0; + byte [] b; + + try { + b = o.toString().getBytes("UTF8"); + } catch (java.io.UnsupportedEncodingException e) { + throw new RuntimeException("UTF8 not supported!?!?"); + } + + for (int i = 0; i < b.length; i++) + s[i] = b[i]; + + s[b.length] = 0; + } + + /** + * Empty a Ustr by setting its first byte to 0. + */ + public void init() { + s[base] = 0; + offset = base; + } + + /** + * Supports the Comparable interface. 
The ordering is that of + * native Unicode code points and probably not culturally appropriate + * anywhere. + * + * @param other the object compared + * @return -1, 0, or 1 as you'd expect. + */ + public int compareTo(Object other) { + Ustr o = (other instanceof Ustr) ? (Ustr) other : new Ustr(other); + return strcmp(s, base, o.s, o.base); + } + + /** + * Generates a Java String representing the Ustr. Throws a UstrException + * if the Java environment doesn't support the UTF8 encoding. + * + * @return the String. + * @throws UstrException + */ + public String toString() { + try { + return new String(s, base, strlen(), "UTF8"); + } catch (java.io.UnsupportedEncodingException e) { + throw new UstrException("UTF8 not supported!?!?"); + } + } + + // per-Unicode-character operations + // + /** + * Length of a Ustr in Unicode characters (not bytes). + * + * @return the number of Unicode characters. + */ + public int length() { + int saveOffset = offset; + int l = 0; + for (prepareNext(); nextChar() != 0; l++) + ; // empty + offset = saveOffset; + return l; + } + /** + * Number of Unicode characters stored starting at some offset in a byte + * array. Assumes UTF-8 encoding and null termination. + * + * @param b the byte array + * @param offset where to start counting + * @return the number of unicode characters. + */ + public static int length(byte [] b, int offset) { + return (new Ustr(b, offset)).length(); + } + /** + * Number of Unicode characters stored in a byte array. Assumes UTF-8 + * encoding and null termination. + * + * @param b the byte array + * @return the number of Unicode characters. + */ + public static int length(byte [] b) { + return length(b, 0); + } + + /** + * Number of Unicode characters stored in a Java string. + * if s is a String, s.length() and + * Ustr.length(s) will be the same except when s + * contains non-BMP characters. 
+ * + * @param str the string + * @return the number of Unicode characters + */ + public static int length(String str) { + return (new Ustr(str)).length(); + } + + /** + * Set up for appendChar. Points the offset + * field at the buffer's null terminator. + */ + public void prepareAppend() { + offset = strlen(); + } + /** + * Append one Unicode character to a Ustr. Assumes that the + * offset points to the null-termination, + * where the character ought to go, updates that field and applies + * another null termination. You could change the value of + * offset and start "appending" into the middle of a Ustr + * if that's what you wanted. This generates the UTF-8 bytes from + * the input characters. + *

If the character is less than 128, one byte of buffer is used. + * If less than 0x800, two bytes. If less than 2**16, three bytes. + * If no greater than 0x10ffff, four bytes. If greater than 0x10ffff, or + * negative, you get an exception.

+ * + * @param c the character to be appended. + */ + public void appendChar(int c) { + offset = appendChar(c, s, offset); + } + + /** + * Writes one Unicode character into a UTF-8 encoded byte array at + * a given offset, and null-terminates it. Throws a UstrException if + * the 'c' argument is not a Unicode codepoint (negative or >0x10ffff) + * + * @param c the Unicode character + * @param s the array + * @param offset the offset to write at + * @return the offset of the null byte after the encoded character + * @throws UstrException + */ + public static int appendChar(int c, byte [] s, int offset) { + if (c < 0) + throw new UstrException("Appended negative character"); + if (c < 128) + s[offset++] = (byte) c; + else if (c <= 0x7ff) { + s[offset++] = (byte) ( (c >> 6) | 0xc0); + s[offset++] = (byte) ((c & 0x3f) | 0x80); + } else if (c <= 0xffff) { + s[offset++] = (byte) ( (c >> 12) | 0xe0); + s[offset++] = (byte) (((c >> 6) & 0x3f) | 0x80); + s[offset++] = (byte) ( (c & 0x3f) | 0x80); + } else if (c <= 0x10ffff) { + s[offset++] = (byte) ( (c >> 18) | 0xf0); + s[offset++] = (byte) (((c >> 12) & 0x3f) | 0x80); + s[offset++] = (byte) ( ((c >> 6) & 0x3f) | 0x80); + s[offset++] = (byte) ( (c & 0x3f) | 0x80); + } else + throw new UstrException("Appended character > 0x10ffff"); + s[offset] = 0; + return offset; + } + + /** + * Set up for nextChar(). Points the offset + * field at the start of the buffer. + */ + public void prepareNext() { + offset = base; + } + /** + * Retrieve one Unicode character from a Ustr and advance the working + * offset. Assumes the working offset is sanely located. 
+ * + * @return the Unicode character, 0 signaling the end of the string + */ + public int nextChar() { + if (s[offset] == 0) + return 0; + if ((s[offset] & 0x80) == 0) + return (int) s[offset++]; + if ((s[offset] & 0xe0) == 0xc0) { + // 110w wwww 10zz zzzz + // xxxx xwww wwzz zzzz + int c = (s[offset++] & 0x1f) << 6; + c |= s[offset++] & 0x3f; + return c; + } + if ((s[offset] & 0xf0) == 0xe0) { + // 1110 wwww 10zz zzzz 10xx xxxx + // wwww zzzz zzxx xxxx + int c = (s[offset++] & 0xf) << 12; + c |= (s[offset++] & 0x3f) << 6; + c |= s[offset++] & 0x3f; + return c; + } + // 1111 0www 10zz zzzz 10xx xxxx 10yy yyyy + // wwwwzz zzzzxxxx xxyyyyyy + int c = (s[offset++] & 0x7) << 18; + c |= (s[offset++] & 0x3f) << 12; + c |= (s[offset++] & 0x3f) << 6; + c |= s[offset++] & 0x3f; + return c; + } + + // Strlen variants + // + /** + * The length in bytes of a Ustr's UTF representation. Assumes + * null-termination. + * + * @return the number of bytes + */ + public int strlen() { + return strlen(s, base); + } + /** + * The length in bytes of a null-terminated byte array + * + * @param b the array + * @return the number of bytes + */ + public static int strlen(byte [] b) { + int i = 0; + while (b[i] != 0) + i++; + return i; + } + /** + * The length in bytes of a null-terminated sequence starting at some + * offset in a byte array. + * + * @param b the byte array + * @param base the byte offset to start counting at + * @return the number of bytes + */ + public static int strlen(byte [] b, int base) { + int i = base; + while (b[i] != 0) + i++; + return i - base; + } + + // Strcpy variants + // + /** + * Copy a null-terminated byte array. + * + * @param to destination array + * @param from source array + * @return the destination array + */ + public static byte [] strcpy(byte [] to, byte [] from) { + return strcpy(to, 0, from, 0); + } + /** + * Copy null-terminated byte arrays with control over offsets. 
+ * + * @param to destination array + * @param tbase starting offset in destination array + * @param from source array + * @param fbase starting offset in source array + * @return the destination array + */ + public static byte [] strcpy(byte [] to, int tbase, byte [] from, int fbase) { + while (from[fbase] != 0) + to[tbase++] = from[fbase++]; + to[tbase] = 0; + + return to; + } + /** + * Copy in the contents of another Ustr. Does not change the offset. + * + * @param from source Ustr + * @return this Ustr + */ + public Ustr strcpy(Ustr from) { + strcpy(s, base, from.s, from.base); + return this; + } + + /** + * Copy in the String representation of an Object. Does not change the + * offset. + * + * @param o the source object + * @return this Ustr + */ + public Ustr strcpy(Object o) { + strcpy(new Ustr(o)); + return this; + } + /** + * Copy in the contents of a null-terminated byte array. Does not change + * the offset. + * + * @param from the byte array + * @return this Ustr + */ + public Ustr strcpy(byte[] from) { + strcpy(s, from); + return this; + } + /** + * Copy in the contents at some offset in a null-terminated byte array. + * Does not change the offset. + * + * @param from the source byte array + * @param boffset where to start copying in the source array + * @return this Ustr + */ + public Ustr strcpy(byte[] from, int boffset) { + strcpy(s, 0, from, boffset); + return this; + } + /** + * + * Load a null-terminated UTF-8 encoding of a String into a byte array at + * the front. + * + * @param b the byte array + * @param s the String + * + * @return the byte array + */ + public static byte [] strcpy(byte [] b, String s) { + return strcpy(b, 0, s); + } + + /** + * Load a null-terminated UTF-8 encoding of a String into a byte array. 
+ * + * @param b the byte array + * @param offset where in the byte array to load + * @param s the String + * + * @return the byte array + */ + public static byte [] strcpy(byte [] b, int offset, String s) { + byte [] sbytes; + + try { sbytes = s.getBytes("UTF8"); } catch (java.io.UnsupportedEncodingException e) { + throw new RuntimeException("UTF8 not supported!?!?"); } + + for (int i = 0; i < sbytes.length; i++) + b[offset + i] = sbytes[i]; + b[offset + sbytes.length] = 0; + return b; + } + + + // safe versions + // could check for to.length myself, but since Java is necessarily + // doing this for me each time around the loop, why bother? + // + /** + * Safely append one Ustr to another. + * + * @param from the Ustr to be appended + * @return this + */ + public Ustr sstrcat(Ustr from) { + sstrcat(s, base, from.s, from.base); + return this; + } + + /** + * Safely append one null-terminated byte array to another. Destination + * buffer will not be overrun. + * + * @param to dest array + * @param from source array + * @return dest array + */ + public byte [] sstrcat(byte [] to, byte[] from) { + return sstrcat(to, 0, from, 0); + } + /** + * Safely append one null-terminated byte array to another with control + * over offsets. Destination buffer will not be overrun. + * + * @param to dest array + * @param tbase base of dest array + * @param from source array + * @param fbase base of source array + * @return to + */ + public static byte [] sstrcat(byte [] to, int tbase, byte [] from, int fbase) { + // don't want to catch if the dest string is malformed + while (to[tbase] != 0) + tbase++; + + try { + while (from[fbase] != 0) + to[tbase++] = from[fbase++]; + to[tbase] = 0; + + return to; + } catch (java.lang.ArrayIndexOutOfBoundsException e) { + if (tbase >= to.length) + to[to.length - 1] = 0; + else + throw e; + } + return to; + } + + /** + * Safely copy null-terminated byte arrays with control over offsets. + * Destination buffer will not be overrun. 
+ * + * @param to destination array + * @param tbase starting offset in destination array + * @param from source array + * @param fbase starting offset in source array` + * @return the destination array + */ + public static byte [] sstrcpy(byte [] to, int tbase, byte [] from, int fbase) { + try { + while (from[fbase] != 0) + to[tbase++] = from[fbase++]; + to[tbase] = 0; + } + + catch (java.lang.ArrayIndexOutOfBoundsException e) { + // if the buffer's too short + if (tbase >= to.length) + to[to.length - 1] = 0; + + // otherwise there's some problem with the source string, we + // shouldn't catch it + else + throw e; + } + return to; + } + /** + * Safely copy a null-terminated byte array. The destination buffer will not + * be overrun. + * + * @param to destination array + * @param from source array + * @return the destination array + */ + public static byte [] sstrcpy(byte [] to, byte [] from) { + return sstrcpy(to, 0, from, 0); + } + + /** + * Safely copy in the contents of another Ustr. Does not change the offset. + * The destination buffer will not be overrun. + * + * @param from source Ustr + * @return this Ustr + */ + public Ustr sstrcpy(Ustr from) { + sstrcpy(s, base, from.s, from.base); + return this; + } + + /** + * Copy one null-terminated array to the end of another, with + * starting offsets for each + * + * @param to destination array + * @param tbase base pos of destination + * @param from source array + * @param fbase base pos of source + * @return destination + */ + public static byte [] strcat(byte [] to, int tbase, byte [] from, int fbase) { + while (to[tbase] != 0) + tbase++; + + while (from[fbase] != 0) + to[tbase++] = from[fbase++]; + to[tbase] = 0; + + return to; + } + + /** + * Copy one null-terminated byte array to the end of another. 
+ * + * @param to destination array + * @param from source array + * @return the destionation array + */ + public static byte [] strcat(byte [] to, byte [] from) { + return strcat(to, 0, from, 0); + } + + /** + * Append the contents of another Ustr to the end of this one + * + * @param other the other Ustr + * @return this Ustr + */ + public Ustr strcat(Ustr other) { + strcat(s, other.s); + return this; + } + + /** + * Compare two null-terminated byte arrays. The ordering is that of + * native Unicode code points and probably not culturally appropriate + * anywhere. + * + * @param s1 first byte array + * @param s2 second byte array + * @return a negative number, zero, or a positive number depending + * on whether s1 is lexically less than, equal to, or greater than s2. */ + public static int strcmp(byte [] s1, byte [] s2) { + return strcmp(s1, 0, s2, 0); + } + /** + * Compare sections of two null-terminated byte arrays. The ordering is + * that of + * native Unicode code points and probably not culturally appropriate + * anywhere. + * + * @param s1 first byte array + * @param s1base byte offset in first array to start comparing + * @param s2 second byte array + * @param s2base byte offset in second array to start comparing + * @return a negative number, zero, or a positive number depending on + * whether s1 is lexically less than, equal to, or greater than s2. + */ + public static int strcmp(byte [] s1, int s1base, byte [] s2, int s2base) { + + Ustr u1 = new Ustr(s1, s1base); + Ustr u2 = new Ustr(s2, s2base); + + int c1 = u1.nextChar(); + int c2 = u2.nextChar(); + + while (c1 != 0 && c2 != 0 && c1 == c2) { + c1 = u1.nextChar(); + c2 = u2.nextChar(); + } + + return c1 - c2; + } + /** + * Compare two Ustrs. The ordering is that of + * native Unicode code points and probably not culturally appropriate + * anywhere. 
+ * + * @param other the other Ustr + * @return a negative number, zero, or a positive number depending on + * whether the other is lexically less than, equal to, or greater than this. + */ + public int strcmp(Ustr other) { + return strcmp(s, base, other.s, other.base); + } + + /** + * Compare a Ustr to an object's String representation. The ordering + * is that of native Unicode code points and probably not culturally + * appropriate anywhere. + * + * @param other the other Object + * @return a negative number, zero, or a positive number depending on + * whether the other is lexically less than, equal to, or greater than this. + */ + public int strcmp(Object other) { + return strcmp(new Ustr(other)); + } + + /** + * Locate a Unicode character in a Ustr. Returns null if not + * found; if the character is zero, finds the offset of the null termination. + * + * @param c the character, as an integer + * @return a Ustr with the same buffer, starting at the matching character, + * or null if it's not found. + */ + public Ustr strchr(int c) { + int where = strchr(s, c); + return (where == -1) ? null : new Ustr(s, where); + } + + /** + * Find the offset where a Unicode character starts in a null-terminated + * UTF-encoded byte array. + * Returns -1 if not found; if the character is zero, finds the index of + * the null termination. + * + * @param b UTF-encoded null-terminated byte array + * @return the offset in the string, or -1 + */ + public static int strchr(byte [] b, int c) { + byte [] cbytes = new byte[10]; + appendChar(c, cbytes, 0); + return strstr(b, cbytes); + } + + /** + * Locate the last occurrence of a Unicode character in a Ustr. + * If found, returns a Ustr built around the same buffer as + * this, with the base set to the matching location. If not found, null + * + * @param c the character, as an integer + * @return a Ustr with the base set to the match, or null + */ + public Ustr strrchr(int c) { + int where = strrchr(s, c); + return (where == -1) ? 
null : new Ustr(s, where); + } + + /** + * Find the index of the last appearance of a Unicode character in a + * null-terminated UTF-encoded byte array. + * Returns -1 if not found. + * + * @param b the byte array + * @param c the integer + * @return the offset where the last occurence of c starts, or -1 + */ + public static int strrchr(byte [] b, int c) { + byte [] cbytes = new byte[10]; + appendChar(c, cbytes, 0); + + int where = b.length - strlen(cbytes); + while (where >= 0) { + int i; + for (i = 0; cbytes[i] != 0; i++) + if (b[where + i] != cbytes[i]) + break; + if (cbytes[i] == 0) + return where; + where--; + } + return -1; + } + + /** + * Locate a substring in a string. Returns a Ustr built around the same + * buffer, but starting at the matching position, or null if no match + * is found. + * + * @param little the substring to be located + * @return matching Ustr, or null + */ + public Ustr strstr(Ustr little) { + int where = strstr(s, little.s); + return (where == -1) ? null : new Ustr(s, where); + } + + /** + * locate a substring in a byte array. Returns the offset of the substring + * if it matches, otherwise -1. + * + * @param big the array to search in + * @param little the array to search for + * @return the index of the match, or -1 + */ + public static int strstr(byte [] big, byte [] little) { + // should BoyerMooreify this... + + for (int bi = 0; big[bi] != 0; bi++) { + int li; + for (li = 0; little[li] != 0; li++) + if (big[bi + li] != little[li]) + break; + if (little[li] == 0) + return bi; + } + return -1; + } + + ///////////////////////////////////////////////////////////////// + // From here on down the methods are those from java.lang.String + ///////////////////////////////////////////////////////////////// + + /** + * Returns a Ustr generated from the char array. 
+ * + * @param data the char array + * @return a new Ustr + */ + static Ustr copyValueOf(char [] data) { + return new Ustr(data); + } + + /** + * Returns a Ustr generated from a piece of the char array. + * + * @param data the char array + * @param offset where to start generating from + * @param count how many java chars to use + * @return a new Ustr + */ + static Ustr copyValueOf(char [] data, int offset, int count) { + char [] chunk = new char[count]; + for (int i = 0; i < count; i++) + chunk[i] = data[offset + i]; + return new Ustr(chunk); + } + + /** + * find the Unicode character at some index in a Ustr. Throws an + * IndexOutOfBounds exception if appropriate. + * + * @param at the index + * @return the Unicode character, as an integer + */ + public int charAt(int at) + throws IndexOutOfBoundsException { + if (at < 0) + throw new IndexOutOfBoundsException("Negative Ustr charAt"); + int c = 0; + offset = 0; + prepareNext(); + do { + c = nextChar(); + at--; + } while (c != 0 && at >= 0); + + if (at > 0) + throw new IndexOutOfBoundsException("Ustr charAt too large"); + return c; + } + + /** + * Append a String to the end of this. + * + * @param str the string + * @return a a new Ustr which contains the concatenation + */ + public Ustr concat(String str) { + Ustr us = new Ustr(str); + return concat(us); + } + + /** + * Append a Ustr to the end of this. + * + * @param us the ustr to append + * @return a new ustr + */ + public Ustr concat(Ustr us) { + Ustr ret = new Ustr(strlen() + us.strlen() + 1); + ret.strcpy(this); + ret.strcat(us); + return ret; + } + + /** + * Test if this Ustr ends with the specified suffix (a Ustr). + * + * @param suffix the possible suffix. + * @return true or false. + */ + public boolean endsWith(Ustr suffix) { + int start = strlen() - suffix.strlen(); + if (start < 0) + return false; + return (strcmp(s, base + start, suffix.s, suffix.base) == 0); + } + + /** + * Test if this Ustr ends with specified suffix (a String). 
+ * + * @param suffix the possible suffix + * @return true or false + */ + public boolean endsWith(String suffix) { + return endsWith(new Ustr(suffix)); + } + + /** + * Compares this Ustr to another object. + * + * @param anObject the other object + * @return true or false + */ + public boolean equals(Object anObject) { + return (compareTo(anObject) == 0); + } + + /** + * Convert this Ustr into bytes according to the platform's default + * character encoding, storing the result in a new byte array. + * + * @return a new byte array + */ + public byte [] getBytes() { + return toString().getBytes(); + } + + /** + * Convert this Ustr into bytes according to the specified + * character encoding, storing the result into a new byte array. + * + * @param enc the encoding to use in generating bytes + * @return the new byte array + */ + public byte [] getBytes(String enc) + throws java.io.UnsupportedEncodingException { + return toString().getBytes(enc); + } + + /** + * Copies Unicode characters from this String into the destination + * char array. Note that if the String contains UTF-16 surrogate + * pairs, each pair counts as a single character. + * + * @param str the string + * @param srcBegin where to start copying + * @param srcEnd index after last char to copy + * @param dst start of destination array + * @param dstBegin where in the destination array to start copying + */ + public static void getChars(String str, int srcBegin, int srcEnd, + char [] dst, int dstBegin) { + Ustr us = new Ustr(str); + us.getChars(srcBegin, srcEnd, dst, dstBegin); + } + + /** + * Copies Unicode characters from this Ustr into the destination + * char array. 
+     * We can't just dispatch to the String implementation
+     * because we do Unicode characters, it does UTF-16 code units.
+     * Indices count Unicode characters; a character at or above U+10000
+     * occupies two slots in the destination (a surrogate pair).
+     *
+     * @param srcBegin where to start copying
+     * @param srcEnd index after last char to copy
+     * @param dst start of destination array
+     * @param dstBegin where in the destination array to start copying
+     * @throws IndexOutOfBoundsException if the indices are inconsistent or
+     *         the requested range runs past the end of this Ustr
+     * @throws NullPointerException if dst is null
+     */
+    public void getChars(int srcBegin, int srcEnd, char [] dst, int dstBegin) {
+        if (srcBegin < 0 || srcBegin > srcEnd || dstBegin < 0) {
+            throw new IndexOutOfBoundsException("bogus getChars index bounds");
+        }
+        if (dst == null) {
+            throw new NullPointerException("null 'dst' argument to getChars");
+        }
+
+        // Work out the count up front.  It used to be computed after the
+        // skip loop had already counted srcBegin down to zero, so srcEnd
+        // characters were copied instead of srcEnd - srcBegin.
+        int howMany = srcEnd - srcBegin;
+
+        // move up to the first character to copy
+        prepareNext();
+        for (int skipped = 0; skipped < srcBegin; skipped++) {
+            nextChar();
+        }
+
+        // i counts Unicode characters consumed, j counts chars written
+        int j = 0;
+        for (int i = 0; i < howMany; i++, j++) {
+            int c = nextChar();
+            // hitting the terminator is tolerated only in the final position
+            if (c == 0 && i < howMany - 1) {
+                throw new IndexOutOfBoundsException("getChars ran off buffer");
+            }
+            if (c < 0x10000) {
+                dst[dstBegin + j] = (char) c;
+            } else {
+                // two UTF-16 code units
+                // 10346 => D800/DF46
+                // 000uuuuuxxxxxxxxxxxxxxxx 110110wwwwxxxxxx 110111xxxxxxxxxx
+                // where wwww = uuuuu - 1
+
+                c -= 0x10000;
+                int uHi = (c >> 10) & 0x3ff;
+                dst[dstBegin + j] = (char) (0xd800 | uHi);
+                j++;
+
+                int uLo = c & 0x3ff;
+                dst[dstBegin + j] = (char) (0xdc00 | uLo);
+            }
+        }
+    }
+
+    /**
+     * Returns a hashcode for this Ustr.  The algorithm is the one documented
+     * for String, c[0]*31^(n-1) + c[1]*31^(n-2) + ... + c[n-1], evaluated
+     * over the Unicode characters of this Ustr, with the sum truncated to
+     * 32 bits.
+     *
+     * @return an integer hashcode
+     */
+    public int hashCode() {
+        // Horner's rule: h = 31*h + c for each character.  int overflow
+        // wraps modulo 2^32, which is exactly the truncation applied to the
+        // polynomial sum, and it avoids recomputing 31^n from scratch for
+        // every character (which made hashing quadratic in the length).
+        int h = 0;
+        int c;
+        prepareNext();
+        while ((c = nextChar()) != 0) {
+            h = 31 * h + c;
+        }
+        return h;
+    }
+
+    /**
+     * Integer exponentiation by repeated multiplication.  Overflow wraps
+     * silently; a non-positive exponent yields 1.
+     *
+     * @param a the base
+     * @param b the exponent
+     * @return a raised to the power b, modulo 2^64
+     */
+    private static long pow(long a, long b) {
+        long p = 1;
+        while (b-- > 0) {
+            p *= a;
+        }
+        return p;
+    }
+
+    /**
+     * Returns the first index within this Ustr of the specified
+     * Unicode character.
+     *
+     * @param ch the character
+     * @return index (usable by charAt) in the string of the char, or -1
+     */
+    public int indexOf(int ch) {
+        return indexOf(ch, 0);
+    }
+
+    /**
+     * Returns the first index within this Ustr of the specified
+     * character, starting at the specified index.  Searching for 0 finds
+     * the terminator, i.e. returns the number of characters in the Ustr.
+     *
+     * @param ch the character
+     * @param start where to start looking
+     * @return index (usable by charAt) in the string of the char, or -1
+     */
+    public int indexOf(int ch, int start) {
+        int i = 0;
+        prepareNext();
+        while (start-- > 0) {
+            // start lies beyond the terminator: nothing left to search, and
+            // reading on would walk past the end of the string
+            if (nextChar() == 0) {
+                return -1;
+            }
+            i++;
+        }
+        int c;
+        while ((c = nextChar()) != 0) {
+            if (c == ch) {
+                return i;
+            }
+            i++;
+        }
+        // the loop stopped on the terminator, which sits at index i
+        if (ch == 0) {
+            return i;
+        }
+        return -1;
+    }
+
+    /**
+     * Returns the index within this Ustr of the first occurrence of the
+     * specified other Ustr, or -1.
+     *
+     * @param us the other Ustr
+     * @return the index of the match, or -1
+     */
+    public int indexOf(Ustr us) {
+        return indexOf(us, 0);
+    }
+
+    /**
+     * Returns the index within this Ustr of the first occurrence of the
+     * specified other Ustr starting at the given offset, or -1.
+     *
+     * @param us the other Ustr
+     * @param start the index to start looking
+     * @return the index of the match, or -1
+     */
+    public int indexOf(Ustr us, int start) {
+        int i = 0;
+        prepareNext();
+        while (start-- > 0) {
+            // start lies beyond the terminator: no match is possible
+            if (nextChar() == 0) {
+                return -1;
+            }
+            i++;
+        }
+
+        // we'll work at the UTF level, but this should be BoyerMoore-ized
+        do {
+            // compare bytes of the pattern against our bytes at the
+            // current iteration offset
+            int j;
+            for (j = 0; s[base + offset + j] != 0 && us.s[us.base + j] != 0; j++) {
+                if (s[base + offset + j] != us.s[us.base + j]) {
+                    break;
+                }
+            }
+            // Reaching the pattern's terminator means every byte matched.
+            // The pattern must be indexed from its own base, not ours.
+            if (us.s[us.base + j] == 0) {
+                return i;
+            }
+            i++;
+        } while (nextChar() != 0);
+
+        return -1;
+    }
+
+    /**
+     * Returns a canonical version of the Ustr, which should be treated as
+     * read-only.  Differs from the intern function
+     * of String in that it never returns the input string; if a new hashtable
+     * entry is required, it makes a new Ustr and returns that.  If a programmer
+     * updates the contents of a Ustr returned from intern(), grave disorder
+     * will ensue.
+     *
+     * @return the canonical version of the Ustr.
+     */
+    public Ustr intern() {
+        Ustr u = interns.get(this);
+        if (u != null) {
+            return u;
+        }
+
+        // not seen before: register a private copy, never the caller's object
+        u = new Ustr(strlen() + 1);
+        u.strcpy(this);
+        interns.put(u, u);
+        return u;
+    }
+
+    /**
+     * Returns the index within this Ustr of the last occurrence of the
+     * specified Unicode character.
+     *
+     * @param ch the character
+     * @return the last index of the character, or -1
+     */
+    public int lastIndexOf(int ch) {
+        return lastIndexOf(ch, length());
+    }
+
+    /**
+     * Returns the index within this Ustr of the last occurrence of the
+     * specified Unicode character at or before the specified stop index.
+     * The scan runs forward from the start and remembers the latest hit.
+     * Searching for 0 can find the terminator.
+     *
+     * @param ch the character
+     * @param stop last index to consider
+     * @return the last index of the character, or -1
+     */
+    public int lastIndexOf(int ch, int stop) {
+        // a negative stop leaves no index to consider
+        if (stop < 0) {
+            return -1;
+        }
+        int foundAt = -1;
+        prepareNext();
+        for (int i = 0; i <= stop; i++) {
+            int c = nextChar();
+            if (c == ch) {
+                foundAt = i;
+            }
+            // never read past the terminator, even if stop is larger
+            if (c == 0) {
+                break;
+            }
+        }
+
+        return foundAt;
+    }
+
+    /**
+     * Finds the last substring match.
+     *
+     * @param us the substring to search for
+     * @return the match index, or -1
+     */
+    public int lastIndexOf(Ustr us) {
+        return lastIndexOf(us, length());
+    }
+
+    /**
+     * Finds the last substring match starting at or before the given index.
+     * The scan runs forward from the start and remembers the latest match.
+     *
+     * @param us the substring to search for
+     * @param stop where to stop searching
+     * @return the match index, or -1
+     */
+    public int lastIndexOf(Ustr us, int stop) {
+        int i = 0;
+        int foundAt = -1;
+
+        // we'll work at the UTF level, but this should be BoyerMoore-ized
+        prepareNext();
+        do {
+            int j;
+            for (j = 0; s[base + offset + j] != 0 && us.s[us.base + j] != 0; j++) {
+                if (s[base + offset + j] != us.s[us.base + j]) {
+                    break;
+                }
+            }
+            // Reaching the pattern's terminator means every byte matched.
+            // The pattern must be indexed from its own base, not ours.
+            if (us.s[us.base + j] == 0) {
+                foundAt = i;
+            }
+            i++;
+        } while (nextChar() != 0 && i <= stop);
+
+        return foundAt;
+    }
+
+    /**
+     * Number of bytes needed to encode a Unicode character in UTF-8.
+     *
+     * @param c the Unicode character
+     * @return 1, 2, 3 or 4
+     */
+    private static int bytesInChar(int c) {
+        if (c < 128) {
+            return 1;
+        } else if (c < 0x800) {
+            return 2;
+        } else if (c < 0x10000) {
+            return 3;
+        } else {
+            return 4;
+        }
+    }
+
+    /**
+     * Returns a new Ustr with all instances of one Unicode character replaced
+     * by another.  Throws a UstrException if newChar
+     * is not a Unicode codepoint (negative or >0x10ffff).
+     *
+     * @param oldChar the Unicode character to be replaced
+     * @param newChar the Unicode character to replace it with
+     * @return the new Ustr
+     * @throws UstrException if newChar is outside the Unicode range
+     */
+    public Ustr replace(int oldChar, int newChar) {
+        if (newChar < 0) {
+            throw new UstrException("Negative replacement character");
+        } else if (newChar > 0x10ffff) {
+            throw new UstrException("Replacement character > 0x10ffff");
+        }
+
+        // figure out how much space we need: each replacement changes the
+        // encoded size by the difference between the two characters' UTF-8
+        // lengths.  (This used to subtract newChar's length from itself, so
+        // the buffer was never resized.)
+        int space = strlen() + 1;
+        int delta = bytesInChar(newChar) - bytesInChar(oldChar);
+        if (delta != 0) {
+            int c;
+
+            // start the count from the first character
+            prepareNext();
+            while ((c = nextChar()) != 0) {
+                if (c == oldChar) {
+                    space += delta;
+                }
+            }
+        }
+
+        Ustr us = new Ustr(space);
+        prepareNext();
+        us.prepareAppend();
+        int c;
+        while ((c = nextChar()) != 0) {
+            us.appendChar((c == oldChar) ? newChar : c);
+        }
+        return us;
+    }
+
+    /**
+     * Tests if other Ustr is prefix of this.
+     *
+     * @param us the other Ustr
+     * @return true/false
+     */
+    public boolean startsWith(Ustr us) {
+        return startsWith(us, 0);
+    }
+
+    /**
+     * Tests if other Ustr is prefix at given index.
+     *
+     * @param us the other Ustr
+     * @param start where to test
+     * @return true/false
+     */
+    public boolean startsWith(Ustr us, int start) {
+        // move up to the character index where the comparison begins
+        prepareNext();
+        while (start-- > 0) {
+            nextChar();
+        }
+
+        // Walk the candidate prefix byte by byte until its terminator.  It
+        // must be indexed from its own base, not ours.  If we run out first,
+        // our terminator differs from its non-zero byte and we return false.
+        for (int i = 0; us.s[us.base + i] != 0; i++) {
+            if (s[base + offset + i] != us.s[us.base + i]) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Makes a new substring of a Ustr given a start index.
+     *
+     * @param start index of start of substr
+     * @return new Ustr containing substr
+     */
+    public Ustr substring(int start) {
+        return substring(start, length());
+    }
+
+    /**
+     * Makes a new substring of a Ustr identified by start and end
+     * indices.  Indices count Unicode characters, not bytes.  Note that
+     * this call moves the iteration offset of this Ustr.
+     *
+     * @param start index of start of substr
+     * @param end index of end of substr
+     * @return new Ustr containing substr
+     * @throws IndexOutOfBoundsException if the indices are inconsistent or
+     *         run past the end of this Ustr
+     */
+    public Ustr substring(int start, int end) {
+        if (start < 0 || end < start || end > length()) {
+            throw new IndexOutOfBoundsException("bogus start/end");
+        }
+
+        int howMany = end - start;
+        offset = 0;
+
+        // move up to the start, stepping by the encoded length that each
+        // lead byte announces
+        while (start-- > 0) {
+            int c = s[base + offset] & 0xff;
+            if (c == 0) {
+                throw new IndexOutOfBoundsException("substring too long");
+            }
+            offset += encLength[c];
+        }
+
+        // measure how many bytes the requested characters occupy
+        int startAt = offset;
+        for (int i = 0; i < howMany; i++) {
+            int c = s[base + offset] & 0xff;
+            if (c == 0) {
+                throw new IndexOutOfBoundsException("substring too long");
+            }
+            offset += encLength[c];
+        }
+        int bytesToMove = offset - startAt;
+        Ustr us = new Ustr(bytesToMove + 1);
+        // startAt is relative to base, exactly like the reads above, so the
+        // copy has to begin at base + startAt
+        System.arraycopy(s, base + startAt, us.s, 0, bytesToMove);
+        us.s[bytesToMove] = 0;
+
+        return us;
+    }
+
+    /**
+     * Converts a Ustr to a char array.  The conversion goes through
+     * toString().
+     *
+     * @return the new char array
+     */
+    public char [] toCharArray() {
+        return toString().toCharArray();
+    }
+}
diff --git a/src/org/jruby/util/string/UstrException.java b/src/org/jruby/util/string/UstrException.java
new file mode 100644
index 00000000000..cc486931794
--- /dev/null
+++ b/src/org/jruby/util/string/UstrException.java
@@ -0,0 +1,7 @@
+package org.jruby.util.string;
+
+public class UstrException extends java.lang.RuntimeException {
+    public UstrException(String message) {
+        super(message);
+    }
+}