String in C is now UTF8 encoded

Primitives for head/tail/index/cons/reverse/length now all assume the char* is UTF8 encoded. Also updated generation of literals to encode as UTF8. Primitives are probably not as efficient as they could be (though some of the will be used rarely) ASCII strings will work exactly as before. Everything I know about UTF8 encoding has been learned in the past few hours. Therefore, this is unlikely to be the best way to do this. Please educate me, ideally in the form of annotated Pull Requests :).
idris-lang · Mar 28, 2015 · c413280 · c413280
1 parent d0b8bb5
commit c413280
Show file tree

Hide file tree

Showing 10 changed files with 283 additions and 29 deletions.
diff --git a/idris.cabal b/idris.cabal
@@ -342,6 +342,9 @@ Extra-source-files:
                        test/basic012/run
                        test/basic012/*.idr
                        test/basic012/expected
+                       test/basic013/run
+                       test/basic013/*.idr
+                       test/basic013/expected
 
                        test/buffer001-disabled/*.idr
                        test/buffer001-disabled/run

diff --git a/rts/Makefile b/rts/Makefile
@@ -1,9 +1,10 @@
 include ../config.mk
 
 OBJS = idris_rts.o idris_heap.o idris_gc.o idris_gmp.o idris_bitstring.o \
-       idris_opts.o idris_stats.o mini-gmp.o
+       idris_opts.o idris_stats.o idris_utf8.o mini-gmp.o
 HDRS = idris_rts.h idris_heap.h idris_gc.h idris_gmp.h idris_bitstring.h \
-       idris_opts.h idris_stats.h mini-gmp.h idris_stdfgn.h idris_net.h
+       idris_opts.h idris_stats.h mini-gmp.h idris_stdfgn.h idris_net.h \
+       idris_utf8.h
 CFLAGS:=-fPIC $(CFLAGS)
 CFLAGS += $(GMP_INCLUDE_DIR) $(GMP) -DIDRIS_TARGET_OS="\"$(OS)\""
 CFLAGS += -DIDRIS_TARGET_TRIPLE="\"$(MACHINE)\""

diff --git a/rts/idris_rts.c b/rts/idris_rts.c
@@ -2,6 +2,7 @@
 
 #include "idris_rts.h"
 #include "idris_gc.h"
+#include "idris_utf8.h"
 #include "idris_bitstring.h"
 
 #ifdef HAS_PTHREAD
@@ -596,7 +597,7 @@ VAL idris_streq(VM* vm, VAL l, VAL r) {
 }
 
 VAL idris_strlen(VM* vm, VAL l) {
-    return MKINT((i_int)(strlen(GETSTR(l))));
+    return MKINT((i_int)(idris_utf8_strlen(GETSTR(l))));
 }
 
 #define BUFSIZE 256
@@ -647,7 +648,7 @@ VAL idris_readStr(VM* vm, FILE* h) {
 }
 
 VAL idris_strHead(VM* vm, VAL str) {
-    return MKINT((i_int)(GETSTR(str)[0]));
+    return idris_strIndex(vm, str, 0);
 }
 
 VAL MKSTROFFc(VM* vm, StrOffset* off) {
@@ -679,42 +680,50 @@ VAL idris_strTail(VM* vm, VAL str) {
         }
 
         cl->info.str_offset->str = root;
-        cl->info.str_offset->offset = offset+1;
+        cl->info.str_offset->offset = offset+idris_utf8_charlen(GETSTR(str));
 
         return cl;
     } else {
-        return MKSTR(vm, GETSTR(str)+1);
+        char* nstr = GETSTR(str);
+        return MKSTR(vm, nstr+idris_utf8_charlen(nstr));
     }
 }
 
 VAL idris_strCons(VM* vm, VAL x, VAL xs) {
     char *xstr = GETSTR(xs);
-    Closure* cl = allocate(sizeof(Closure) +
-                           strlen(xstr) + 2, 0);
-    SETTY(cl, STRING);
-    cl -> info.str = (char*)cl + sizeof(Closure);
-    cl -> info.str[0] = (char)(GETINT(x));
-    strcpy(cl -> info.str+1, xstr);
-    return cl;
+    int xval = GETINT(x);
+    if ((xval & 0x80) == 0) { // ASCII char
+        Closure* cl = allocate(sizeof(Closure) +
+                               strlen(xstr) + 2, 0);
+        SETTY(cl, STRING);
+        cl -> info.str = (char*)cl + sizeof(Closure);
+        cl -> info.str[0] = (char)(GETINT(x));
+        strcpy(cl -> info.str+1, xstr);
+        return cl;
+    } else {
+        char *init = idris_utf8_fromChar(xval);
+        Closure* cl = allocate(sizeof(Closure) + strlen(init) + strlen(xstr) + 1, 0);
+        SETTY(cl, STRING);
+        cl -> info.str = (char*)cl + sizeof(Closure);
+        strcpy(cl -> info.str, init);
+        strcat(cl -> info.str, xstr);
+        free(init);
+        return cl;
+    }
 }
 
 VAL idris_strIndex(VM* vm, VAL str, VAL i) {
-    return MKINT((i_int)(GETSTR(str)[GETINT(i)]));
+    int idx = idris_utf8_index(GETSTR(str), GETINT(i));
+    return MKINT((i_int)idx);
 }
 
 VAL idris_strRev(VM* vm, VAL str) {
     char *xstr = GETSTR(str);
     Closure* cl = allocate(sizeof(Closure) +
                            strlen(xstr) + 1, 0);
     SETTY(cl, STRING);
-    cl -> info.str = (char*)cl + sizeof(Closure);
-    int y = 0;
-    int x = strlen(xstr);
-
-    cl-> info.str[x+1] = '\0';
-    while(x>0) {
-        cl -> info.str[y++] = xstr[--x];
-    }
+    cl->info.str = (char*)cl + sizeof(Closure);
+    idris_utf8_rev(xstr, cl->info.str);
     return cl;
 }
 

diff --git a/rts/idris_rts.h b/rts/idris_rts.h
@@ -335,6 +335,8 @@ VAL idris_readStr(VM* vm, FILE* h);
 
 VAL idris_strHead(VM* vm, VAL str);
 VAL idris_strTail(VM* vm, VAL str);
+// This is not expected to be efficient! Mostly we wouldn't expect to call
+// it at all at run time.
 VAL idris_strCons(VM* vm, VAL x, VAL xs);
 VAL idris_strIndex(VM* vm, VAL str, VAL i);
 VAL idris_strRev(VM* vm, VAL str);

diff --git a/rts/idris_utf8.c b/rts/idris_utf8.c
@@ -0,0 +1,144 @@
+#include "idris_utf8.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+int idris_utf8_strlen(char *s) {
+   int i = 0, j = 0;
+   while (s[i]) {
+     if ((s[i] & 0xc0) != 0x80) j++;
+     i++;
+   }
+   return j;
+}
+
+int idris_utf8_charlen(char* s) {
+    int init = (int)s[0];
+    if ((init & 0x80) == 0) {
+        return 1; // Top bit unset, so 1 byte
+    }
+    if ((init > 244 && init < 256) ||
+        (init == 192) ||
+        (init == 193)) {
+        return 1; // Invalid characters
+    }
+    int i = 1;
+    while ((s[i] & 0xc0) == 0x80) {
+        i++; // Move on until top 2 bits are not 10
+    }
+    return i;
+}
+
+unsigned idris_utf8_index(char* s, int idx) {
+   int i = 0, j = 0;
+   while (j < idx) {
+     if ((s[i] & 0xc0) != 0x80) j++;
+     i++;
+   }
+   // Find the start of the next character
+   while ((s[i] & 0xc0) == 0x80) { i++; }
+
+   unsigned bytes = 0;
+   unsigned top = 0;
+
+   int init = (int)s[1];
+
+   // s[i] is now the start of the character we want
+   if ((s[i] & 0x80) == 0) {
+       bytes = 1;
+       top = (int)(s[i]);
+   } else if ((init > 244 && init < 256) ||
+              (init == 192) ||
+              (init == 193)) {
+       bytes = 1;
+       top = (int)(s[i]); // Invalid characters
+   } else if ((s[i] & 0xe0) == 0xc0) {
+       bytes = 2;
+       top = (int)(s[i] & 0x1f); // 5 bits
+   } else if ((s[i] & 0xf0) == 0xe0) {
+       bytes = 3;
+       top = (int)(s[i] & 0x0f); // 4 bits
+   } else if ((s[i] & 0xf8) == 0xf0) {
+       bytes = 4;
+       top = (int)(s[i] & 0x07); // 3 bits
+   } else if ((s[i] & 0xfc) == 0xf8) {
+       bytes = 5;
+       top = (int)(s[i] & 0x03); // 2 bits
+   } else if ((s[i] & 0xfe) == 0xfc) {
+       bytes = 6;
+       top = (int)(s[i] & 0x01); // 1 bits
+   }
+
+   while (bytes > 1) {
+       top = top << 6;
+       top += s[++i] & 0x3f; // 6 bits
+       --bytes;
+   }
+
+   return top;
+}
+
+char* idris_utf8_fromChar(int x) {
+    char* str;
+    int bytes = 0, top = 0;
+
+    if ((x & 0x80) == 0) {
+        str = malloc(2);
+        str[0] = (char)x;
+        str[1] = '\0';
+        return str;
+    }
+
+    if (x >= 0x80 && x <= 0x7ff) {
+        bytes = 2;
+        top = 0xc0;
+    } else if (x >= 0x800 && x <= 0xffff) {
+        bytes = 3;
+        top = 0xe0;
+    } else if (x >= 0x10000 && x <= 0x10ffff) {
+        bytes = 4;
+        top = 0xf0;
+    }
+
+    str = malloc(bytes + 1);
+    str[bytes] = '\0';
+    while(bytes > 0) {
+        int xbits = x & 0x3f; // Next 6 bits
+        bytes--;
+        if (bytes > 0) {
+            str[bytes] = (char)xbits + 0x80;
+        } else {
+            str[0] = (char)xbits + top;
+        }
+        x = x >> 6;
+    }
+
+    return str;
+}
+
+void reverse_range(char *start, char *end)
+{
+    while(start < end)
+    {
+        char c = *start;
+        *start++ = *end;
+        *end-- = c;
+    }
+}
+
+char* reverse_char(char *start)
+{
+    char *end = start;
+    while((end[1] & 0xc0) == 0x80) { end++; }
+    reverse_range(start, end);
+    return(end + 1);
+}
+
+char* idris_utf8_rev(char* s, char* result) {
+    strcpy(result, s);
+    char* end = result;
+    while(*end) { end = reverse_char(end); }
+    reverse_range(result, end-1);
+    return result;
+}
+
diff --git a/rts/idris_utf8.h b/rts/idris_utf8.h
@@ -0,0 +1,22 @@
+#ifndef _IDRIS_UTF8
+#define _IDRIS_UTF8
+
+/* Various functions for dealing with UTF8 encoding. These are probably
+   not very efficient (and I'm (EB) making no guarantees about their
+   correctness.) Nevertheless, they mean that we can treat Strings as
+   UFT8. Patches welcome :). */
+
+// Get length of a UTF8 encoded string in characters
+int idris_utf8_strlen(char *s);
+// Get number of bytes the first character takes in a string
+int idris_utf8_charlen(char* s);
+// Return int representation of string at an index.
+// Assumes in bounds.
+unsigned idris_utf8_index(char* s, int j);
+// Convert a char as an integer to a char* as a byte sequence
+// Null terminated; caller responsible for freeing
+char* idris_utf8_fromChar(int x);
+// Reverse a UTF8 encoded string, putting the result in 'result'
+char* idris_utf8_rev(char* s, char* result);
+
+#endif
diff --git a/src/IRTS/CodegenC.hs b/src/IRTS/CodegenC.hs
@@ -12,6 +12,7 @@ import Util.System
 
 import Numeric
 import Data.Char
+import Data.Bits
 import Data.List (intercalate)
 import qualified Data.Vector.Unboxed as V
 import System.Process
@@ -145,8 +146,26 @@ showCStr s = '"' : foldr ((++) . showChar) "\"" s
         | ord c < 0x10  = "\"\"\\x0" ++ showHex (ord c) "\"\""
         | ord c < 0x20  = "\"\"\\x"  ++ showHex (ord c) "\"\""
         | ord c < 0x7f  = [c]    -- 0x7f = \DEL
-        | ord c < 0x100 = "\"\"\\x"  ++ showHex (ord c) "\"\""
-        | otherwise = error $ "non-8-bit character in string literal: " ++ show c
+        | otherwise = showHexes (utf8bytes (ord c))
+
+    utf8bytes :: Int -> [Int]
+    utf8bytes x = let (h : bytes) = split [] x in
+                      headHex h (length bytes) : map toHex bytes
+      where
+        split acc 0 = acc
+        split acc x = let xbits = x .&. 0x3f 
+                          xrest = shiftR x 6 in
+                          split (xbits : acc) xrest
+
+        headHex h 1 = h + 0xc0
+        headHex h 2 = h + 0xe0
+        headHex h 3 = h + 0xf0
+        headHex h n = error "Can't happen: Invalid UTF8 character"
+
+        toHex i = i + 0x80
+
+    showHexes = foldr ((++) . showUTF8) ""
+    showUTF8 c = "\"\"\\x" ++ showHex c "\"\""
 
 bcc :: Int -> BC -> String
 bcc i (ASSIGN l r) = indent i ++ creg l ++ " = " ++ creg r ++ ";\n"
@@ -424,11 +443,6 @@ doOp v (LSLe (ATInt ITBig)) [l, r] = v ++ "idris_bigLe(vm, " ++ creg l ++ ", " +
 doOp v (LSGt (ATInt ITBig)) [l, r] = v ++ "idris_bigGt(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
 doOp v (LSGe (ATInt ITBig)) [l, r] = v ++ "idris_bigGe(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
 
-doOp v LStrConcat [l,r] = v ++ "idris_concat(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
-doOp v LStrLt [l,r] = v ++ "idris_strlt(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
-doOp v LStrEq [l,r] = v ++ "idris_streq(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
-doOp v LStrLen [x] = v ++ "idris_strlen(vm, " ++ creg x ++ ")"
-
 doOp v (LIntFloat ITNative) [x] = v ++ "idris_castIntFloat(" ++ creg x ++ ")"
 doOp v (LFloatInt ITNative) [x] = v ++ "idris_castFloatInt(" ++ creg x ++ ")"
 doOp v (LSExt ITNative ITBig) [x] = v ++ "idris_castIntBig(vm, " ++ creg x ++ ")"
@@ -529,11 +543,18 @@ doOp v LFFloor [x] = v ++ flUnOp "floor" (creg x)
 doOp v LFCeil [x] = v ++ flUnOp "ceil" (creg x)
 doOp v LFNegate [x] = v ++ "MKFLOAT(vm, -GETFLOAT(" ++ (creg x) ++ "))"
 
+-- String functions which don't need to know we're UTF8
+doOp v LStrConcat [l,r] = v ++ "idris_concat(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
+doOp v LStrLt [l,r] = v ++ "idris_strlt(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
+doOp v LStrEq [l,r] = v ++ "idris_streq(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
+
+-- String functions which need to know we're UTF8
 doOp v LStrHead [x] = v ++ "idris_strHead(vm, " ++ creg x ++ ")"
 doOp v LStrTail [x] = v ++ "idris_strTail(vm, " ++ creg x ++ ")"
 doOp v LStrCons [x, y] = v ++ "idris_strCons(vm, " ++ creg x ++ "," ++ creg y ++ ")"
 doOp v LStrIndex [x, y] = v ++ "idris_strIndex(vm, " ++ creg x ++ "," ++ creg y ++ ")"
 doOp v LStrRev [x] = v ++ "idris_strRev(vm, " ++ creg x ++ ")"
+doOp v LStrLen [x] = v ++ "idris_strlen(vm, " ++ creg x ++ ")"
 
 doOp v LAllocate [x] = v ++ "idris_buffer_allocate(vm, " ++ creg x ++ ")"
 doOp v LAppendBuffer [a, b, c, d, e, f] = v ++ "idris_appendBuffer(vm, " ++ creg a ++ "," ++ creg b ++ "," ++ creg c ++ "," ++ creg d ++ "," ++ creg e ++ "," ++ creg f ++ ")"

diff --git a/test/basic013/basic013.idr b/test/basic013/basic013.idr
@@ -0,0 +1,34 @@
+foo : String
+foo = "λx→x"
+
+bar : String
+bar = "λx→x"
+
+baz : Char
+baz = 'λ'
+
+quux : String
+quux = "\x0a\x80\xC9\xFF\n3\n4"
+
+appMany : Nat -> String
+appMany Z = foo
+appMany (S k) = bar ++ appMany k
+
+main : IO ()
+main = do putStrLn foo
+          putStrLn (foo ++ bar)
+          putStrLn (reverse (foo ++ bar))
+          printLn (length foo)
+          printLn baz
+          let x = 4
+          let newstr = appMany (toNat x)
+          putStrLn newstr
+          printLn (strHead newstr)
+          printLn (length newstr)
+          printLn (strIndex newstr 4)
+          putStrLn (strCons (strIndex newstr 4) "")
+          putStrLn ("Tail: " ++ strTail newstr)
+          putStrLn ("Tail Tail: " ++ strTail (strTail newstr))
+          putStrLn ("Cons: " ++ strCons 'λ' newstr)
+          putStrLn ("Reverse: " ++ reverse newstr)
+