Skip to content

Commit

Permalink
String in C is now UTF8 encoded
Browse files Browse the repository at this point in the history
Primitives for head/tail/index/cons/reverse/length now all assume the
char* is UTF8 encoded.  Also updated generation of literals to encode as
UTF8.  Primitives are probably not as efficient as they could be (though
some of the will be used rarely)

ASCII strings will work exactly as before.

Everything I know about UTF8 encoding has been learned in the past few
hours. Therefore, this is unlikely to be the best way to do this. Please
educate me, ideally in the form of annotated Pull Requests :).
  • Loading branch information
edwinb committed Mar 28, 2015
1 parent d0b8bb5 commit c413280
Show file tree
Hide file tree
Showing 10 changed files with 283 additions and 29 deletions.
3 changes: 3 additions & 0 deletions idris.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,9 @@ Extra-source-files:
test/basic012/run
test/basic012/*.idr
test/basic012/expected
test/basic013/run
test/basic013/*.idr
test/basic013/expected

test/buffer001-disabled/*.idr
test/buffer001-disabled/run
Expand Down
5 changes: 3 additions & 2 deletions rts/Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
include ../config.mk

OBJS = idris_rts.o idris_heap.o idris_gc.o idris_gmp.o idris_bitstring.o \
idris_opts.o idris_stats.o mini-gmp.o
idris_opts.o idris_stats.o idris_utf8.o mini-gmp.o
HDRS = idris_rts.h idris_heap.h idris_gc.h idris_gmp.h idris_bitstring.h \
idris_opts.h idris_stats.h mini-gmp.h idris_stdfgn.h idris_net.h
idris_opts.h idris_stats.h mini-gmp.h idris_stdfgn.h idris_net.h \
idris_utf8.h
CFLAGS:=-fPIC $(CFLAGS)
CFLAGS += $(GMP_INCLUDE_DIR) $(GMP) -DIDRIS_TARGET_OS="\"$(OS)\""
CFLAGS += -DIDRIS_TARGET_TRIPLE="\"$(MACHINE)\""
Expand Down
49 changes: 29 additions & 20 deletions rts/idris_rts.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "idris_rts.h"
#include "idris_gc.h"
#include "idris_utf8.h"
#include "idris_bitstring.h"

#ifdef HAS_PTHREAD
Expand Down Expand Up @@ -596,7 +597,7 @@ VAL idris_streq(VM* vm, VAL l, VAL r) {
}

VAL idris_strlen(VM* vm, VAL l) {
return MKINT((i_int)(strlen(GETSTR(l))));
return MKINT((i_int)(idris_utf8_strlen(GETSTR(l))));
}

#define BUFSIZE 256
Expand Down Expand Up @@ -647,7 +648,7 @@ VAL idris_readStr(VM* vm, FILE* h) {
}

VAL idris_strHead(VM* vm, VAL str) {
return MKINT((i_int)(GETSTR(str)[0]));
return idris_strIndex(vm, str, 0);
}

VAL MKSTROFFc(VM* vm, StrOffset* off) {
Expand Down Expand Up @@ -679,42 +680,50 @@ VAL idris_strTail(VM* vm, VAL str) {
}

cl->info.str_offset->str = root;
cl->info.str_offset->offset = offset+1;
cl->info.str_offset->offset = offset+idris_utf8_charlen(GETSTR(str));

return cl;
} else {
return MKSTR(vm, GETSTR(str)+1);
char* nstr = GETSTR(str);
return MKSTR(vm, nstr+idris_utf8_charlen(nstr));
}
}

VAL idris_strCons(VM* vm, VAL x, VAL xs) {
char *xstr = GETSTR(xs);
Closure* cl = allocate(sizeof(Closure) +
strlen(xstr) + 2, 0);
SETTY(cl, STRING);
cl -> info.str = (char*)cl + sizeof(Closure);
cl -> info.str[0] = (char)(GETINT(x));
strcpy(cl -> info.str+1, xstr);
return cl;
int xval = GETINT(x);
if ((xval & 0x80) == 0) { // ASCII char
Closure* cl = allocate(sizeof(Closure) +
strlen(xstr) + 2, 0);
SETTY(cl, STRING);
cl -> info.str = (char*)cl + sizeof(Closure);
cl -> info.str[0] = (char)(GETINT(x));
strcpy(cl -> info.str+1, xstr);
return cl;
} else {
char *init = idris_utf8_fromChar(xval);
Closure* cl = allocate(sizeof(Closure) + strlen(init) + strlen(xstr) + 1, 0);
SETTY(cl, STRING);
cl -> info.str = (char*)cl + sizeof(Closure);
strcpy(cl -> info.str, init);
strcat(cl -> info.str, xstr);
free(init);
return cl;
}
}

VAL idris_strIndex(VM* vm, VAL str, VAL i) {
return MKINT((i_int)(GETSTR(str)[GETINT(i)]));
int idx = idris_utf8_index(GETSTR(str), GETINT(i));
return MKINT((i_int)idx);
}

VAL idris_strRev(VM* vm, VAL str) {
char *xstr = GETSTR(str);
Closure* cl = allocate(sizeof(Closure) +
strlen(xstr) + 1, 0);
SETTY(cl, STRING);
cl -> info.str = (char*)cl + sizeof(Closure);
int y = 0;
int x = strlen(xstr);

cl-> info.str[x+1] = '\0';
while(x>0) {
cl -> info.str[y++] = xstr[--x];
}
cl->info.str = (char*)cl + sizeof(Closure);
idris_utf8_rev(xstr, cl->info.str);
return cl;
}

Expand Down
2 changes: 2 additions & 0 deletions rts/idris_rts.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,8 @@ VAL idris_readStr(VM* vm, FILE* h);

VAL idris_strHead(VM* vm, VAL str);
VAL idris_strTail(VM* vm, VAL str);
// This is not expected to be efficient! Mostly we wouldn't expect to call
// it at all at run time.
VAL idris_strCons(VM* vm, VAL x, VAL xs);
VAL idris_strIndex(VM* vm, VAL str, VAL i);
VAL idris_strRev(VM* vm, VAL str);
Expand Down
144 changes: 144 additions & 0 deletions rts/idris_utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#include "idris_utf8.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int idris_utf8_strlen(char *s) {
int i = 0, j = 0;
while (s[i]) {
if ((s[i] & 0xc0) != 0x80) j++;
i++;
}
return j;
}

int idris_utf8_charlen(char* s) {
int init = (int)s[0];
if ((init & 0x80) == 0) {
return 1; // Top bit unset, so 1 byte
}
if ((init > 244 && init < 256) ||
(init == 192) ||
(init == 193)) {
return 1; // Invalid characters
}
int i = 1;
while ((s[i] & 0xc0) == 0x80) {
i++; // Move on until top 2 bits are not 10
}
return i;
}

unsigned idris_utf8_index(char* s, int idx) {
int i = 0, j = 0;
while (j < idx) {
if ((s[i] & 0xc0) != 0x80) j++;
i++;
}
// Find the start of the next character
while ((s[i] & 0xc0) == 0x80) { i++; }

unsigned bytes = 0;
unsigned top = 0;

int init = (int)s[1];

// s[i] is now the start of the character we want
if ((s[i] & 0x80) == 0) {
bytes = 1;
top = (int)(s[i]);
} else if ((init > 244 && init < 256) ||
(init == 192) ||
(init == 193)) {
bytes = 1;
top = (int)(s[i]); // Invalid characters
} else if ((s[i] & 0xe0) == 0xc0) {
bytes = 2;
top = (int)(s[i] & 0x1f); // 5 bits
} else if ((s[i] & 0xf0) == 0xe0) {
bytes = 3;
top = (int)(s[i] & 0x0f); // 4 bits
} else if ((s[i] & 0xf8) == 0xf0) {
bytes = 4;
top = (int)(s[i] & 0x07); // 3 bits
} else if ((s[i] & 0xfc) == 0xf8) {
bytes = 5;
top = (int)(s[i] & 0x03); // 2 bits
} else if ((s[i] & 0xfe) == 0xfc) {
bytes = 6;
top = (int)(s[i] & 0x01); // 1 bits
}

while (bytes > 1) {
top = top << 6;
top += s[++i] & 0x3f; // 6 bits
--bytes;
}

return top;
}

char* idris_utf8_fromChar(int x) {
char* str;
int bytes = 0, top = 0;

if ((x & 0x80) == 0) {
str = malloc(2);
str[0] = (char)x;
str[1] = '\0';
return str;
}

if (x >= 0x80 && x <= 0x7ff) {
bytes = 2;
top = 0xc0;
} else if (x >= 0x800 && x <= 0xffff) {
bytes = 3;
top = 0xe0;
} else if (x >= 0x10000 && x <= 0x10ffff) {
bytes = 4;
top = 0xf0;
}

str = malloc(bytes + 1);
str[bytes] = '\0';
while(bytes > 0) {
int xbits = x & 0x3f; // Next 6 bits
bytes--;
if (bytes > 0) {
str[bytes] = (char)xbits + 0x80;
} else {
str[0] = (char)xbits + top;
}
x = x >> 6;
}

return str;
}

void reverse_range(char *start, char *end)
{
while(start < end)
{
char c = *start;
*start++ = *end;
*end-- = c;
}
}

char* reverse_char(char *start)
{
char *end = start;
while((end[1] & 0xc0) == 0x80) { end++; }
reverse_range(start, end);
return(end + 1);
}

char* idris_utf8_rev(char* s, char* result) {
strcpy(result, s);
char* end = result;
while(*end) { end = reverse_char(end); }
reverse_range(result, end-1);
return result;
}

22 changes: 22 additions & 0 deletions rts/idris_utf8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifndef _IDRIS_UTF8
#define _IDRIS_UTF8

/* Various functions for dealing with UTF8 encoding. These are probably
not very efficient (and I'm (EB) making no guarantees about their
correctness.) Nevertheless, they mean that we can treat Strings as
UFT8. Patches welcome :). */

// Get length of a UTF8 encoded string in characters
int idris_utf8_strlen(char *s);
// Get number of bytes the first character takes in a string
int idris_utf8_charlen(char* s);
// Return int representation of string at an index.
// Assumes in bounds.
unsigned idris_utf8_index(char* s, int j);
// Convert a char as an integer to a char* as a byte sequence
// Null terminated; caller responsible for freeing
char* idris_utf8_fromChar(int x);
// Reverse a UTF8 encoded string, putting the result in 'result'
char* idris_utf8_rev(char* s, char* result);

#endif
35 changes: 28 additions & 7 deletions src/IRTS/CodegenC.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import Util.System

import Numeric
import Data.Char
import Data.Bits
import Data.List (intercalate)
import qualified Data.Vector.Unboxed as V
import System.Process
Expand Down Expand Up @@ -145,8 +146,26 @@ showCStr s = '"' : foldr ((++) . showChar) "\"" s
| ord c < 0x10 = "\"\"\\x0" ++ showHex (ord c) "\"\""
| ord c < 0x20 = "\"\"\\x" ++ showHex (ord c) "\"\""
| ord c < 0x7f = [c] -- 0x7f = \DEL
| ord c < 0x100 = "\"\"\\x" ++ showHex (ord c) "\"\""
| otherwise = error $ "non-8-bit character in string literal: " ++ show c
| otherwise = showHexes (utf8bytes (ord c))

utf8bytes :: Int -> [Int]
utf8bytes x = let (h : bytes) = split [] x in
headHex h (length bytes) : map toHex bytes
where
split acc 0 = acc
split acc x = let xbits = x .&. 0x3f
xrest = shiftR x 6 in
split (xbits : acc) xrest

headHex h 1 = h + 0xc0
headHex h 2 = h + 0xe0
headHex h 3 = h + 0xf0
headHex h n = error "Can't happen: Invalid UTF8 character"

toHex i = i + 0x80

showHexes = foldr ((++) . showUTF8) ""
showUTF8 c = "\"\"\\x" ++ showHex c "\"\""

bcc :: Int -> BC -> String
bcc i (ASSIGN l r) = indent i ++ creg l ++ " = " ++ creg r ++ ";\n"
Expand Down Expand Up @@ -424,11 +443,6 @@ doOp v (LSLe (ATInt ITBig)) [l, r] = v ++ "idris_bigLe(vm, " ++ creg l ++ ", " +
doOp v (LSGt (ATInt ITBig)) [l, r] = v ++ "idris_bigGt(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
doOp v (LSGe (ATInt ITBig)) [l, r] = v ++ "idris_bigGe(vm, " ++ creg l ++ ", " ++ creg r ++ ")"

doOp v LStrConcat [l,r] = v ++ "idris_concat(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
doOp v LStrLt [l,r] = v ++ "idris_strlt(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
doOp v LStrEq [l,r] = v ++ "idris_streq(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
doOp v LStrLen [x] = v ++ "idris_strlen(vm, " ++ creg x ++ ")"

doOp v (LIntFloat ITNative) [x] = v ++ "idris_castIntFloat(" ++ creg x ++ ")"
doOp v (LFloatInt ITNative) [x] = v ++ "idris_castFloatInt(" ++ creg x ++ ")"
doOp v (LSExt ITNative ITBig) [x] = v ++ "idris_castIntBig(vm, " ++ creg x ++ ")"
Expand Down Expand Up @@ -529,11 +543,18 @@ doOp v LFFloor [x] = v ++ flUnOp "floor" (creg x)
doOp v LFCeil [x] = v ++ flUnOp "ceil" (creg x)
doOp v LFNegate [x] = v ++ "MKFLOAT(vm, -GETFLOAT(" ++ (creg x) ++ "))"

-- String functions which don't need to know we're UTF8
doOp v LStrConcat [l,r] = v ++ "idris_concat(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
doOp v LStrLt [l,r] = v ++ "idris_strlt(vm, " ++ creg l ++ ", " ++ creg r ++ ")"
doOp v LStrEq [l,r] = v ++ "idris_streq(vm, " ++ creg l ++ ", " ++ creg r ++ ")"

-- String functions which need to know we're UTF8
doOp v LStrHead [x] = v ++ "idris_strHead(vm, " ++ creg x ++ ")"
doOp v LStrTail [x] = v ++ "idris_strTail(vm, " ++ creg x ++ ")"
doOp v LStrCons [x, y] = v ++ "idris_strCons(vm, " ++ creg x ++ "," ++ creg y ++ ")"
doOp v LStrIndex [x, y] = v ++ "idris_strIndex(vm, " ++ creg x ++ "," ++ creg y ++ ")"
doOp v LStrRev [x] = v ++ "idris_strRev(vm, " ++ creg x ++ ")"
doOp v LStrLen [x] = v ++ "idris_strlen(vm, " ++ creg x ++ ")"

doOp v LAllocate [x] = v ++ "idris_buffer_allocate(vm, " ++ creg x ++ ")"
doOp v LAppendBuffer [a, b, c, d, e, f] = v ++ "idris_appendBuffer(vm, " ++ creg a ++ "," ++ creg b ++ "," ++ creg c ++ "," ++ creg d ++ "," ++ creg e ++ "," ++ creg f ++ ")"
Expand Down
34 changes: 34 additions & 0 deletions test/basic013/basic013.idr
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
foo : String
foo = "λx→x"

bar : String
bar = "λx→x"

baz : Char
baz = 'λ'

quux : String
quux = "\x0a\x80\xC9\xFF\n3\n4"

appMany : Nat -> String
appMany Z = foo
appMany (S k) = bar ++ appMany k

main : IO ()
main = do putStrLn foo
putStrLn (foo ++ bar)
putStrLn (reverse (foo ++ bar))
printLn (length foo)
printLn baz
let x = 4
let newstr = appMany (toNat x)
putStrLn newstr
printLn (strHead newstr)
printLn (length newstr)
printLn (strIndex newstr 4)
putStrLn (strCons (strIndex newstr 4) "")
putStrLn ("Tail: " ++ strTail newstr)
putStrLn ("Tail Tail: " ++ strTail (strTail newstr))
putStrLn ("Cons: " ++ strCons 'λ' newstr)
putStrLn ("Reverse: " ++ reverse newstr)

0 comments on commit c413280

Please sign in to comment.