-
Notifications
You must be signed in to change notification settings - Fork 642
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Primitives for head/tail/index/cons/reverse/length now all assume the char* is UTF8 encoded. Also updated generation of literals to encode as UTF8. Primitives are probably not as efficient as they could be (though some of them will be used rarely). ASCII strings will work exactly as before. Everything I know about UTF8 encoding has been learned in the past few hours. Therefore, this is unlikely to be the best way to do this. Please educate me, ideally in the form of annotated Pull Requests :).
- Loading branch information
Showing
10 changed files
with
283 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
#include "idris_utf8.h" | ||
#include <stdio.h> | ||
#include <string.h> | ||
#include <stdlib.h> | ||
|
||
// Count the characters in a NUL-terminated UTF8 string.
// A byte whose top two bits are 10 continues the previous character, so
// only the remaining bytes (the ones that start a character) are counted.
int idris_utf8_strlen(char *s) {
    int count = 0;
    for (char *p = s; *p != '\0'; ++p) {
        int is_continuation = ((*p & 0xc0) == 0x80);
        if (!is_continuation) {
            ++count;
        }
    }
    return count;
}
|
||
// Return the number of bytes taken by the first character of the
// NUL-terminated UTF8 string s.
//
// - An ASCII byte (top bit clear, which includes the terminator of an empty
//   string) counts as 1 byte.
// - A byte that can never occur in UTF8 (0xC0, 0xC1, 0xF5..0xFF) is treated
//   as a 1-byte character so that callers always make progress.
// - Otherwise the result is 1 plus the number of continuation bytes
//   (10xxxxxx) that immediately follow the first byte.
int idris_utf8_charlen(char* s) {
    // Read the byte as unsigned: plain char may be signed, in which case
    // every byte >= 0x80 would be negative and the range checks below could
    // never match.
    unsigned init = (unsigned char)s[0];
    if ((init & 0x80) == 0) {
        return 1; // Top bit unset, so 1 byte
    }
    if (init > 244 || init == 192 || init == 193) {
        return 1; // Invalid characters
    }
    int i = 1;
    while ((s[i] & 0xc0) == 0x80) {
        i++; // Move on until top 2 bits are not 10
    }
    return i;
}
|
||
// Return the code point of the character at position idx (counted in
// characters, not bytes) of the NUL-terminated UTF8 string s.
//
// idx is assumed to be in bounds; no check against the terminator is made
// while skipping to the requested character.
//
// A byte that can never occur in UTF8 (0xC0, 0xC1, 0xF5..0xFF) is returned
// as its own byte value. If a multi-byte sequence is cut short, decoding
// stops at the first byte that is not a continuation byte and the bits
// gathered so far are returned.
unsigned idris_utf8_index(char* s, int idx) {
    int i = 0, j = 0;
    // Walk past idx character-start bytes.
    while (j < idx) {
        if ((s[i] & 0xc0) != 0x80) j++;
        i++;
    }
    // Find the start of the next character
    while ((s[i] & 0xc0) == 0x80) { i++; }

    // s[i] is now the start of the character we want. Read it as unsigned so
    // that bytes >= 0x80 compare as 128..255 even where plain char is signed.
    unsigned init = (unsigned char)s[i];
    unsigned bytes = 1;
    unsigned top = init;

    if ((init & 0x80) == 0) {
        bytes = 1;              // ASCII
        top = init;
    } else if (init > 244 || init == 192 || init == 193) {
        bytes = 1;              // Invalid characters
        top = init;
    } else if ((init & 0xe0) == 0xc0) {
        bytes = 2;
        top = init & 0x1f;      // 5 bits
    } else if ((init & 0xf0) == 0xe0) {
        bytes = 3;
        top = init & 0x0f;      // 4 bits
    } else if ((init & 0xf8) == 0xf0) {
        bytes = 4;
        top = init & 0x07;      // 3 bits
    }

    // Append 6 bits per continuation byte. The check on the next byte keeps
    // a truncated sequence from reading beyond the string terminator.
    while (bytes > 1 && (s[i + 1] & 0xc0) == 0x80) {
        i++;
        top = (top << 6) | ((unsigned char)s[i] & 0x3f);
        --bytes;
    }

    return top;
}
|
||
// Encode the code point x as a freshly allocated, NUL-terminated UTF8 byte
// sequence. The caller is responsible for freeing the result.
//
// Code points 0..0x7f take 1 byte, 0x80..0x7ff take 2, 0x800..0xffff take 3
// and 0x10000..0x10ffff take 4. Any other value (negative or above 0x10ffff)
// yields an empty string. Returns NULL if the allocation fails.
char* idris_utf8_fromChar(int x) {
    char* str;
    int bytes = 0, top = 0;

    // ASCII is decided by range. Testing only bit 7 would also accept larger
    // code points whose bit 7 happens to be clear (e.g. 0x100 or 0x1f600).
    if (x >= 0 && x <= 0x7f) {
        str = malloc(2);
        if (str == NULL) {
            return NULL;
        }
        str[0] = (char)x;
        str[1] = '\0';
        return str;
    }

    if (x >= 0x80 && x <= 0x7ff) {
        bytes = 2;
        top = 0xc0;
    } else if (x >= 0x800 && x <= 0xffff) {
        bytes = 3;
        top = 0xe0;
    } else if (x >= 0x10000 && x <= 0x10ffff) {
        bytes = 4;
        top = 0xf0;
    }

    str = malloc(bytes + 1);
    if (str == NULL) {
        return NULL;
    }
    str[bytes] = '\0';

    // Fill from the last byte backwards, 6 bits at a time. The bits left
    // over for the first byte fit beneath its length marker.
    while (bytes > 0) {
        int xbits = x & 0x3f; // Next 6 bits
        bytes--;
        if (bytes > 0) {
            str[bytes] = (char)(0x80 | xbits);
        } else {
            str[0] = (char)(top | xbits);
        }
        x = x >> 6;
    }

    return str;
}
|
||
// Reverse, in place, the bytes between start and end (both inclusive).
// Nothing happens when start is not below end.
void reverse_range(char *start, char *end)
{
    for (char *lo = start, *hi = end; lo < hi; ++lo, --hi) {
        char tmp = *lo;
        *lo = *hi;
        *hi = tmp;
    }
}
|
||
// Reverse, in place, the bytes of the character beginning at start: the
// byte at start plus every continuation byte (10xxxxxx) directly after it.
// Returns a pointer to the byte just past that character.
char* reverse_char(char *start)
{
    size_t extra = 0; // number of continuation bytes following start
    while ((start[extra + 1] & 0xc0) == 0x80) {
        extra++;
    }
    reverse_range(start, start + extra);
    return start + extra + 1;
}
|
||
// Reverse the UTF8 encoded string s character by character, writing the
// outcome to result and returning result.
//
// s is copied with strcpy, so result must have room for strlen(s) + 1 bytes
// and must not overlap s.
//
// Method: first reverse the bytes inside each character, then reverse the
// whole buffer. The second pass restores the byte order within every
// character while leaving the characters themselves in reverse order.
char* idris_utf8_rev(char* s, char* result) {
    strcpy(result, s);
    char* end = result;
    while(*end) { end = reverse_char(end); }
    // For an empty string end is still result; skip the final pass rather
    // than form a pointer one before the start of the buffer.
    if (end > result) {
        reverse_range(result, end - 1);
    }
    return result;
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#ifndef IDRIS_UTF8_H
#define IDRIS_UTF8_H

/* Various functions for dealing with UTF8 encoding. These are probably
   not very efficient (and I'm (EB) making no guarantees about their
   correctness.) Nevertheless, they mean that we can treat Strings as
   UTF8. Patches welcome :). */

// Get length of a UTF8 encoded string in characters
int idris_utf8_strlen(char *s);
// Get number of bytes the first character takes in a string
int idris_utf8_charlen(char* s);
// Return int representation of string at an index.
// The index counts characters, not bytes. Assumes in bounds.
unsigned idris_utf8_index(char* s, int idx);
// Convert a char as an integer to a char* as a byte sequence
// Null terminated; caller responsible for freeing
char* idris_utf8_fromChar(int x);
// Reverse a UTF8 encoded string, putting the result in 'result'
// and returning 'result'. The caller supplies the 'result' buffer.
char* idris_utf8_rev(char* s, char* result);

#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
-- Fixture: a String literal mixing the non-ASCII characters λ and → with
-- ASCII 'x'.
foo : String | ||
foo = "λx→x" | ||
|
||
-- Fixture: same literal as foo, bound under a second name.
bar : String | ||
bar = "λx→x" | ||
|
||
-- Fixture: a single non-ASCII Char literal.
baz : Char | ||
baz = 'λ' | ||
|
||
-- Fixture: a String written with hex escapes, some of them >= 0x80
-- (\x80, \xC9, \xFF), followed by newline-separated digits.
-- NOTE(review): not referenced by main in this file.
quux : String | ||
quux = "\x0a\x80\xC9\xFF\n3\n4" | ||
|
||
-- appMany n concatenates n copies of bar in front of a single foo:
-- the Z case yields foo, and each S layer prepends one bar.
appMany : Nat -> String | ||
appMany Z = foo | ||
appMany (S k) = bar ++ appMany k | ||
|
||
-- Entry point: prints the fixtures and the results of applying String
-- operations to them (++, reverse, length, strHead, strIndex, strCons,
-- strTail), all on strings that contain non-ASCII characters.
main : IO () | ||
main = do putStrLn foo | ||
putStrLn (foo ++ bar) | ||
putStrLn (reverse (foo ++ bar)) | ||
printLn (length foo) | ||
printLn baz | ||
let x = 4 | ||
-- newstr is appMany applied to x converted to Nat; presumably four copies
-- of bar followed by foo (assumes toNat 4 is the Nat 4 -- confirm).
let newstr = appMany (toNat x) | ||
putStrLn newstr | ||
printLn (strHead newstr) | ||
printLn (length newstr) | ||
printLn (strIndex newstr 4) | ||
-- Build a one-character string from the character found at index 4.
putStrLn (strCons (strIndex newstr 4) "") | ||
putStrLn ("Tail: " ++ strTail newstr) | ||
putStrLn ("Tail Tail: " ++ strTail (strTail newstr)) | ||
putStrLn ("Cons: " ++ strCons 'λ' newstr) | ||
putStrLn ("Reverse: " ++ reverse newstr) | ||
|
Oops, something went wrong.