Skip to content

Commit

Permalink
Add Codepoint.Utf8.length_of_codepoint .
Browse files Browse the repository at this point in the history
  • Loading branch information
jasone committed Aug 25, 2020
1 parent 389a1bb commit fde6082
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 25 deletions.
16 changes: 8 additions & 8 deletions bootstrap/src/basis/bytes.ml
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,14 @@ module String_seq = struct
let rec vlength ~on_invalid seq vindex =
match on_invalid, (Codepoint_seq.to_codepoint seq) with
| _, Some (Valid (cp, seq')) -> begin
let vindex' = vindex + Codepoint.Utf8.(length (of_codepoint cp)) in
let vindex' = vindex + (Codepoint.Utf8.length_of_codepoint cp) in
vlength ~on_invalid seq' vindex'
end
| Error, Some (Invalid _) -> None
| Replace, Some (Invalid seq') -> begin
let cp = Codepoint.replacement in
let vindex' =
vindex + Codepoint.Utf8.(length (of_codepoint cp)) in
vindex + (Codepoint.Utf8.length_of_codepoint cp) in
vlength ~on_invalid seq' vindex'
end
| Halt, Some (Invalid _) -> halt "Invalid utf8 sequence"
Expand All @@ -161,15 +161,15 @@ module String_seq = struct
let next t =
match t.on_invalid, (Codepoint_seq.to_codepoint t.seq) with
| _, Some (Valid (cp, seq')) -> begin
let vincr = Codepoint.Utf8.(length (of_codepoint cp)) in
let vincr = Codepoint.Utf8.length_of_codepoint cp in
let vindex' = t.vindex + vincr in
let t' = {t with seq=seq'; vindex=vindex'} in
cp, t'
end
| Error, Some (Invalid _) -> not_reached ()
| Replace, Some (Invalid seq') -> begin
let cp = Codepoint.replacement in
let vincr = Codepoint.Utf8.(length (of_codepoint cp)) in
let vincr = Codepoint.Utf8.length_of_codepoint cp in
let vindex' = t.vindex + vincr in
let t' = {t with seq=seq'; vindex=vindex'} in
cp, t'
Expand Down Expand Up @@ -409,13 +409,13 @@ module String_replace_seq_rev = struct
match Codepoint_seq_rev.to_codepoint seq with
| Some (Valid (cp, seq')) -> begin
let vlength' =
Codepoint.Utf8.(length (of_codepoint cp)) + vlength in
(Codepoint.Utf8.length_of_codepoint cp) + vlength in
fn seq' vlength'
end
| Some (Invalid seq') -> begin
let cp = Codepoint.replacement in
let vlength' =
Codepoint.Utf8.(length (of_codepoint cp)) + vlength in
(Codepoint.Utf8.length_of_codepoint cp) + vlength in
fn seq' vlength'
end
| None -> vlength
Expand All @@ -430,14 +430,14 @@ module String_replace_seq_rev = struct
let next t =
match Codepoint_seq_rev.to_codepoint t.seq with
| Some (Valid (cp, seq')) -> begin
let vincr = Codepoint.Utf8.(length (of_codepoint cp)) in
let vincr = Codepoint.Utf8.length_of_codepoint cp in
let vpast' = t.vpast - vincr in
let t' = {seq=seq'; vpast=vpast'} in
cp, t'
end
| Some (Invalid seq') -> begin
let cp = Codepoint.replacement in
let vincr = Codepoint.Utf8.(length (of_codepoint cp)) in
let vincr = Codepoint.Utf8.length_of_codepoint cp in
let vpast' = t.vpast - vincr in
let t' = {seq=seq'; vpast=vpast'} in
cp, t'
Expand Down
27 changes: 18 additions & 9 deletions bootstrap/src/basis/codepoint.ml
Original file line number Diff line number Diff line change
Expand Up @@ -86,26 +86,35 @@ module Utf8 = struct
| Three of byte * byte * byte
| Four of byte * byte * byte * byte

let of_codepoint cp =
let length_of_codepoint cp =
assert (cp <= max_codepoint);
let lz = bit_clz cp in
let sigbits = 21 - lz in
Caml.Array.get [|
1; 1; 1; 1; 1; 1; 1; 1; (* [0..7] *)
2; 2; 2; 2; (* [8..11] *)
3; 3; 3; 3; 3; (* [12..16] *)
4; 4; 4; 4; 4; (* [17..21] *)
|] sigbits

let of_codepoint cp =
assert (cp <= max_codepoint);
let u = to_uns cp in
if sigbits < 8 then
One (Byte.of_uns u)
else if sigbits < 12 then
match length_of_codepoint cp with
| 1 -> One (Byte.of_uns u)
| 2 ->
Two (
Byte.of_uns Uns.(bit_or 0b110_00000 (bit_usr ~shift:6 u)),
Byte.of_uns Uns.(bit_or 0b10_000000 (bit_and u 0x3f))
)
else if sigbits < 17 then
| 3 ->
Three (
Byte.of_uns Uns.(bit_or 0b1110_0000 (bit_usr ~shift:12 u)),
Byte.of_uns Uns.(bit_or 0b10_000000
(bit_and (bit_usr ~shift:6 u) 0x3f)),
Byte.of_uns Uns.(bit_or 0b10_000000 (bit_and u 0x3f))
)
else if sigbits < 22 then
| 4 ->
Four (
Byte.of_uns Uns.(bit_or 0b11110_000 (bit_usr ~shift:18 u)),
Byte.of_uns Uns.(bit_or 0b10_000000
Expand All @@ -114,7 +123,7 @@ module Utf8 = struct
(bit_and (bit_usr ~shift:6 u) 0x3f)),
Byte.of_uns Uns.(bit_or 0b10_000000 (bit_and u 0x3f))
)
else not_reached ()
| _ -> not_reached ()

let to_codepoint = function
| One b0 -> of_uns (Byte.to_uns b0)
Expand Down Expand Up @@ -282,7 +291,7 @@ module Seq = struct
match fragment.nrem with
| 0 -> begin
let cp = of_uns fragment.u in
match Uns.(Utf8.(length (of_codepoint cp) = fragment.n)) with
match Uns.((Utf8.length_of_codepoint cp) = fragment.n) with
| true -> Valid (cp, t)
| false -> Invalid t
end
Expand Down Expand Up @@ -346,7 +355,7 @@ module Seq = struct

let validate fragment t =
let cp = of_uns fragment.u in
match Uns.(Utf8.(length (of_codepoint cp) = fragment.n)) with
match Uns.((Utf8.length_of_codepoint cp) = fragment.n) with
| true -> Valid (cp, t)
| false -> Invalid t (* Overlong. *)

Expand Down
4 changes: 4 additions & 0 deletions bootstrap/src/basis/codepoint.mli
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,10 @@ module Utf8 : sig
type outer = t
type t

val length_of_codepoint: outer -> uns
(** [length_of_codepoint cp] returns the length in bytes of the UTF-8 encoding
which corresponds to [cp]. *)

val of_codepoint: outer -> t
(** Initialize from {!type:codepoint}. *)

Expand Down
16 changes: 8 additions & 8 deletions bootstrap/src/basis/string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ module Seq = struct
match !rem_bytes with
| [] -> begin
let cp, t' = T.next !tmut in
assert (Uns.(Codepoint.Utf8.(length (of_codepoint cp)) +
assert (Uns.((Codepoint.Utf8.length_of_codepoint cp) +
(T.length t') = (T.length !tmut)));
tmut := t';
let b, tl = match Codepoint.to_bytes cp with
Expand Down Expand Up @@ -398,7 +398,7 @@ module Seq = struct
| 0 -> cps
| _ -> begin
let cp, t' = T.next t in
assert (Uns.(Codepoint.Utf8.(length (of_codepoint cp)) +
assert (Uns.((Codepoint.Utf8.length_of_codepoint cp) +
(T.length t') = (T.length t)));
let cps' = cp :: cps in
fn t' cps'
Expand Down Expand Up @@ -677,7 +677,7 @@ module Slice = struct

let next t =
let codepoint = t.f t.cindex in
let cp_nbytes = Codepoint.Utf8.(length (of_codepoint codepoint)) in
let cp_nbytes = Codepoint.Utf8.length_of_codepoint codepoint in
let blength' = t.blength - cp_nbytes in
let t' = {t with cindex=(Uns.succ t.cindex);
blength=blength'} in
Expand All @@ -693,7 +693,7 @@ module Slice = struct
| true -> nbytes
| false -> begin
let codepoint, seq' = f seq in
let cp_nbytes = Codepoint.Utf8.(length (of_codepoint codepoint)) in
let cp_nbytes = Codepoint.Utf8.length_of_codepoint codepoint in
let nbytes' = nbytes + cp_nbytes in
fn ~seq:seq' (Uns.succ cindex) nbytes'
end
Expand Down Expand Up @@ -730,7 +730,7 @@ module Slice = struct
| cp :: cps -> cp, cps
| [] -> not_reached ()
in
let nbytes = Codepoint.Utf8.(length (of_codepoint codepoint)) in
let nbytes = Codepoint.Utf8.length_of_codepoint codepoint in
let blength = t.blength - nbytes in
codepoint, {codepoints; blength}
end
Expand Down Expand Up @@ -843,7 +843,7 @@ module Slice = struct
let next t =
let codepoint = Cursor.rget t.cursor in
let codepoint' = t.f t.cindex codepoint in
let utf8_length = Codepoint.Utf8.(length (of_codepoint codepoint')) in
let utf8_length = Codepoint.Utf8.length_of_codepoint codepoint' in
let cursor' = Cursor.succ t.cursor in
let cindex' = Uns.succ t.cindex in
let blength' = t.blength - utf8_length in
Expand All @@ -860,7 +860,7 @@ module Slice = struct
foldi t ~init:(0, false) ~f:(fun i (blength, modified) codepoint ->
let codepoint' = f i codepoint in
let modified' = modified || Codepoint.(codepoint' <> codepoint) in
(blength + Codepoint.Utf8.(length (of_codepoint codepoint'))), modified'
(blength + (Codepoint.Utf8.length_of_codepoint codepoint')), modified'
)

let map ~f t =
Expand Down Expand Up @@ -985,7 +985,7 @@ module Slice = struct
let slice = f cp in
let modified' = modified
|| Uns.((blength slice)
<> Codepoint.Utf8.(length (of_codepoint cp)))
<> (Codepoint.Utf8.length_of_codepoint cp))
|| Codepoint.(Cursor.(rget (base slice)) <> cp) in
let slices' = slice :: slices in
modified', slices'
Expand Down

0 comments on commit fde6082

Please sign in to comment.