Skip to content

Commit

Permalink
XXX Simplify Text API, implement lazy extension.
Browse files Browse the repository at this point in the history
  • Loading branch information
jasone committed Aug 19, 2020
1 parent fc60eb6 commit 83432e5
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 37 deletions.
119 changes: 117 additions & 2 deletions bootstrap/src/basis/text.ml
@@ -1,7 +1,122 @@
open Rudiments

(* Lazily extended text. A value of this type is an immutable snapshot of the
 * excerpts forced so far, plus a suspension which yields the next snapshot.
 *
 * There is deliberately no field holding the source stream: [init] captures
 * the stream inside the [extend] suspension, and never populated a separate
 * source field. *)
type t = {
  (* Optional path associated with the text. *)
  path: string option;
  (* Number of bytes currently in excerpts, *not* the text length unless no
   * further extensions can be forced. *)
  blength: uns;
  (* bindex->excerpt map for strings already forced into text. *)
  excerpts: (uns, string, Uns.cmper_witness) Ordmap.t;
  (* Lazy suspension which produces extended text, or [None] once the source
   * has no further excerpts. *)
  extend: t option Lazy.t;
}

(* [init ?path stream] creates a text which lazily pulls string excerpts from
 * [stream]. The returned text contains no excerpts; nothing is popped from
 * [stream] until the [extend] suspension is forced, and each forcing
 * incorporates one more non-empty excerpt.
 *
 * Zero-length excerpts are skipped rather than recorded. Excerpts are keyed by
 * the byte index at which they start, so an empty excerpt would share its key
 * with the excerpt following it, and [Ordmap.insert_hlt] would then be asked to
 * insert a duplicate key (which, going by its [_hlt] suffix, presumably
 * halts). *)
let init ?path stream =
  (* Pop excerpts until a non-empty one is found. Returns [None] if the stream
   * is exhausted first. *)
  let rec pop_nonempty stream = begin
    match Stream.is_empty stream with
    | true -> None
    | false -> begin
        let excerpt, stream' = Stream.pop stream in
        match String.blength excerpt > 0 with
        | true -> Some (excerpt, stream')
        | false -> pop_nonempty stream'
      end
  end in
  (* Build the suspension which extends a text holding [blength] bytes in
   * [excerpts] with the next non-empty excerpt of [stream]. *)
  let rec susp_extend path blength excerpts stream = lazy begin
    match pop_nonempty stream with
    | None -> None
    | Some (excerpt, stream') -> begin
        let blength' = blength + (String.blength excerpt) in
        (* The key is the byte index at which the excerpt starts. *)
        let excerpts' = Ordmap.insert_hlt ~k:blength ~v:excerpt excerpts in
        let extend' = susp_extend path blength' excerpts' stream' in
        Some {path; blength=blength'; excerpts=excerpts'; extend=extend'}
      end
  end in
  let blength = 0 in
  let excerpts = Ordmap.empty (module Uns) in
  let extend = susp_extend path blength excerpts stream in
  {path; blength; excerpts; extend}

(* [path t] returns the optional path recorded in [t]. *)
let path {path; _} =
  path

(* [force t] repeatedly forces the [extend] suspension, following each extended
 * text in turn, and returns the first text whose suspension yields [None],
 * i.e. the text which cannot be extended any further. *)
let rec force t =
  match Lazy.force t.extend with
  | Some extended -> force extended
  | None -> t

(* [blength t] returns the byte length recorded in the fully forced text. Note
 * that this forces every remaining extension of [t]. *)
let blength t =
  let forced = force t in
  forced.blength

(* Line/column position. Values carry no reference to any text. *)
module Pos = struct
  type t = {
    (* Line component of the position. *)
    line: uns;
    (* Column component of the position. *)
    col: uns;
  }

  (* [init ~line ~col] builds a position from its two components. *)
  let init ~line ~col =
    {line; col}

  (* [line t] returns the line component of [t]. *)
  let line {line; _} =
    line

  (* [col t] returns the column component of [t]. *)
  let col {col; _} =
    col
end

(* Cursor over the codepoints of a text. Only [cmp] and [hd] are implemented;
 * the remaining operations are still XXX placeholders. *)
module Cursor = struct
  module T = struct
    type container = t
    type elm = codepoint
    type t = {
      (* Text which the cursor refers to. *)
      text: container;
      (* Byte index; this is the sole key used by [cmp]. *)
      bindex: uns;
      (* Line/column position. *)
      pos: Pos.t;
      (* Codepoint index. *)
      cindex: uns;
      (* Cursor into the excerpts map, used to step from excerpt to excerpt. *)
      ecursor: (uns, string, Uns.cmper_witness) Ordmap.Cursor.t;
      (* Cursor into a single excerpt, used to step from codepoint to
       * codepoint. A position on the boundary between two excerpts has two
       * logically equivalent representations: the tail of the excerpt to the
       * left of ecursor, or the head of the excerpt to its right. Accesses
       * which straddle excerpt boundaries could in principle dominate
       * performance, but the Ordmap cursor gives constant-time access to both
       * of its neighbors, so reaching into the excerpt adjacent to the one
       * scursor resides in adds only constant overhead. *)
      scursor: String.Cursor.t;
    }

    (* [cmp t0 t1] orders cursors by byte index alone. *)
    let cmp {bindex=bindex0; _} {bindex=bindex1; _} =
      Uns.cmp bindex0 bindex1

    (* [hd text] returns a cursor at byte index 0, codepoint index 0, line 1,
     * column 0. The string cursor is placed at the head of the first excerpt
     * already present in [text], or at the head of the empty string if [text]
     * holds no excerpts. [text] is not forced here.
     * NOTE(review): for a text fresh from [init], no excerpts are present
     * until something forces an extension -- confirm the intended interplay
     * once [succ]/[rget] are implemented. *)
    let hd text =
      let ecursor = Ordmap.Cursor.hd text.excerpts in
      let first_excerpt = begin
        match Ordmap.length text.excerpts > 0 with
        | false -> ""
        | true -> begin
            let _, excerpt = Ordmap.Cursor.rget ecursor in
            excerpt
          end
      end in
      let scursor = String.Cursor.hd first_excerpt in
      let pos = Pos.init ~line:1 ~col:0 in
      {text; bindex=0; pos; cindex=0; ecursor; scursor}

    (* Placeholder: not yet implemented. *)
    let tl text =
      XXX

    (* Placeholder: not yet implemented. *)
    let succ t =
      XXX

    (* Placeholder: not yet implemented. *)
    let pred t =
      XXX

    (* Placeholder: not yet implemented. *)
    let lget t =
      XXX

    (* Placeholder: not yet implemented. *)
    let rget t =
      XXX

  end
  include T
  include Cmpable.Make_mono(T)
end

(* Slices of text, built on top of the text [Cursor] module. *)
module Slice = struct
(* The container type for slices is the text type [t] defined at the top of
 * this file. *)
type container = t

(* The slice operations come from applying the [Slice.Make_mono] functor to
 * [Cursor]. [Slice] on the right-hand side is presumably the Basis slice
 * module, since the module being defined here is not recursive. *)
include Slice.Make_mono(Cursor)
end
46 changes: 11 additions & 35 deletions bootstrap/src/basis/text.mli
Expand Up @@ -3,48 +3,26 @@
converted to '�', thus maintaining the invariant that the text exclusively
comprises codepoints.
The text is presented as an incremental persistent buffer that is grown each
time text is read. The read position is always at the end of the buffer.
The text is presented as a linear sequence of codepoints which is lazily
streamed as needed to satisfy cursor operations, length queries, etc. The
lazy suspension has no impact on the API, but the intent is to support
linear forward scanning such that if the tail of the stream is never needed,
the application does not incur the cost of complete text initialization.
*)

open Rudiments

type t
(** Text. *)

val of_stream: ?path:string -> codepoint Stream.t -> t
(** [of_stream ~path stream] returns a text which streams from [stream]. *)

val to_stream: t -> codepoint Stream.t
(** [to_stream t] returns a stream which is layered on top of [t], with its
position at 0. The stream sources from [t] for as long as already-processed
data are available, then seamlessly switches to operating on [t]. Texts
cannot have their read positions moved backward, but this function provides
a mechanism for streaming a text repeatedly. *)
val init: ?path:string -> string Stream.t -> t
(** [init ~path stream] returns a text which streams from [stream]. *)

val path: t -> string option
(** [path t] returns the optional path associated with the text. *)

val end_reached: t -> bool
(* [end_reached t] returns whether the current position is at the end of input.
*)

val seek_end: t -> t
(** Return a text with its position at the end of input. *)

val read: t -> codepoint option * t
(** [read t] reads the next codepoint in the stream and returns [(Some cp), t'],
or returns [None, t] if at the end. *)

val read_n: uns -> t -> string option * t
(** [read_n n t] reads at most the next [n] codepoints in the stream and returns
[(Some cps), t'], or returns [None, t] if at the end. *)

val read_line: t -> string option * t
(** [read_line t] reads through the next ['\n'] codepoint in the stream (or to
the end of stream if no ['\n'] codepoints remain) and returns [(Some line),
t'], or returns [None, t] if at the end. No attempt is made to look
backwards in the already-read data for codepoints on the same line. *)
val blength: t -> uns
(** Text length in bytes. Forces source stream. *)

(** Position within a text. The associated text is intentionally not referenced,
lest values retain references to arbitrary incremental texts. *)
Expand All @@ -64,7 +42,8 @@ module Pos: sig
end

(** Text cursor which tracks position (line and column) in addition to codepoint
index. *)
index. [tl] forces the input stream, and should therefore be used with care
during incremental processing. *)
module Cursor : sig
include Cursor_intf.S_mono with type container := t
and type elm := codepoint
Expand All @@ -80,7 +59,4 @@ module Slice : sig
include Slice_intf.S_mono with type container := t
and type cursor := Cursor.t
and type elm := codepoint

val of_text: outer -> t
(** [of_text text] returns a slice which spans the entirety of [text]. *)
end

0 comments on commit 83432e5

Please sign in to comment.