forked from BranchTaken/Hemlock
/
text.ml
300 lines (257 loc) · 7.92 KB
/
text.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
open Rudiments
(* Line/column position independent of previous lines' contents. *)
module Pos = struct
module T = struct
type t = {
line: uns;
col: uns;
}
let cmp t0 t1 =
let open Cmp in
match Uns.cmp t0.line t1.line with
| Lt -> Lt
| Eq -> Uns.cmp t0.col t1.col
| Gt -> Gt
let init ~line ~col =
{line; col}
let succ cp t =
match cp with
| cp when Codepoint.(cp = nl) -> {line=Uns.succ t.line; col=0}
| _ -> {t with col=Uns.succ t.col}
let line t =
t.line
let col t =
t.col
end
include T
include Cmpable.Make(T)
end
(* Absolute position from beginning of text. *)
module Apos = struct
module T = struct
type t = {
bindex: uns;
}
let cmp t0 t1 =
Uns.cmp t0.bindex t1.bindex
let init ~bindex =
{bindex}
let succ t =
{bindex=Uns.succ t.bindex}
let bindex t =
t.bindex
end
include T
include Cmpable.Make(T)
end
(* Text excerpt with end-of-excerpt absolute position built in. *)
module Excerpt = struct
module T = struct
type t = {
(* Absolute position at end of string. *)
apos: Apos.t;
(* Raw excerpt. *)
bytes: Bytes.Slice.t;
}
let hash_fold t state =
state |> Uns.hash_fold (Apos.bindex t.apos)
let cmp t0 t1 =
Apos.cmp t0.apos t1.apos
let pp ppf t =
Bytes.Slice.pp ppf t.bytes
(* Base excerpt, always hd of excerpts sets. *)
let base = {
apos=Apos.init ~bindex:0;
bytes=Bytes.Slice.of_container [||];
}
(* val of_bytes_slice: t -> Byte_seq.fragment option -> Bytes.Slice.t
* -> t * Byte_seq.fragment option *)
let of_bytes_slice pred fragment_opt _bytes =
pred, fragment_opt (* XXX *)
(* val of_string_slice: t -> Bytes.Slice.t -> t *)
let of_string_slice pred slice =
let apos = Apos.init
~bindex:((Apos.bindex pred.apos) + (String.Slice.blength slice)) in
let bytes = Bytes.Slice.of_string_slice slice in
{apos; bytes}
let apos t =
t.apos
let bytes t =
t.bytes
end
include T
(* XXX Remove?
include Cmpable.Make(T)
*)
include Cmper.Make_mono(T)
end
type t = {
(* Filesystem path. *)
path: string option;
(* Excerpts already forced into text. The set is initialized with a base
* excerpt, which simplifies various logic. *)
excerpts: (Excerpt.t, Excerpt.cmper_witness) Ordset.t;
(* Lazy suspension which produces extended text. *)
extend: t option Lazy.t;
}
let of_bytes_stream ?path stream =
let rec susp_extend path excerpts fragment_opt stream = lazy begin
match Stream.is_empty stream with
| true -> None
| false -> begin
let bytes, stream' = Stream.pop stream in
let pred_excerpt = Ordset.Cursor.(lget (tl excerpts)) in
let excerpt, fragment_opt' =
Excerpt.of_bytes_slice pred_excerpt fragment_opt bytes in
let excerpts' = Ordset.insert excerpt excerpts in
let extend' = susp_extend path excerpts' fragment_opt' stream' in
let t' = {path; excerpts=excerpts'; extend=extend'} in
Some t'
end
end in
let excerpts = Ordset.singleton (module Excerpt) Excerpt.base in
let extend = susp_extend path excerpts None stream in
{path; excerpts; extend}
let of_string_slice ?path slice =
let susp_extend () = lazy None in
let excerpts =
Ordset.singleton (module Excerpt) Excerpt.base
|> Ordset.insert Excerpt.(of_string_slice base slice) in
let extend = susp_extend () in
{path; excerpts; extend}
let path t =
t.path
let force t =
let rec fn t = begin
match Lazy.force (t.extend) with
| None -> t
| Some t' -> fn t'
end in
fn t
let blength t =
Apos.bindex Ordset.Cursor.(lget (tl (force t).excerpts)).apos
module Cursor = struct
module T = struct
type container = t
type elm = codepoint
type t = {
text: container;
apos: Apos.t;
pos: Pos.t;
(* Excerpts cursor, used for iterating over excerpts. *)
ecursor: (Excerpt.t, Excerpt.cmper_witness) Ordset.Cursor.t;
(* String cursor, used for iterating over codepoints within a single
* excerpt. Note that for the positions between excerpts, there are two
* logically equivalent cursors -- one at (String.Cursor.tl
* (Ordmap.Cursor.lget ecursor)), and the other at (String.Cursor.hd
* (Ordmap.Cursor.rget ecursor)). In principle it is possible for accesses
* across excerpt boundaries to dominate performance. However, the Ordmap
* cursor provides constant-time access to both the left and right, which
* means that even if we have to access a codepoint in the excerpt
* adjacent to the one in which scursor resides, the additional overhead
* is constant. *)
bcursor: Bytes.Cursor.t;
}
let cmp t0 t1 =
Apos.cmp t0.apos t1.apos
let container t =
t.text
let pos t =
t.pos
let index t =
Apos.bindex t.apos
let hd text =
let ecursor = Ordset.Cursor.hd text.excerpts in
let excerpt = Ordset.Cursor.rget ecursor in
let bcursor = Bytes.Slice.base excerpt.bytes in
{
text;
apos=Excerpt.apos excerpt;
pos=Pos.init ~line:1 ~col:0;
ecursor;
bcursor;
}
module Codepoint_seq = struct
module T = struct
type nonrec t = t
let init cursor =
cursor
let rec next t =
match Bytes.Cursor.(
t.bcursor < Bytes.Cursor.tl (Bytes.Cursor.container t.bcursor)) with
| true -> begin
let b, bcursor' = Bytes.Cursor.next t.bcursor in
Some (b, {t with apos=Apos.succ t.apos; bcursor=bcursor'})
end
| false -> begin
match Ordset.Cursor.(t.ecursor < tl t.text.excerpts) with
| true -> begin
let excerpt', ecursor' = Ordset.Cursor.next t.ecursor in
let bcursor' = Bytes.Slice.base (Excerpt.bytes excerpt') in
next {t with ecursor=ecursor'; bcursor=bcursor'}
end
| false -> begin
match Lazy.force (t.text.extend) with
| None -> None
| Some text' -> next {t with text=text'}
end
end
end
include T
include Codepoint.Seq.Make(T)
end
let next_opt t =
match Codepoint_seq.(to_codepoint (init t)) with
| None -> None
| Some (Valid (cp, t')) -> begin
let pos' = Pos.succ cp t.pos in
Some (cp, {t' with pos=pos'})
end
| Some (Invalid t') -> begin
let cp = Codepoint.replacement in
let pos' = Pos.succ cp t.pos in
Some (cp, {t' with pos=pos'})
end
| Some (Fragment _) -> begin
let cp = Codepoint.replacement in
let pos' = Pos.succ cp t.pos in
let bcursor' = Bytes.Cursor.(tl (container t.bcursor)) in
let apos' = Apos.init ~bindex:((Apos.bindex t.apos) +
((Bytes.Cursor.index bcursor') - (Bytes.Cursor.index t.bcursor))) in
Some (cp, {t with apos=apos'; pos=pos'; bcursor=bcursor'})
end
let rget_opt t =
match next_opt t with
| None -> None
| Some (cp, _) -> Some cp
let rget t =
match rget_opt t with
| None -> halt "Out of bounds"
| Some cp -> cp
let next t =
match next_opt t with
| None -> halt "Out of bounds"
| Some (cp, t') -> cp, t'
let succ t =
match next t with _, t' -> t'
let seek_fwd offset t =
let rec fn offset t = begin
match offset with
| 0 -> t
| _ -> fn (Uns.pred offset) (succ t)
end in
fn offset t
let tl text =
let rec fn cursor = begin
match next_opt cursor with
| None -> cursor
| Some (_, cursor') -> fn cursor'
end in
fn (hd text)
end
include T
include Cmpable.Make(T)
end
module Slice = struct
include Slice.Make_mono_fwd(Cursor)
end