// DocumentSplitters.java
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.model.Tokenizer;
public class DocumentSplitters {

    /**
     * Creates the recommended general-purpose {@link DocumentSplitter}, with limits expressed in tokens.
     * <p>
     * The document is first divided into paragraphs, and as many paragraphs as will fit are packed
     * into each {@link dev.langchain4j.data.segment.TextSegment}. A paragraph that is too long
     * is recursively split into lines, then sentences, then words, and then characters
     * until the pieces fit into a segment.
     *
     * @param maxSegmentSizeInTokens The maximum size of the segment, defined in tokens.
     * @param maxOverlapSizeInTokens The maximum size of the overlap, defined in tokens.
     *                               Only full sentences are considered for the overlap.
     * @param tokenizer              The tokenizer that is used to count tokens in the text.
     * @return recursive document splitter
     */
    public static DocumentSplitter recursive(int maxSegmentSizeInTokens,
                                             int maxOverlapSizeInTokens,
                                             Tokenizer tokenizer) {
        // The chain is assembled from the finest granularity outwards; every level
        // receives the same limits and tokenizer, plus the next-finer level as its fallback.
        // NOTE(review): the word level is given no explicit fallback here, so the
        // character-level step mentioned above presumably comes from DocumentByWordSplitter
        // itself - confirm against that class.
        DocumentByWordSplitter byWord =
                new DocumentByWordSplitter(maxSegmentSizeInTokens, maxOverlapSizeInTokens, tokenizer);

        DocumentBySentenceSplitter bySentence =
                new DocumentBySentenceSplitter(maxSegmentSizeInTokens, maxOverlapSizeInTokens, tokenizer, byWord);

        DocumentByLineSplitter byLine =
                new DocumentByLineSplitter(maxSegmentSizeInTokens, maxOverlapSizeInTokens, tokenizer, bySentence);

        return new DocumentByParagraphSplitter(maxSegmentSizeInTokens, maxOverlapSizeInTokens, tokenizer, byLine);
    }

    /**
     * Creates the recommended general-purpose {@link DocumentSplitter}, with limits expressed in characters.
     * <p>
     * The document is first divided into paragraphs, and as many paragraphs as will fit are packed
     * into each {@link dev.langchain4j.data.segment.TextSegment}. A paragraph that is too long
     * is recursively split into lines, then sentences, then words, and then characters
     * until the pieces fit into a segment.
     *
     * @param maxSegmentSizeInChars The maximum size of the segment, defined in characters.
     * @param maxOverlapSizeInChars The maximum size of the overlap, defined in characters.
     *                              Only full sentences are considered for the overlap.
     * @return recursive document splitter
     */
    public static DocumentSplitter recursive(int maxSegmentSizeInChars, int maxOverlapSizeInChars) {
        // Delegates to the three-argument overload without a tokenizer.
        // NOTE(review): presumably the splitters measure size in characters when the
        // tokenizer is null - confirm against the splitter implementations.
        Tokenizer noTokenizer = null;
        return recursive(maxSegmentSizeInChars, maxOverlapSizeInChars, noTokenizer);
    }
}