-
-
Notifications
You must be signed in to change notification settings - Fork 107
/
lexical-model.ts
150 lines (134 loc) · 4.61 KB
/
lexical-model.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/**
* Interfaces and constants used by the lexical model compiler. These target
* the LMLayer's internal worker code, so we provide those definitions too.
*/
/**
 * Base declaration shared by the lexical model descriptions in this file
 * (both `LexicalModelSource` and `LexicalModelCompiled` extend it).
 */
interface LexicalModelDeclaration {
  /** Identifies the model format; must be one of the listed literals. */
  readonly format:
    | 'trie-1.0'
    | 'fst-foma-1.0'
    | 'custom-1.0';
  // ... metadata ...
}
/**
 * Word breaker specification for Keyman 14.0 and later.
 *
 * It can express every older (simple) word breaker specification through
 * `use`, and can additionally be extended with options.
 *
 * @since 14.0
 */
interface WordBreakerSpec {
  /** The underlying simple word breaker specification to apply. */
  readonly use: SimpleWordBreakerSpec;

  /**
   * When present, words that the word breaker split apart are joined back
   * together at the given strings. For example:
   *
   *    joinWordsAt: ['-'] // keeps hyphenated items together
   *
   * @since 14.0
   */
  readonly joinWordsAt?: string[];

  /**
   * Overrides the word splitting behaviour for certain scripts; for
   * example, specifying that spaces break words in certain South-East Asian
   * scripts that otherwise do not use spaces.
   *
   * @since 14.0
   */
  readonly overrideScriptDefaults?: OverrideScriptDefaults;
}
/**
 * The simplified form of a word breaker specification: either the name of
 * a word breaker ('default' or 'ascii') or a word breaking function.
 *
 * @since 11.0
 */
type SimpleWordBreakerSpec =
  | 'default'
  | 'ascii'
  | WordBreakingFunction;
/**
 * A function that reduces input text to a simplified form, making it
 * easier to find entries within a lexical model's lexicon.
 *
 * Takes the text (`term`) and returns the simplified key as a string.
 *
 * @since 11.0
 */
type SimpleWordformToKeySpec =
  (term: string) => string;
/**
 * A function that reduces input text to a simplified form, making it
 * easier to find entries within a lexical model's lexicon. This variant may
 * also receive the model's `applyCasing` function to assist in the keying
 * process; that second argument is optional.
 *
 * @since 14.0
 */
type CasedWordformToKeySpec =
  (term: string, applyCasing?: CasingFunction) => string;
/**
 * Any supported function for simplifying input text so that entries can be
 * found within a lexical model's lexicon: either the plain form or the
 * casing-aware form.
 */
type WordformToKeySpec =
  | SimpleWordformToKeySpec
  | CasedWordformToKeySpec;
/**
 * Overrides the default word breaking behaviour for some scripts.
 *
 * Only one option currently exists:
 *
 * 'break-words-at-spaces'
 * : Some South-East Asian scripts conventionally do not use a space, or any
 *   other explicit word boundary character, to write word breaks. These
 *   scripts are:
 *
 *   * Burmese
 *   * Khmer
 *   * Thai
 *   * Lao
 *
 *   (this list may be incomplete and may be extended in the future)
 *
 *   For these scripts, the default word breaker breaks at **every**
 *   letter/syllable/ideograph. For languages that are written in one of
 *   these scripts BUT do use spaces (or some other delimiter) as word
 *   breaks, enable 'break-words-at-spaces': it prevents the word breaker
 *   from making too many breaks in these scripts.
 *
 * @since 14.0
 */
type OverrideScriptDefaults =
  'break-words-at-spaces';
/**
 * Source-level description of a lexical model: the base declaration plus
 * the list of sources and the optional settings documented on each member.
 */
interface LexicalModelSource extends LexicalModelDeclaration {
  /** The model's sources, given as a list of strings. */
  readonly sources: string[];

  /**
   * Name of the type to instantiate (without parameters) as the base object
   * for a custom predictive model.
   */
  readonly rootClass?: string;

  /**
   * If `true`, suggestions attempt to match the case of the input text, even
   * when the lexicon entries use a different casing scheme because of search
   * term keying effects.
   *
   * @since 14.0
   */
  readonly languageUsesCasing?: boolean;

  /**
   * The casing rules for a language. Three casing forms should be
   * implemented:
   *
   * - 'lower' -- a fully-lowercased version of the text, appropriate for the
   *   language's use of the writing system.
   * - 'upper' -- a fully-uppercased version of the text.
   * - 'initial' -- a version that preserves the input casing except for the
   *   initial character, which is uppercased (as with proper nouns and
   *   sentence-initial words in English sentences).
   *
   * Only utilized when `languageUsesCasing` is defined and set to `true`.
   *
   * @since 14.0
   */
  readonly applyCasing?: CasingFunction;

  /**
   * The word breaker to use. Options:
   *
   * - 'default' -- breaks according to the Unicode UAX #29 §4.1 Default Word
   *   Boundary Specification, which works well for *most* languages.
   * - 'ascii' -- a very simple word breaker, for demonstration purposes only.
   * - word breaking function -- supply your own function that breaks words.
   * - class-based word-breaker -- may be supported in the future.
   */
  readonly wordBreaker?: WordBreakerSpec | SimpleWordBreakerSpec;

  /**
   * How words are simplified in order to convert them into search keys.
   * This often involves removing accents, lowercasing, etc.
   */
  readonly searchTermToKey?: WordformToKeySpec;

  /**
   * Punctuation and spacing suggested by the model.
   *
   * @see LexicalModelPunctuation
   */
  readonly punctuation?: LexicalModelPunctuation;
}
/**
 * The base lexical model declaration extended with a string `id`.
 */
interface LexicalModelCompiled extends LexicalModelDeclaration {
  /** The model's identifier. */
  readonly id: string;
}