-
Notifications
You must be signed in to change notification settings - Fork 213
/
jcseg.properties
98 lines (70 loc) · 2.75 KB
/
jcseg.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Jcseg properties file.
# @Note:
# Use true | 1 | on to enable the specified option, or
# false | 0 | off to disable it.
# Bug reports: chenxin <chenxin619315@gmail.com>
# Jcseg function
#Maximum match length (5-7).
jcseg.maxlen = 7
#Whether to recognize Chinese names.
jcseg.icnname = true
#Maximum length of pair-punctuation text.
jcseg.pptmaxlen = 7
#Maximum length of a Chinese last-name adron.
jcseg.cnmaxlnadron = 1
#Whether to clear the stop words.
jcseg.clearstopword = false
#Whether to convert Chinese numerals to Arabic numbers, e.g. '\u4E09\u4E07' to 30000.
jcseg.cnnumtoarabic = true
#Whether to convert Chinese fractions to Arabic fractions.
#@Note: turn this off for Lucene, Solr, Elasticsearch, etc.
jcseg.cnfratoarabic = false
#Whether to keep unrecognized words.
jcseg.keepunregword = true
#Whether to do secondary segmentation of complex English words.
jcseg.ensecondseg = true
#Minimum length of a secondary-segmentation token.
jcseg.ensecminlen = 1
#Whether to do English word segmentation.
#jcseg.ensecondseg must be set to true before this function is activated.
jcseg.enwordseg = true
#Maximum match length for an extracted English word.
jcseg.enmaxlen = 16
#Threshold for Chinese name recognition.
#Better not to change it unless you know what you are doing.
jcseg.nsthreshold = 1000000
#The set of punctuation characters that will be kept inside a token (not at the end of the token).
jcseg.keeppunctuations = @#%.&+
#Whether to append the pinyin of the entry.
jcseg.appendpinyin = false
#Whether to load and append the synonyms of the entry.
jcseg.appendsyn = true
####for Tokenizer
#Default delimiter for the JcsegDelimiter tokenizer.
#Set to "default" or "whitespace" to use the default whitespace as the delimiter,
#or set it to the char you want, like ',' or whatever.
jcseg.delimiter = default
#Default length for the N-gram tokenizer.
jcseg.gram = 1
####about the lexicon
#Absolute path of the lexicon file.
#Multiple paths are supported since jcseg 1.9.2; use ';' to separate the paths.
#example: lexicon.path = /home/chenxin/lex1;/home/chenxin/lex2 (Linux)
# : lexicon.path = D:/jcseg/lexicon/1;D:/jcseg/lexicon/2 (WinNT)
#lexicon.path=/Code/java/JavaSE/jcseg/lexicon
#lexicon.path = {jar.dir}/lexicon ({jar.dir} means the base directory of jcseg-core-{version}.jar)
#@since 1.9.9 Jcseg loads the lexicons from the classpath by default.
#NOTE(review): the literal value "null" presumably selects that classpath default - confirm.
lexicon.path = null
#Whether to automatically load modified lexicon files.
lexicon.autoload = false
#Poll time for auto load (seconds).
lexicon.polltime = 300
####lexicon load
#NOTE(review): jcseg.appendpinyin / jcseg.appendsyn presumably depend on the
#matching load option below being enabled - confirm.
#Whether to load the part of speech of the entry.
jcseg.loadpos = true
#Whether to load the pinyin of the entry.
jcseg.loadpinyin = true
#Whether to load the synonyms of the entry.
jcseg.loadsyn = true
#Whether to load the entity of the entry.
jcseg.loadentity = true