-
Notifications
You must be signed in to change notification settings - Fork 10k
/
NatureDictionaryMaker.java
150 lines (140 loc) · 5.18 KB
/
NatureDictionaryMaker.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/9/18 19:47</create-date>
*
* <copyright file="NatureDictionaryMaker.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dictionary;
import com.hankcs.hanlp.corpus.document.CorpusLoader;
import com.hankcs.hanlp.corpus.document.Document;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.corpus.util.CorpusUtil;
import com.hankcs.hanlp.corpus.util.Precompiler;
import com.hankcs.hanlp.utility.TextUtility;
import com.hankcs.hanlp.utility.Predefine;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;
import static com.hankcs.hanlp.utility.Predefine.logger;
/**
 * Dictionary maker that collects a part-of-speech word-frequency dictionary and an n-gram
 * (adjacent word pair) dictionary from tagged corpus sentences.
 *
 * <p>The {@code dictionaryMaker} and {@code nGramDictionaryMaker} members used below are not
 * declared in this class; presumably they are inherited from {@link CommonDictionaryMaker}.
 *
 * @author hankcs
 */
public class NatureDictionaryMaker extends CommonDictionaryMaker
{
    /**
     * Creates a maker, passing {@code null} to the superclass constructor.
     * NOTE(review): the meaning of the {@code null} argument is defined by
     * {@code CommonDictionaryMaker} and is not visible from this file.
     */
    public NatureDictionaryMaker()
    {
        super(null);
    }

    /**
     * Adds every word of every sentence to the word dictionary, and every pair of adjacent
     * words within a sentence to the n-gram dictionary.
     *
     * @param sentenceList sentences, each given as an ordered list of words
     */
    @Override
    protected void addToDictionary(List<List<IWord>> sentenceList)
    {
        logger.info("开始制作词典");
        // Build the n-gram dictionary sentence by sentence; pairs never span two sentences
        // because the "previous word" is reset at the start of each sentence.
        for (List<IWord> wordList : sentenceList)
        {
            IWord pre = null;
            for (IWord word : wordList)
            {
                // Part-of-speech / frequency dictionary entry for the word itself.
                dictionaryMaker.add(word);
                if (pre != null)
                {
                    nGramDictionaryMaker.addPair(pre, word);
                }
                pre = word;
            }
        }
    }

    /**
     * Precompiles every word of every sentence and then wraps each sentence with a begin marker
     * at the front and an end marker at the back. The sentence lists are modified in place.
     *
     * <p>The markers are inserted through the {@link List} interface, so any mutable list
     * implementation is accepted (previously a {@code LinkedList} was required and anything
     * else failed with a {@code ClassCastException}).
     *
     * @param sentenceList sentences to tag; each inner list must support insertion
     */
    @Override
    protected void roleTag(List<List<IWord>> sentenceList)
    {
        logger.info("开始标注");
        int i = 0;
        for (List<IWord> wordList : sentenceList)
        {
            // Progress report: "<current sentence> / <total sentences>".
            logger.info(++i + " / " + sentenceList.size());
            for (IWord word : wordList)
            {
                // Compile the word into its equivalent string form. The return value is
                // ignored, so this presumably rewrites the word in place -- confirm against
                // Precompiler.
                Precompiler.compile(word);
            }
            // Markers are added only after the loop above, so they are never precompiled.
            wordList.add(0, new Word(Predefine.TAG_BIGIN, Nature.begin.toString()));
            wordList.add(new Word(Predefine.TAG_END, Nature.end.toString()));
        }
    }

    /**
     * Walks a corpus folder and builds a word-frequency dictionary from it.
     *
     * <p>Only sentence lists obtained with {@code getSimpleSentenceList(true)} are counted
     * (according to the comments in {@link #main(String[])}, {@code true} selects the split
     * form of the sentences).
     *
     * @param inPath  corpus folder handed to {@code CorpusLoader.walk}
     * @param outPath text file to save the dictionary to; may be {@code null}
     * @return the result of {@code DictionaryMaker.saveTxtTo(outPath)}, or {@code false} when
     *         {@code outPath} is {@code null} (the corpus is still walked in that case, but
     *         nothing is saved)
     */
    static boolean makeCoreDictionary(String inPath, String outPath)
    {
        final DictionaryMaker dictionaryMaker = new DictionaryMaker();
        CorpusLoader.walk(inPath, new CorpusLoader.Handler()
        {
            @Override
            public void handle(Document document)
            {
                for (List<Word> sentence : document.getSimpleSentenceList(true))
                {
                    for (Word word : sentence)
                    {
                        if (shouldInclude(word))
                        {
                            dictionaryMaker.add(word);
                        }
                    }
                }
            }

            /**
             * Decides whether a word should be counted.
             *
             * @param word the word to examine
             * @return {@code false} for words labelled {@code nr}; for words labelled
             *         {@code m}, {@code mq}, {@code w} or {@code t}, whether the value is
             *         entirely Chinese; {@code true} for every other label
             */
            boolean shouldInclude(Word word)
            {
                // NOTE(review): "nr" is presumably the person-name tag -- confirm against the
                // corpus tag set.
                if ("nr".equals(word.label))
                {
                    return false;
                }
                // NOTE(review): presumably numeral, numeral-quantifier, punctuation and time
                // tags -- confirm. Such words are kept only when written entirely in Chinese.
                boolean restrictedLabel = "m".equals(word.label)
                        || "mq".equals(word.label)
                        || "w".equals(word.label)
                        || "t".equals(word.label);
                if (restrictedLabel)
                {
                    return TextUtility.isAllChinese(word.value);
                }
                return true;
            }
        });
        if (outPath != null)
        {
            return dictionaryMaker.saveTxtTo(outPath);
        }
        return false;
    }

    /**
     * Development entry point: walks a hard-coded corpus folder, feeds every document to a
     * {@link NatureDictionaryMaker} twice (unsplit and split sentence forms) and saves the
     * result under {@code data/test/CoreNatureDictionary}.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        // Alternative workflow kept for reference:
        // makeCoreDictionary("D:\\JavaProjects\\CorpusToolBox\\data\\2014", "data/dictionary/CoreNatureDictionary.txt");
        // EasyDictionary dictionary = EasyDictionary.create("data/dictionary/CoreNatureDictionary.txt");
        final NatureDictionaryMaker dictionaryMaker = new NatureDictionaryMaker();
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler()
        {
            @Override
            public void handle(Document document)
            {
                // First pass: sentences in their unsplit form.
                dictionaryMaker.compute(CorpusUtil.convert2CompatibleList(document.getSimpleSentenceList(false)));
                // Second pass: sentences in their split form.
                dictionaryMaker.compute(CorpusUtil.convert2CompatibleList(document.getSimpleSentenceList(true)));
            }
        });
        dictionaryMaker.saveTxtTo("data/test/CoreNatureDictionary");
    }
}