Permalink
Browse files

v0.1.1 修正同义词规则,避免循环转换

  • Loading branch information...
1 parent 8546710 commit ffb52aceaa0b430fe0531d13f33f07170fef301c @leizongmin committed Oct 29, 2015
Showing with 10 additions and 7 deletions.
  1. +4 −4 README.md
  2. +4 −1 lib/Segment.js
  3. +1 −1 package.json
  4. +1 −1 test/test_segment.js
View
@@ -102,10 +102,10 @@ segment.loadSynonymDict('synonym.txt');
```
什么时候,何时
-入睡,入眠
+入眠,入睡
```
-在分词时设置`convertSynonym=true`则结果中的`"什么时候"`将被转换为`"何时"`,`"入睡"`将被转换为`"入眠"`:
+在分词时设置`convertSynonym=true`则结果中的`"什么时候"`将被转换为`"何时"`,`"入眠"`将被转换为`"入睡"`:
```javascript
var text = '什么时候我也开始夜夜无法入睡';
@@ -124,7 +124,7 @@ console.log(result);
{ w: '开始', p: 4096 },
{ w: '夜夜', p: 131072 },
{ w: '无法', p: 134217728 },
- { w: '入眠', p: 4096 } ]
+ { w: '入睡', p: 4096 } ]
```
### 去除停止符
@@ -142,7 +142,7 @@ segment.loadStopwordDict('stopword.txt');
因为
```
-在分词时设置`stripStopword=true`则结果中的`"之所以"`、`"因为"`、`"入睡"`将被去除:
+在分词时设置`stripStopword=true`则结果中的`"之所以"`、`"因为"`将被去除:
```javascript
var text = '之所以要编写一个纯JS的分词器是因为当时没有一个简单易用的Node.js模块';
View
@@ -143,6 +143,9 @@ Segment.prototype.loadSynonymDict = function (name) {
var n1 = blocks[0].trim();
var n2 = blocks[1].trim();
TABLE[n1] = n2;
+ if (TABLE[n2] === n1) {
+ delete TABLE[n2];
+ }
}
});
@@ -266,7 +269,7 @@ Segment.prototype.doSegment = function (text, options) {
do {
var result = convertSynonym(ret);
ret = result.list;
- } while (result.count < 1);
+ } while (result.count > 0);
}
// 去除停止符
View
@@ -1,7 +1,7 @@
{
"name": "segment",
"main": "./index.js",
- "version": "0.1.0",
+ "version": "0.1.1",
"description": "Chinese word segmentation 中文分词模块",
"keywords": ["segment", "chinese", "中文", "分词"],
"author": "Zongmin Lei <leizongmin@gmail.com>",
@@ -106,7 +106,7 @@ describe('ChsNameTokenizer', function () {
});
it('options: convertSynonym=true', function () {
- assert.equal(segment.doSegment('入睡', {simple: true, convertSynonym: true}).join('\t'),
+ assert.equal(segment.doSegment('入眠', {simple: true, convertSynonym: true}).join('\t'),
['入眠'].join('\t'));
});

0 comments on commit ffb52ac

Please sign in to comment.