Skip to content

Commit

Permalink
feat(segment): 提供开启分词选项,并默认关闭分词。fixed #45, #26, #19.
Browse files Browse the repository at this point in the history
  • Loading branch information
hotoo committed Jun 18, 2015
1 parent ed5c68b commit 6a0eebe
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 72 deletions.
17 changes: 10 additions & 7 deletions bin/pinyin
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env node

var commander = require('commander');
var pinyin = require("../src/pinyin");

commander.
version(require('../package').version).
usage('[options] 汉字').
option('-v, --version', 'output the version number').
option('-s, --style <style>', 'pinyin styles').
option('-S, --segment', 'segmentation word to phrases').
option('-h, --heteronym', 'output heteronym pinyins').
parse(process.argv);

Expand All @@ -20,11 +20,14 @@ if (commander.args.length === 0) {
commander.help();
}

console.log(pinyin(commander.args.join(" "), {
var pinyin = require("../src/pinyin");
var options = {
style: pinyin["STYLE_" + (commander.style || "TONE").toUpperCase()],
heteronym: commander.heteronym || false
}).join(" "));
heteronym: commander.heteronym || false,
segment: commander.segment || false,
};
var words = commander.args.join(" ");
var py = pinyin(words, options).join(" ");
console.log(py);

/*
vim:ft=javascript
*/
// vim:ft=javascript
7 changes: 5 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
"keywords": [
"拼音",
"汉语",
"汉字",
"中文",
"Pinyin"
],
"homepage": "http://pinyin.hotoo.me/",
Expand Down Expand Up @@ -32,9 +34,10 @@
"nodejieba": "~1.0.1"
},
"devDependencies": {
"mocha": "1.17.1",
"expect.js": "0.3.1",
"request": "2.33.0"
"mocha": "1.17.1",
"request": "2.33.0",
"should": "^6.0.3"
},
"spm": {
"main": "index.js",
Expand Down
15 changes: 8 additions & 7 deletions src/pinyin.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,13 @@ function buildPinyinCache(dict_combo){
return uncomboed;
}

/**
 * Split a string into words/phrases with nodejieba.
 *
 * The segmenter and the phrase pinyin dictionary are loaded on first use and
 * cached in the module-level `jieba` and `PHRASES_DICT` variables, so callers
 * that never enable segmentation do not pay for loading them.
 *
 * @param {string} hans Text to segment.
 * @return {*} Whatever `jieba.cut(hans)` returns (presumably an array of
 *   word strings -- confirm against the nodejieba version in use).
 */
function segment(hans) {
  // NOTE(review): `module['require']` (bracket access) is kept as in the
  // original; presumably it hides the call from static bundlers -- confirm.
  if (!jieba) {
    jieba = module['require']('nodejieba');
  }
  // Phrase pinyin dictionary.
  if (!PHRASES_DICT) {
    PHRASES_DICT = module["require"]("./phrases-dict");
  }
  return jieba.cut(hans);
}
if(isNode){
jieba = module['require']('nodejieba');

// 词语拼音库。
PHRASES_DICT = module["require"]("./phrases-dict");

// 拼音词库,node 版无需使用压缩合并的拼音库。
PINYIN_DICT = module["require"]("./dict-zi");
}else{
Expand Down Expand Up @@ -64,6 +65,7 @@ var RE_PHONETIC_SYMBOL = new RegExp('(['+re_phonetic_symbol_source+'])', 'g');
var RE_TONE2 = /([aeoiuvnm])([0-4])$/;
// Default option values; pinyin() merges the caller's options with these
// via extend(DEFAULT_OPTIONS, options || {}).
var DEFAULT_OPTIONS = {
style: PINYIN_STYLE.TONE, // Output style.
segment: false, // Word segmentation; disabled by default.
heteronym: false // Heteronym (multiple readings per character) output; disabled by default.
};

Expand Down Expand Up @@ -188,8 +190,7 @@ function pinyin(hans, options){

options = extend(DEFAULT_OPTIONS, options || {});

var phrases = isNode ? jieba.cut(hans) : hans;
var len = hans.length;
var phrases = isNode && options.segment ? segment(hans) : hans;
var pys = [];

for(var i=0,nohans="",firstCharCode,words,l=phrases.length; i<l; i++){
Expand Down
115 changes: 59 additions & 56 deletions tests/test.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

var expect = require("expect.js");
var should = require("should");
var pinyin = require("../src/pinyin");


Expand Down Expand Up @@ -51,17 +52,38 @@ var cases = [
// 多音字,单音词。分词后可以准确识别读音。
[ "中国", {
STYLE_NORMAL: [["zhong"],["guo"]],
STYLE_TONE: [["zhōng"],["guó"]],
STYLE_TONE2: [["zhong1"],["guo2"]],
STYLE_TONE: {
normal: [["zhōng","zhòng"],["guó"]],
segment: [["zhōng"],["guó"]]
},
STYLE_TONE2: {
normal: [["zhong1","zhong4"],["guo2"]],
segment: [["zhong1"],["guo2"]]
},
STYLE_INITIALS: [["zh"],["g"]],
STYLE_FIRST_LETTER: [["z"],["g"]]
} ],
[ "重心", {
STYLE_NORMAL: [["zhong"],["xin"]],
STYLE_TONE: [["zhòng"],["xīn"]],
STYLE_TONE2: [["zhong4"],["xin1"]],
STYLE_INITIALS: [["zh"],["x"]],
STYLE_FIRST_LETTER: [["z"],["x"]],
STYLE_NORMAL: {
normal: [["zhong","chong"],["xin"]],
segment: [["zhong"],["xin"]]
},
STYLE_TONE: {
normal: [["zhòng","chóng"],["xīn"]],
segment: [["zhòng"],["xīn"]]
},
STYLE_TONE2: {
normal: [["zhong4","chong2"],["xin1"]],
segment: [["zhong4"],["xin1"]]
},
STYLE_INITIALS: {
normal: [["zh","ch"],["x"]],
segment: [["zh"],["x"]]
},
STYLE_FIRST_LETTER: {
normal: [["z","c"],["x"]],
segment: [["z"],["x"]]
},
} ],

// 英文
Expand Down Expand Up @@ -106,8 +128,14 @@ var cases = [
// 中英混合,多音字,单音词。
[ "中国(china)", {
STYLE_NORMAL: [["zhong"],["guo"],["(china)"]],
STYLE_TONE: [["zhōng"],["guó"],["(china)"]],
STYLE_TONE2: [["zhong1"],["guo2"],["(china)"]],
STYLE_TONE: {
normal: [["zhōng","zhòng"],["guó"],["(china)"]],
segment: [["zhōng"],["guó"],["(china)"]]
},
STYLE_TONE2: {
normal: [["zhong1","zhong4"],["guo2"],["(china)"]],
segment: [["zhong1"],["guo2"],["(china)"]]
},
STYLE_INITIALS: [["zh"],["g"],["(china)"]],
STYLE_FIRST_LETTER: [["z"],["g"],["(china)"]]
} ],
Expand All @@ -130,6 +158,12 @@ describe('pinyin', function() {
for(var style in opt){
(function(han, opt, style){
var py = opt[style];
var pys = py;
// 有多音字的词组。
if (py.normal && py.segment) {
pys = py.segment;
py = py.normal;
}
var single_pinyin = [];
for(var i=0,l=py.length; i<l; i++){
single_pinyin[i] = [py[i][0]];
Expand All @@ -140,61 +174,30 @@ describe('pinyin', function() {
it('pinyin("'+han+'", '+style+') : '+
JSON.stringify(_py)+' === '+JSON.stringify(single_pinyin), function() {

expect(deepEquals(_py, single_pinyin)).to.equal(true);
_py.should.eql(single_pinyin);
});

// 多音字模式
// 普通多音字模式
var _py2 = pinyin(han, {style: pinyin[style], heteronym:true});
it('pinyin("'+han+'", '+style+',heteronym) : '+
JSON.stringify(_py2)+' === '+JSON.stringify(py), function() {

expect(deepEquals(_py2, py)).to.equal(true);
_py2.should.eql(py);
});

// 分词多音字模式。
var _py2s = pinyin(han, {
style: pinyin[style],
heteronym: true,
segment: true,
});
it('pinyin("'+han+'", '+style+',heteronym,segment) : '+
JSON.stringify(_py2s)+' === '+JSON.stringify(pys), function() {

_py2s.should.eql(pys);
});

})(han, opt, style);
}
}
});

/**
 * Recursively compare two values for structural equality.
 *
 * Supported types (as reported by Object.prototype.toString): String,
 * Number, Boolean, RegExp, Object, Array and Function.
 *  - Strings, numbers and booleans are compared with `===`.
 *  - RegExps are equal when source and the i/m/g flags match.
 *  - Objects are equal when they have the same own enumerable keys and
 *    every value is deeply equal.
 *  - Arrays are equal when they have the same length and every element is
 *    deeply equal.
 *  - Functions are equal when their source text matches after every run of
 *    two or more whitespace characters is collapsed to a single space.
 *
 * @param {*} a First value.
 * @param {*} b Second value.
 * @return {Boolean} true when the values are deeply equal.
 * @throws {Error} When both values share a type that is not supported
 *   (for example two distinct Date objects).
 */
function deepEquals(a, b){
  if(a === b){return true;}

  var typeOf = Object.prototype.toString;
  // Borrow hasOwnProperty so null-prototype objects and objects that shadow
  // the method are handled correctly.
  var hasOwn = Object.prototype.hasOwnProperty;
  var typeA = typeOf.call(a);
  var typeB = typeOf.call(b);
  if(typeA !== typeB){return false;}

  // Global flag: collapse every whitespace run, not only the first one.
  var re_blank = /\s{2,}/g, s_blank = " ";
  var k, i, l;

  switch(typeA){
  case '[object String]':
  case '[object Number]':
  case '[object Boolean]':
    return a === b;
  case '[object RegExp]':
    return a.source === b.source &&
      a.ignoreCase === b.ignoreCase &&
      a.multiline === b.multiline &&
      a.global === b.global;
  case '[object Object]':
    for(k in a){
      if(!hasOwn.call(a, k)){continue;}
      if(!hasOwn.call(b, k)){return false;}
      if(!deepEquals(a[k], b[k])){return false;}
    }
    // Make sure b has no own key that a lacks.
    for(k in b){
      if(!hasOwn.call(b, k)){continue;}
      if(!hasOwn.call(a, k)){return false;}
    }
    return true;
  case '[object Array]':
    if(a.length !== b.length){return false;}
    for(i=0,l=a.length; i<l; i++){
      if(!deepEquals(a[i], b[i])){return false;}
    }
    return true;
  case '[object Function]':
    return a.toString().replace(re_blank, s_blank) ===
      b.toString().replace(re_blank, s_blank);
  default:
    throw new Error("Not support type "+typeA);
  }
}

0 comments on commit 6a0eebe

Please sign in to comment.