Skip to content

Commit

Permalink
索引模式可选分词结果最小颗粒度:#670
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Nov 22, 2017
1 parent 520884b commit 706240f
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 13 deletions.
2 changes: 1 addition & 1 deletion src/main/java/com/hankcs/hanlp/seg/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class Config
/**
* 是否是索引分词(合理地最小分割)
*/
public boolean indexMode = false;
public int indexMode = 0;
/**
* 是否识别中国人名
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public List<Term> segSentence(char[] sentence)

if (config.useCustomDictionary)
{
if (config.indexMode)
if (config.indexMode > 0)
combineByCustomDictionary(vertexList, wordNetAll);
else combineByCustomDictionary(vertexList);
}
Expand Down Expand Up @@ -107,7 +107,7 @@ public List<Term> segSentence(char[] sentence)
}

// 如果是索引模式则全切分
if (config.indexMode)
if (config.indexMode > 0)
{
return decorateResultForIndexMode(vertexList, wordNetAll);
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/com/hankcs/hanlp/seg/NShort/NShortSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ public List<Term> segSentence(char[] sentence)
}

// 如果是索引模式则全切分
if (config.indexMode)
if (config.indexMode > 0)
{
return decorateResultForIndexMode(vertexList, wordNetAll);
}
Expand All @@ -130,7 +130,7 @@ public List<Term> segSentence(char[] sentence)

if (config.useCustomDictionary)
{
if (config.indexMode)
if (config.indexMode > 0)
combineByCustomDictionary(vertexList, wordNetAll);
else combineByCustomDictionary(vertexList);
}
Expand Down
22 changes: 18 additions & 4 deletions src/main/java/com/hankcs/hanlp/seg/Segment.java
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ protected static List<AtomNode> quickAtomSegment(char[] charArray, int start, in
if (offsetAtom+1 < end)
{
int nextType = CharType.get(charArray[offsetAtom+1]);
if (nextType == CharType.CT_NUM)
if (nextType == CharType.CT_NUM)
{
continue;
}
Expand Down Expand Up @@ -354,7 +354,7 @@ protected void mergeNumberQuantifier(List<Vertex> termList, WordNet wordNetAll,
{
if ((cur.hasNature(Nature.q) || cur.hasNature(Nature.qv) || cur.hasNature(Nature.qt)))
{
if (config.indexMode)
if (config.indexMode > 0)
{
wordNetAll.add(line, new Vertex(sbQuantifier.toString(), new CoreDictionary.Attribute(Nature.m)));
}
Expand Down Expand Up @@ -454,7 +454,7 @@ public List<Term> seg(String text)
return Collections.emptyList();
}
List<Term> termList = new LinkedList<Term>();
if (config.offset || config.indexMode) // 由于分割了句子,所以需要重新校正offset
if (config.offset || config.indexMode > 0) // 由于分割了句子,所以需要重新校正offset
{
int sentenceOffset = 0;
for (int i = 0; i < sentenceArray.length; ++i)
Expand Down Expand Up @@ -557,7 +557,21 @@ public List<List<Term>> seg2sentence(String text)
*/
public Segment enableIndexMode(boolean enable)
{
// NOTE(review): this line and the next both assign config.indexMode; in this diff
// rendering they appear to be the removed and the added version of the same
// statement (no +/- markers are shown) — confirm against the repository.
config.indexMode = enable;
// Translate the on/off flag into the integer setting: 2 when enabled, 0 when disabled.
config.indexMode = enable ? 2 : 0;
// Return this segmenter so that configuration calls can be chained.
return this;
}

/**
* Sets the minimal segmentation granularity used in index mode (a value of 1
* allows splitting all the way down to single characters).
*
* @param minimalLength words of three or more characters will be split into sub-words
*                      whose length is greater than or equal to this value. Defaults to 2.
* @return this segmenter, so that configuration calls can be chained
* @throws IllegalArgumentException if {@code minimalLength} is less than 1
*/
public Segment enableIndexMode(int minimalLength)
{
// Reject non-positive lengths up front; the message reads "the minimal length must be >= 1".
if (minimalLength < 1) throw new IllegalArgumentException("最小长度应当大于等于1");
// The requested length is stored directly in config.indexMode.
config.indexMode = minimalLength;

return this;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ protected List<Term> segSentence(char[] sentence)

if (config.useCustomDictionary)
{
if (config.indexMode)
if (config.indexMode > 0)
combineByCustomDictionary(vertexList, wordNetAll);
else combineByCustomDictionary(vertexList);
}
Expand Down Expand Up @@ -109,7 +109,7 @@ protected List<Term> segSentence(char[] sentence)
}

// 如果是索引模式则全切分
if (config.indexMode)
if (config.indexMode > 0)
{
return decorateResultForIndexMode(vertexList, wordNetAll);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ public void hit(int begin, int end, CoreDictionary.Attribute value)
* @param vertexList
* @param wordNetAll
*/
protected static List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll)
protected List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll)
{
List<Term> termList = new LinkedList<Term>();
int line = 1;
Expand All @@ -496,7 +496,7 @@ protected static List<Term> decorateResultForIndexMode(List<Vertex> vertexList,
Vertex smallVertex = iterator.next();
if (
((termMain.nature == Nature.mq && smallVertex.hasNature(Nature.q)) ||
smallVertex.realWord.length() > 1)
smallVertex.realWord.length() >= config.indexMode)
&& smallVertex != vertex // 防止重复添加
&& currentLine + smallVertex.realWord.length() <= line + vertex.realWord.length() // 防止超出边界
)
Expand Down
8 changes: 8 additions & 0 deletions src/test/java/com/hankcs/demo/DemoIndexSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,13 @@ public static void main(String[] args)
{
System.out.println(term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]");
}

System.out.println("\n最细颗粒度切分:");
IndexTokenizer.SEGMENT.enableIndexMode(1);
termList = IndexTokenizer.segment("主副食品");
for (Term term : termList)
{
System.out.println(term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]");
}
}
}

3 comments on commit 706240f

@mamba8
Copy link

@mamba8 mamba8 commented on 706240f Nov 27, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

谢谢 谢谢

@mamba8
Copy link

@mamba8 mamba8 commented on 706240f Dec 19, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hankcs hanlp-lucene-plugin对于enableIndexMode整型相关参数的支持,开放提交么

@hankcs
Copy link
Owner Author

@hankcs hankcs commented on 706240f Dec 19, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

欢迎提交pr,谢谢

Please sign in to comment.