Commit
update, some experiments with tf/idf
jprante committed Apr 9, 2015
1 parent cb3cfe8 commit 02b696f
Showing 9 changed files with 223 additions and 62 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -8,6 +8,7 @@ indexes.

| Elasticsearch | Plugin | Release date |
| -------------- | ------------ | ------------ |
| 1.5.0 | 1.5.0.0 | Apr 9, 2015 |
| 1.4.4 | 1.4.4.0 | Mar 15, 2015 |
| 1.4.0 | 1.4.0.2 | Feb 19, 2015 |
| 1.4.0 | 1.4.0.1 | Jan 14, 2015 |
12 changes: 2 additions & 10 deletions pom.xml
@@ -6,7 +6,7 @@

<groupId>org.xbib.elasticsearch.plugin</groupId>
<artifactId>elasticsearch-index-termlist</artifactId>
<version>1.4.4.0</version>
<version>1.5.0.0</version>

<packaging>jar</packaging>

@@ -49,7 +49,7 @@
<github.global.server>github</github.global.server>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<java.compile.version>1.7</java.compile.version>
<elasticsearch.version>1.4.4</elasticsearch.version>
<elasticsearch.version>1.5.0</elasticsearch.version>
</properties>

<dependencies>
@@ -68,14 +68,6 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>1.1</version>
<type>jar</type>
<scope>test</scope>
</dependency>

<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
74 changes: 53 additions & 21 deletions src/main/java/org/xbib/elasticsearch/action/termlist/TermInfo.java
@@ -3,19 +3,22 @@
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

import java.io.IOException;

public class TermInfo implements Streamable {
public class TermInfo implements Streamable, ToXContent {

private Integer termFreq;

private Integer docCount;

private Integer docFreq;

private Long totalFreq;

private Double tfidf;

public TermInfo setTermFreq(int termFreq) {
this.termFreq = termFreq;
return this;
@@ -25,15 +28,6 @@ public Integer getTermFreq() {
return termFreq;
}

public TermInfo setDocCount(int docCount) {
this.docCount = docCount;
return this;
}

public Integer getDocCount() {
return docCount;
}

public TermInfo setDocFreq(int docFreq) {
this.docFreq = docFreq;
return this;
@@ -52,6 +46,15 @@ public Long getTotalFreq() {
return totalFreq;
}

public TermInfo setTfIdf(double tfidf) {
this.tfidf = tfidf;
return this;
}

public Double getTfIdf() {
return tfidf;
}

@Override
public void readFrom(StreamInput in) throws IOException {
boolean b = in.readBoolean();
@@ -60,15 +63,15 @@ public void readFrom(StreamInput in) throws IOException {
}
b = in.readBoolean();
if (b) {
setDocCount(in.readInt());
setDocFreq(in.readInt());
}
b = in.readBoolean();
if (b) {
setDocFreq(in.readInt());
setTotalFreq(in.readVLong());
}
b = in.readBoolean();
if (b) {
setTotalFreq(in.readVLong());
setTfIdf(in.readDouble());
}
}

@@ -80,12 +83,6 @@ public void writeTo(StreamOutput out) throws IOException {
} else {
out.writeBoolean(false);
}
if (docCount != null) {
out.writeBoolean(true);
out.writeInt(docCount);
} else {
out.writeBoolean(false);
}
if (docFreq != null) {
out.writeBoolean(true);
out.writeInt(docFreq);
@@ -98,5 +95,40 @@ public void writeTo(StreamOutput out) throws IOException {
} else {
out.writeBoolean(false);
}
if (tfidf != null) {
out.writeBoolean(true);
out.writeDouble(tfidf);
} else {
out.writeBoolean(false);
}
}

public String toString() {
try {
XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint();
builder.startObject();
toXContent(builder, EMPTY_PARAMS);
builder.endObject();
return builder.string();
} catch (IOException e) {
return "{ \"error\" : \"" + e.getMessage() + "\"}";
}
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
if (termFreq != null) {
builder.field("termFreq", termFreq);
}
if (docFreq != null) {
builder.field("docFreq", docFreq);
}
if (totalFreq != null) {
builder.field("totalFreq", totalFreq);
}
if (tfidf != null) {
builder.field("tfidf", tfidf);
}
return builder;
}
}
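
As an illustration of the new ToXContent support (not part of the commit): a populated TermInfo rendered through toString(), which wraps toXContent() in a pretty-printed JSON object. The values are invented.

// Illustration only; values are invented.
TermInfo info = new TermInfo();
info.setTermFreq(3);
info.setDocFreq(2);
info.setTotalFreq(7L);
info.setTfIdf(Math.sqrt(3) * Math.log(7 / 2.0 + 2));
System.out.println(info);
// prints roughly:
// {
//   "termFreq" : 3,
//   "docFreq" : 2,
//   "totalFreq" : 7,
//   "tfidf" : 2.95...
// }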
@@ -33,7 +33,7 @@ public class TermlistRequest extends BroadcastOperationRequest<TermlistRequest>
TermlistRequest() {
}

public TermlistRequest(String[] indices) {
public TermlistRequest(String... indices) {
super(indices);
}

@@ -43,6 +43,11 @@ public TermlistRequestBuilder withDocFreq() {
return this;
}

public TermlistRequestBuilder withTermFreq() {
request.setWithTermFreq(true);
return this;
}

public TermlistRequestBuilder withTotalFreq() {
request.setWithTotalFreq(true);
return this;
@@ -58,7 +63,6 @@ public TermlistRequestBuilder sortByTotalFreq(boolean sortByTotalFreq) {
return this;
}


@Override
protected void doExecute(ActionListener<TermlistResponse> listener) {
client.execute(TermlistAction.INSTANCE, request, listener);
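
A hedged usage sketch of the new term-frequency option, pieced together only from what this diff shows: the TermlistRequest(String... indices) constructor, the setters the builder delegates to, and client.execute(TermlistAction.INSTANCE, ...). The Client instance and the index name are placeholders, not part of the commit.

// Sketch under assumptions: "client" is an org.elasticsearch.client.Client,
// "books" is a placeholder index name.
TermlistRequest request = new TermlistRequest("books");
request.setWithTermFreq(true);   // new in this commit
request.setWithTotalFreq(true);
client.execute(TermlistAction.INSTANCE, request, new ActionListener<TermlistResponse>() {
    @Override
    public void onResponse(TermlistResponse response) {
        // inspect the returned term list here
    }

    @Override
    public void onFailure(Throwable e) {
        // handle the failure
    }
});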
@@ -12,6 +12,7 @@
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
@@ -30,7 +31,7 @@
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.shard.service.InternalIndexShard;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportService;
@@ -48,7 +49,8 @@ public class TransportTermlistAction

@Inject
public TransportTermlistAction(Settings settings, ThreadPool threadPool, ClusterService clusterService,
TransportService transportService, IndicesService indicesService,
TransportService transportService,
IndicesService indicesService,
ActionFilters actionFilters) {
super(settings, TermlistAction.NAME, threadPool, clusterService, transportService, actionFilters);
this.indicesService = indicesService;
Expand Down Expand Up @@ -132,7 +134,7 @@ protected ClusterBlockException checkRequestBlock(ClusterState state, TermlistRe

@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) throws ElasticsearchException {
InternalIndexShard indexShard = (InternalIndexShard) indicesService.indexServiceSafe(request.getIndex()).shardSafe(request.shardId().id());
IndexShard indexShard = indicesService.indexServiceSafe(request.getIndex()).shardSafe(request.shardId().id());
Engine.Searcher searcher = indexShard.engine().acquireSearcher("termlist");
try {
Map<String, TermInfo> map = new CompactHashMap<String, TermInfo>();
@@ -158,13 +160,19 @@ protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) thr
if (termsEnum.totalTermFreq() < 1) {
continue;
}
String term = text.utf8ToString();
Term term = new Term(field, text);
TermInfo termInfo = new TermInfo();
if (request.getRequest().getWithTermFreq()) {
// just get first document and pos (which is absurd...)
DocsAndPositionsEnum docPosEnum = termsEnum.docsAndPositions(null, null);
docPosEnum.nextDoc();
termInfo.setTermFreq(docPosEnum.freq());
int freq = 0;
// ??? how to mark doc to select for tf/idf ???
while (docPosEnum.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {
// doc ID is docPosEnum.docID();
// add term freq per doc
freq += docPosEnum.freq();
}
// total term freq ... wrong
termInfo.setTermFreq(freq);
}
if (request.getRequest().getWithDocFreq()) {
// the number of documents containing this term
@@ -176,8 +184,9 @@ protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) thr
// doc that has this term).
termInfo.setTotalFreq(termsEnum.totalTermFreq());
}
if (request.getRequest().getTerm() == null || term.startsWith(request.getRequest().getTerm())) {
map.put(term, termInfo);

if (request.getRequest().getTerm() == null || term.text().startsWith(request.getRequest().getTerm())) {
map.put(term.text(), termInfo);
}
}
}
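
A note on the open question in the comments above: with null passed for live docs, summing freq() over every matching document simply reproduces what termsEnum.totalTermFreq() already reports for the segment, which is presumably why the comment marks the result as wrong for a per-term tf. A per-document tf/idf would have to be keyed by docID(), roughly as in this hypothetical sketch; the reader variable and the classic Lucene idf form are assumptions, not part of the commit.

// Hypothetical per-document variant, not what this commit does.
// Assumes "reader" is the IndexReader the surrounding loop already works on;
// needs import org.apache.lucene.index.DocsEnum.
DocsEnum docsEnum = termsEnum.docs(null, null);        // frequencies only, no positions needed
int docFreq = termsEnum.docFreq();                     // documents containing the term
double idf = 1.0 + Math.log(reader.numDocs() / (double) (docFreq + 1));
while (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
    int docId = docsEnum.docID();                      // the document this score belongs to
    double tfidf = Math.sqrt(docsEnum.freq()) * idf;   // per-document tf * idf
    // a per-document result would have to be collected per docId here
}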
@@ -207,16 +216,6 @@ private void merge(Map<String, TermInfo> map, Map<String, TermInfo> other) {
info.setTermFreq(t.getValue().getTermFreq());
}
}
Integer docCount = info.getDocCount();
if (docCount != null) {
if (t.getValue().getDocCount() != null) {
info.setDocCount(docCount + t.getValue().getDocCount());
}
} else {
if (t.getValue().getDocCount() != null) {
info.setDocCount(t.getValue().getDocCount());
}
}
Integer docFreq = info.getDocFreq();
if (docFreq != null) {
if (t.getValue().getDocFreq() != null) {
@@ -237,6 +236,11 @@ private void merge(Map<String, TermInfo> map, Map<String, TermInfo> other) {
info.setTotalFreq(t.getValue().getTotalFreq());
}
}
if (info.getTermFreq() != null && info.getTotalFreq() != null && info.getDocFreq() != null) {
double tf = Math.sqrt(info.getTermFreq());
double idf = Math.log((info.getTotalFreq() / (double) info.getDocFreq() + 1) + 1.0);
info.setTfIdf(tf * idf);
}
} else {
map.put(t.getKey(), t.getValue());
}
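
For reference (not part of the diff), the score the merge step now attaches to each term works out to

\mathrm{tfidf}(t) = \sqrt{\mathrm{termFreq}(t)} \cdot \ln\!\left(\frac{\mathrm{totalFreq}(t)}{\mathrm{docFreq}(t)} + 2\right),

which puts the term's own total frequency where a document count would normally go, unlike the 1 + Math.log(totalDocs / docFreq + 1) idf the REST layer used to compute (removed further down).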
@@ -65,23 +65,14 @@ public RestResponse buildResponse(TermlistResponse response, XContentBuilder bui
builder.startObject().field("name", t.getKey());
if (t.getValue().getTermFreq() != null) {
builder.field("termfreq", t.getValue().getTermFreq());
}
if (t.getValue().getDocCount() != null) {
builder.field("doccount", t.getValue().getDocCount());
builder.field("tfidf", t.getValue().getTfIdf());
}
if (t.getValue().getDocFreq() != null) {
builder.field("docfreq", t.getValue().getDocFreq());
}
if (t.getValue().getTotalFreq() != null) {
builder.field("totalfreq", t.getValue().getTotalFreq());
}
// tf/idf possible?
if (t.getValue().getTermFreq() != null && t.getValue().getDocFreq() != null) {
double tf = Math.sqrt(t.getValue().getTermFreq());
builder.field("tf", tf);
double idf = 1 + Math.log(totalDocs / t.getValue().getDocFreq() + 1);
builder.field("idf", idf);
}
builder.endObject();
}
builder.endArray();
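
Illustrative shape of a single term entry as buildResponse() now emits it; the field names come from the code above, the values are invented, and the surrounding response envelope and endpoint path are not shown in this diff, so they are omitted here.

{ "name" : "elasticsearch", "termfreq" : 42, "tfidf" : 12.3, "docfreq" : 7, "totalfreq" : 42 }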
