Skip to content

Commit

Permalink
Add piv+ normalization support
Browse files Browse the repository at this point in the history
  • Loading branch information
ivmarkp committed Jul 27, 2016
1 parent 198abd3 commit e31a46e
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 16 deletions.
10 changes: 8 additions & 2 deletions xapian-core/include/xapian/weight.h
Expand Up @@ -409,6 +409,9 @@ class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
/// The factor to multiply with the weight.
double factor;

/// Parameters in the Piv+ normalization weighting formula.
double param_s, param_delta;

TfIdfWeight * clone() const;

void init(double factor);
Expand All @@ -417,7 +420,7 @@ class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
should be accessed by these functions. */
double get_wdfn(Xapian::termcount wdf, char c) const;
double get_idfn(Xapian::doccount termfreq, char c) const;
double get_wtn(double wt, char c) const;
double get_wtn(Xapian::termcount len, double wt, char c) const;

public:
/** Construct a TfIdfWeight
Expand Down Expand Up @@ -459,7 +462,7 @@ class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
* Implementing support for more normalizations of each type would require
* extending the backend to track more statistics.
*/
explicit TfIdfWeight(const std::string &normalizations);
explicit TfIdfWeight(const std::string &normalizations, double param_s, double param_delta);

TfIdfWeight()
: normalizations("ntn")
Expand All @@ -468,6 +471,9 @@ class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
need_stat(WDF);
need_stat(WDF_MAX);
need_stat(COLLECTION_SIZE);
need_stat(AVERAGE_LENGTH);
need_stat(DOC_LENGTH);
need_stat(WQF);
}

std::string name() const;
Expand Down
68 changes: 54 additions & 14 deletions xapian-core/weight/tfidfweight.cc
Expand Up @@ -33,8 +33,8 @@ using namespace std;

namespace Xapian {

TfIdfWeight::TfIdfWeight(const std::string &normals)
: normalizations(normals)
TfIdfWeight::TfIdfWeight(const std::string &normals, double s, double delta)
: normalizations(normals), param_s(s), param_delta(delta)
{
if (normalizations.length() != 3 ||
!strchr("nbsl", normalizations[0]) ||
Expand All @@ -45,14 +45,17 @@ TfIdfWeight::TfIdfWeight(const std::string &normals)
need_stat(TERMFREQ);
need_stat(COLLECTION_SIZE);
}
need_stat(AVERAGE_LENGTH);
need_stat(DOC_LENGTH);
need_stat(WQF);
need_stat(WDF);
need_stat(WDF_MAX);
}

TfIdfWeight *
TfIdfWeight::clone() const
{
    // Clone must carry the Piv+ tuning parameters as well as the
    // normalization string, or the cloned object would score differently.
    return new TfIdfWeight(normalizations, param_s, param_delta);
}

void
Expand All @@ -70,26 +73,43 @@ TfIdfWeight::name() const
string
TfIdfWeight::serialise() const
{
    // Layout: the 3-char normalization code followed by the two Piv+
    // parameters.  unserialise() must decode in exactly this order.
    string result = normalizations;
    result += serialise_double(param_s);
    result += serialise_double(param_delta);
    return result;
}

TfIdfWeight *
TfIdfWeight::unserialise(const string & str) const
{
    const char *ptr = str.data();
    const char *end = ptr + str.size();
    // serialise() emits the full 3-character normalization code first;
    // reading a single char here would desynchronise the double decoding.
    if (rare(end - ptr < 3))
	throw Xapian::SerialisationError("Bad serialised TfIdfWeight");
    string normals(ptr, 3);
    ptr += 3;
    double s = unserialise_double(&ptr, end);
    double delta = unserialise_double(&ptr, end);
    if (rare(ptr != end))
	throw Xapian::SerialisationError("Extra data in TfIdfWeight::unserialise()");
    return new TfIdfWeight(normals, s, delta);
}

double
TfIdfWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount,
TfIdfWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen,
Xapian::termcount) const
{
Xapian::doccount termfreq = 1;
double wqf_double = get_wqf();
if (normalizations[1] != 'n') termfreq = get_termfreq();
double wt = get_wdfn(wdf, normalizations[0]) *
get_idfn(termfreq, normalizations[1]);
return get_wtn(wt, normalizations[2]) * factor;
if (normalizations[2] == 'P') {
wt = get_wtn(doclen, wt, normalizations[2]) + param_delta * get_idfn(termfreq, normalizations[1]);
return wqf_double * get_wtn(get_doclength_lower_bound(), wt, normalizations[2]) * factor;
}
else
return get_wtn(doclen, wt, normalizations[2]) * factor;
}

// An upper bound can be calculated simply on the basis of wdf_max as termfreq
Expand All @@ -98,11 +118,17 @@ double
TfIdfWeight::get_maxpart() const
{
Xapian::doccount termfreq = 1;
double wqf_double = get_wqf();
if (normalizations[1] != 'n') termfreq = get_termfreq();
Xapian::termcount wdf_max = get_wdf_upper_bound();
double wt = get_wdfn(wdf_max, normalizations[0]) *
get_idfn(termfreq, normalizations[1]);
return get_wtn(wt, normalizations[2]) * factor;
if (normalizations[2] =='P') {
wt = get_wtn(get_doclength_lower_bound(), wt, normalizations[2]) + param_delta * get_idfn(termfreq, normalizations[1]);
return wqf_double * get_wtn(get_doclength_lower_bound(), wt, normalizations[2]) * factor;
}
else
return get_wtn(get_doclength_lower_bound(), wt, normalizations[2]) * factor;
}

// There is no extra per document component in the TfIdfWeighting scheme.
Expand Down Expand Up @@ -131,6 +157,9 @@ TfIdfWeight::get_wdfn(Xapian::termcount wdf, char c) const
case 'l':
if (wdf == 0) return 0;
return (1 + log(double(wdf)));
case 'P':
if (wdf == 0) return 0;
return (1 + log(1 + log(double(wdf))));
default:
AssertEq(c, 'n');
return wdf;
Expand All @@ -153,18 +182,29 @@ TfIdfWeight::get_idfn(Xapian::doccount termfreq, char c) const
return (1.0 / termfreq);
case 's':
return pow(log(N / termfreq), 2.0);
case 'P':
return (log((N+1) / termfreq));
default:
AssertEq(c, 't');
return (log(N / termfreq));
}
}

double
TfIdfWeight::get_wtn(double wt, char c) const
TfIdfWeight::get_wtn(Xapian::termcount doclen, double wt, char c) const
{
(void)c;
AssertEq(c, 'n');
return wt;
double normlen;
switch (c) {
case 'P':
{
normlen = doclen / get_average_length();
double norm_factor = 1 / (1 - param_s + (param_s * normlen));
return wt * norm_factor;
}
default:
AssertEq(c, 'n');
return wt;
}
}

}

0 comments on commit e31a46e

Please sign in to comment.