Skip to content

Commit

Permalink
Better parsing of M/Counter
Browse files Browse the repository at this point in the history
A mimetype may contain parameters.
In that case, the mimetype looks something like "text/html;foo=bar;foz=baz".

It then contains `;` and `=`, which conflict with the same characters
we use to separate the items in our list.

We have to use a more advanced algorithm which takes the context into
account.

Fix #416
  • Loading branch information
mgautierfr committed Oct 28, 2020
1 parent ef42abe commit 08464f2
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 32 deletions.
2 changes: 1 addition & 1 deletion include/tools/stringTools.h
Expand Up @@ -43,7 +43,7 @@ void loadICUExternalTables();
std::string urlEncode(const std::string& value, bool encodeReserved = false);
std::string urlDecode(const std::string& value, bool component = false);

std::vector<std::string> split(const std::string&, const std::string&, bool trimEmpty = true);
std::vector<std::string> split(const std::string& str, const std::string& delims, bool trimEmpty = true, bool keepDelim = false);
std::string join(const std::vector<std::string>& list, const std::string& sep);

std::string ucAll(const std::string& word);
Expand Down
52 changes: 40 additions & 12 deletions src/tools/otherTools.cpp
Expand Up @@ -280,24 +280,52 @@ bool kiwix::convertStrToBool(const std::string& value)
throw std::domain_error(ss.str());
}


#define get_token() if (currentIt==tokens.end()) {break;} else { token = *currentIt++; }
kiwix::MimeCounterType kiwix::parseMimetypeCounter(const std::string& counterData)
{
// The counter metadata format is a list of items separated by `;`:
// item0;item1;item2
// Each item is a "tuple" mimetype=number.
// However, the mimetype may contain parameters:
// text/html;raw=true;foo=bar
// So the final format may be complex to parse:
// key0=value0;key1;foo=bar=value1;key2=value2

kiwix::MimeCounterType counters;
std::string item;
unsigned int counter;

std::stringstream ssContent(counterData);
auto tokens = split(counterData, ";=", true, true);
auto currentIt = tokens.begin();
std::string token;

while (getline(ssContent, item, ';')) {
std::string mimeType, counterString;
std::stringstream ssItem(item);
getline(ssItem, mimeType, '=');
getline(ssItem, counterString, '=');
if (!counterString.empty() && !mimeType.empty()) {
if (sscanf(counterString.c_str(), "%u", &counter))
counters.insert(std::pair<std::string, int>(mimeType, counter));
while (true) {
get_token();
auto mimeType = token;
get_token();
while (token == ";") {
//read param
mimeType += ";";
get_token();
mimeType += token; //key
get_token();
if (token != "=")
break;
mimeType += "=";
get_token();
mimeType += token; //value
get_token();
}
if (currentIt == tokens.end() || token != "=")
break;

//read count
zim::article_index_type count;
get_token();
if(!sscanf(token.c_str(), "%u", &count))
break;
counters.insert({mimeType, count});
get_token();
if (token != ";")
break;
}

return counters;
Expand Down
6 changes: 5 additions & 1 deletion src/tools/stringTools.cpp
Expand Up @@ -268,7 +268,8 @@ std::string kiwix::urlDecode(const std::string& value, bool component)
/* Split string in a token array */
std::vector<std::string> kiwix::split(const std::string& str,
const std::string& delims,
bool trimEmpty)
bool trimEmpty,
bool keepDelim)
{
std::string::size_type lastPos = 0;
std::string::size_type pos = 0;
Expand All @@ -279,6 +280,9 @@ std::vector<std::string> kiwix::split(const std::string& str,
if (!trimEmpty || !token.empty()) {
tokens.push_back(token);
}
if (keepDelim) {
tokens.push_back(str.substr(pos, 1));
}
lastPos = pos + 1;
}

Expand Down
43 changes: 34 additions & 9 deletions test/counterParsing.cpp
Expand Up @@ -51,30 +51,55 @@ TEST(ParseCounterTest, simpleMimeType)
ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
}
}
/*

TEST(ParseCounterTest, paramMimeType)
{
{
std::string counterStr = "text/html;raw=true=1";
CounterType counterMap = {{"foo", 1}};
ASSERT_EQ(parse(counterStr), counterMap);
CounterType counterMap = {{"text/html;raw=true", 1}};
ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
}
{
std::string counterStr = "foo=1;text/html;raw=true=50;bar=2";
CounterType counterMap = {{"foo", 1}, {"text/html;raw=true", 50}, {"bar", 2}};
ASSERT_EQ(parse(counterStr), counterMap);
ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
}
{
std::string counterStr = "foo=1;text/html;raw=true;param=value=50;bar=2";
CounterType counterMap = {{"foo", 1}, {"text/html;raw=true;param=value", 50}, {"bar", 2}};
ASSERT_EQ(parse(counterStr), counterMap);
ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
}
{
std::string counterStr = "foo=1;text/html;raw=true=50;bar=2";
CounterType counterMap = {{"foo", 1}, {"text/html;raw=true", 50}, {"bar", 2}};
ASSERT_EQ(parse(counterStr), counterMap);
ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
}
}*/
{
std::string counterStr = "application/javascript=8;text/html=3;application/warc-headers=28364;text/html;raw=true=6336;text/css=47;text/javascript=98;image/png=968;image/webp=24;application/json=3694;image/gif=10274;image/jpeg=1582;font/woff2=25;text/plain=284;application/atom+xml=247;application/x-www-form-urlencoded=9;video/mp4=9;application/x-javascript=7;application/xml=1;image/svg+xml=5";
CounterType counterMap = {
{"application/javascript", 8},
{"text/html", 3},
{"application/warc-headers", 28364},
{"text/html;raw=true", 6336},
{"text/css", 47},
{"text/javascript", 98},
{"image/png", 968},
{"image/webp", 24},
{"application/json", 3694},
{"image/gif", 10274},
{"image/jpeg", 1582},
{"font/woff2", 25},
{"text/plain", 284},
{"application/atom+xml", 247},
{"application/x-www-form-urlencoded", 9},
{"video/mp4", 9},
{"application/x-javascript", 7},
{"application/xml", 1},
{"image/svg+xml", 5}
};
ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
}
}

TEST(ParseCounterTest, wrongType)
{
Expand All @@ -96,14 +121,14 @@ TEST(ParseCounterTest, wrongType)
CounterType counterMap = {{"text/html", 50}};
ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
}
/*{
{
std::string counterStr = "text/html;foo=20";
ASSERT_EQ(parse(counterStr), empty) << counterStr;
}
{
std::string counterStr = "text/html;foo=20;";
ASSERT_EQ(parse(counterStr), empty) << counterStr;
}*/
}
{
std::string counterStr = "text/html=50;;foo";
CounterType counterMap = {{"text/html", 50}};
Expand Down
23 changes: 14 additions & 9 deletions test/stringTools.cpp
Expand Up @@ -23,7 +23,7 @@

namespace kiwix {
std::string join(const std::vector<std::string>& list, const std::string& sep);
std::vector<std::string> split(const std::string& base, const std::string& sep, bool trimEmpty);
std::vector<std::string> split(const std::string& base, const std::string& sep, bool trimEmpty, bool keepDelim);
};

using namespace kiwix;
Expand All @@ -40,17 +40,22 @@ TEST(stringTools, join)
TEST(stringTools, split)
{
std::vector<std::string> list1 = { "a", "b", "c" };
ASSERT_EQ(split("a;b;c", ";", false), list1);
ASSERT_EQ(split("a;b;c", ";", true), list1);
ASSERT_EQ(split("a;b;c", ";", false, false), list1);
ASSERT_EQ(split("a;b;c", ";", true, false), list1);
std::vector<std::string> list2 = { "", "a", "b", "c" };
ASSERT_EQ(split(";a;b;c", ";", false), list2);
ASSERT_EQ(split(";a;b;c", ";", true), list1);
ASSERT_EQ(split(";a;b;c", ";", false, false), list2);
ASSERT_EQ(split(";a;b;c", ";", true, false), list1);
std::vector<std::string> list3 = { "", "a", "b", "c", ""};
ASSERT_EQ(split(";a;b;c;", ";", false), list3);
ASSERT_EQ(split(";a;b;c;", ";", true), list1);
ASSERT_EQ(split(";a;b;c;", ";", false, false), list3);
ASSERT_EQ(split(";a;b;c;", ";", true, false), list1);
std::vector<std::string> list4 = { "", "a", "b", "", "c", ""};
ASSERT_EQ(split(";a;b;;c;", ";", false), list4);
ASSERT_EQ(split(";a;b;;c;", ";", true), list1);
ASSERT_EQ(split(";a;b;;c;", ";", false, false), list4);
ASSERT_EQ(split(";a;b;;c;", ";", true, false), list1);

std::vector<std::string> list5 = { ";", "a", ";", "b", "=", ";", "c", "=", "d", ";"};
ASSERT_EQ(split(";a;b=;c=d;", ";=", true, true), list5);
std::vector<std::string> list6 = { "", ";", "a", ";", "b", "=", "", ";", "c", "=", "d", ";", ""};
ASSERT_EQ(split(";a;b=;c=d;", ";=", false, true), list6);
}

};

0 comments on commit 08464f2

Please sign in to comment.