Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

#116 : refactor to allow url with basic auth a:b@toto.com #118

Merged
merged 14 commits into from

3 participants

@bpaquet

It's break a little the design, but I do not see a better solution :(

http_parser.c
@@ -346,6 +354,8 @@ enum header_states
#define IS_NUM(c) ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c) (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
+#define IS_MARK(c) ((c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || (c) == ')')
+#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || (c) == '$' || (c) == ',')

Wrap at 80 columns.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
@@ -402,7 +412,7 @@ enum header_states
* URL and non-URL states by looking for these.
*/
static enum state
-parse_url_char(enum state s, const char ch)
+parse_url_char(enum state s, const char ch, int * found_at)

Style: too much whitespace.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
@@ -1002,7 +970,7 @@ size_t http_parser_execute (http_parser *parser,
parser->state = s_req_host_start;
}
- parser->state = parse_url_char((enum state)parser->state, ch);
+ parser->state = parse_url_char((enum state)parser->state, ch, 0);

Use NULL, not zero. Signals to the reader that it's a pointer.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@bnoordhuis bnoordhuis commented on the diff
http_parser.c
((15 lines not shown))
+ }
+ break;
+
+ case s_http_host_start:
+ if (ch == '[') {
+ return s_http_host_v6_start;
+ }
+
+ if (IS_HOST_CHAR(ch)) {
+ return s_http_host;
+ }
+
+ break;
+
+ case s_http_host:
+

Superfluous blank line.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
((55 lines not shown))
+
+ case s_http_host_port:
+ case s_http_host_port_start:
+ if (IS_NUM(ch)) {
+ return s_http_host_port;
+ }
+
+ break;
+
+ default:
+ break;
+ }
+ return s_http_host_dead;
+}
+
+#include <stdio.h>

Why is that here?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
@@ -2012,21 +2109,16 @@ const char * http_method_str (enum http_method m)
old_uf = uf;
}
- /* CONNECT requests can only contain "hostname:port" */
- if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
- return 1;
+ // host must be present if there is a schema

Don't use C++/C99 comments.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
test.c
((35 lines not shown))
+ ,.port=0
+ ,.field_data=
+ {{ 0, 0 } /* UF_SCHEMA */
+ ,{ 0, 0 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 0, 10 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 11, 4 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="complex URL fragment"
+ ,.url="http://www.webmasterworld.com/r.cgi?f=21&d=8405&url=http://www.example.com/index.html?foo=bar&hello=world#midpage"

Long line.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
test.c
((76 lines not shown))
+ ,{ 7, 8 } /* UF_HOST */
+ ,{ 16, 4 } /* UF_PORT */
+ ,{ 20, 8 } /* UF_PATH */
+ ,{ 29, 12 } /* UF_QUERY */
+ ,{ 42, 4 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="complex URL with basic auth from node js url parser doc"
+ ,.url="http://a:b@host.com:8080/p/a/t/h?query=string#hash"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PORT) | (1<<UF_PATH) | (1<<UF_QUERY) | (1<<UF_FRAGMENT) | (1<<UF_USERINFO)

Long line.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
((59 lines not shown))
}
- /* FALLTHROUGH */
- case s_req_port_start:
- if (IS_NUM(ch)) {
- return s_req_port;
+ if (ch == '@') {
+ if (found_at) {
+ *found_at = 1;

Why the side channel? Why not e.g. a separate s_req_host_at state?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Bertrand Paquet added some commits
@bpaquet

I have accepted all your remarks.

@bnoordhuis

@pgriess You should review this as well.

url_parser.c
((3 lines not shown))
+#include <string.h>
+
+void
+dump_url (const char *url, const struct http_parser_url *u)
+{
+ char part[512];
+ unsigned int i;
+
+ printf("\tfield_set: 0x%x, port: %u\n", u->field_set, u->port);
+ for (i = 0; i < UF_MAX; i++) {
+ if ((u->field_set & (1 << i)) == 0) {
+ printf("\tfield_data[%u]: unset\n", i);
+ continue;
+ }
+
+ memcpy(part, url + u->field_data[i].off, u->field_data[i].len);
@pgriess Collaborator
pgriess added a note

Rather than copying into part to do null-termination, you could use a format specifier to indicate the number of characters in the string. For example, something like this:

printf("%.*s\n", u->field_data[i].len, url + u->field_data[i].off);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@pgriess pgriess commented on the diff
http_parser.c
@@ -346,6 +355,12 @@ enum header_states
#define IS_NUM(c) ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c) (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
+#define IS_MARK(c) ((c) == '-' || (c) == '_' || (c) == '.' || \
@pgriess Collaborator
pgriess added a note

These are starting to get long. At some point, we might want to replace this with lookups into an array.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
((65 lines not shown))
}
- /* FALLTHROUGH */
- case s_req_port_start:
- if (IS_NUM(ch)) {
- return s_req_port;
+ if (ch == '@') {
@pgriess Collaborator
pgriess added a note

Let's check this before the IS_USERINFO_CHAR() ... check above because it's cheaper.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
@@ -254,12 +254,8 @@ enum state
, s_req_schema_slash
, s_req_schema_slash_slash
, s_req_host_start
- , s_req_host_v6_start
- , s_req_host_v6
- , s_req_host_v6_end
, s_req_host
@pgriess Collaborator
pgriess added a note

Since the "host" parsing is now really parsing full-fledged RFC2396 authorities, maybe we should rename these states and the other symbols that reference host parsing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
@@ -455,62 +470,28 @@ enum header_states
break;
- case s_req_host_start:
- if (ch == '[') {
- return s_req_host_v6_start;
- }
-
- if (IS_HOST_CHAR(ch)) {
- return s_req_host;
+ case s_req_host_with_at:
@pgriess Collaborator
pgriess added a note

I'm not sure we really need this state. The caller is feeding in characters one-at-a-time anyway. If they care about whether or not there's a literal @ in the authority, they can check for themselves. The only thing information is used for in parse_url_char() is to make sure that we don't have consecutive @s, but even that's not a complete check -- we should really be checking for whether or not there are more than one. Again, the caller is pretty well positioned to do this with a simple counter.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@pgriess pgriess commented on the diff
http_parser.c
@@ -2012,21 +2120,16 @@ const char * http_method_str (enum http_method m)
old_uf = uf;
}
- /* CONNECT requests can only contain "hostname:port" */
- if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
- return 1;
+ /* host must be present if there is a schema */
@pgriess Collaborator
pgriess added a note

Maybe a comment here about how http_parse_host() will fail for empty hosts? Otherwise it's not clear why we think we're going to be able to correctly fail this condition.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@pgriess pgriess commented on the diff
http_parser.c
((74 lines not shown))
+ size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
+
+ u->field_data[UF_HOST].len = 0;
+
+ s = found_at ? s_http_userinfo_start : s_http_host_start;
+
+ for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
+ enum http_host_state new_s = http_parse_host_char(s, *p);
+
+ if (new_s == s_http_host_dead) {
+ return 1;
+ }
+
+ switch(new_s) {
+ case s_http_host:
+ if (s != s_http_host) {
@pgriess Collaborator
pgriess added a note

Some day it'd be nice to unify the switch statement/logic from here and in http_parser_parse_url(), both of which do basically the same thing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@pgriess pgriess commented on the diff
http_parser.c
@@ -1938,6 +1911,144 @@ const char * http_method_str (enum http_method m)
return http_strerror_tab[err].description;
}
+static enum http_host_state
+http_parse_host_char(enum http_host_state s, const char ch) {
+ switch(s) {
+ case s_http_userinfo:
+ case s_http_userinfo_start:
+ if (ch == '@') {
@pgriess Collaborator
pgriess added a note

Why do we need the s_http_userinfo_start state at all? It sees to behave the same as s_http_userinfo.

@bpaquet
bpaquet added a note
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
http_parser.c
((77 lines not shown))
+
+ s = found_at ? s_http_userinfo_start : s_http_host_start;
+
+ for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
+ enum http_host_state new_s = http_parse_host_char(s, *p);
+
+ if (new_s == s_http_host_dead) {
+ return 1;
+ }
+
+ switch(new_s) {
+ case s_http_host:
+ if (s != s_http_host) {
+ u->field_data[UF_HOST].off = p - buf;
+ }
+ u->field_data[UF_HOST].len ++;
@pgriess Collaborator
pgriess added a note

There's some weird spacing going on before the ++ operator, here and below.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@pgriess
Collaborator

Anyway, none of my comments are really deal-breakers IMO. @bnoordhuis are you cool w/ this going in now? If so, @bpaquet do you want time to make the suggested changes? Or should we just merge this now?

@pgriess
Collaborator

Also, I think this is turning into a case for breaking out the URL parser into its own library.

@bpaquet
@pgriess
Collaborator

Cool, thanks for making those changes @bpaquet! I'll merge this now.

@pgriess pgriess merged commit fb3eeb7 into joyent:master
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Jul 7, 2012
  1. Check host on url with hostname:port

    Bertrand Paquet authored
Commits on Jul 8, 2012
  1. Add url_parser tool

    Bertrand Paquet authored
  2. Refactor host parsing to allow basic auth management

    Bertrand Paquet authored
  3. User info implementation

    Bertrand Paquet authored
  4. Add tests on user info

    Bertrand Paquet authored
  5. Small refactoring, add edge cases

    Bertrand Paquet authored
  6. Add tests

    Bertrand Paquet authored
Commits on Jul 17, 2012
  1. Coding style improvment

    Bertrand Paquet authored
  2. Use new state instead of pointer

    Bertrand Paquet authored
Commits on Jul 24, 2012
  1. Minor speed improvment

    Bertrand Paquet authored
  2. Rename s_req_host* to be compliant with RFC 2396

    Bertrand Paquet authored
  3. Coding style : remove space before ++

    Bertrand Paquet authored
  4. Add a comment

    Bertrand Paquet authored
This page is out of date. Refresh to see the latest.
View
1  .gitignore
@@ -4,6 +4,7 @@ tags
test
test_g
test_fast
+url_parser
*.mk
*.Makefile
*.so
View
8 Makefile
@@ -31,6 +31,12 @@ test_fast: http_parser.o test.o http_parser.h
test.o: test.c http_parser.h Makefile
$(CC) $(CPPFLAGS_FAST) $(CFLAGS_FAST) -c test.c -o $@
+url_parser: http_parser_g.o url_parser.o
+ $(CC) $(CFLAGS_DEBUG) $(LDFLAGS) http_parser_g.o url_parser.o -o $@
+
+url_parser.o: url_parser.c http_parser.h Makefile
+ $(CC) $(CPPFLAGS_DEBUG) $(CFLAGS_DEBUG) -c url_parser.c -o $@
+
http_parser.o: http_parser.c http_parser.h Makefile
$(CC) $(CPPFLAGS_FAST) $(CFLAGS_FAST) -c http_parser.c
@@ -53,6 +59,6 @@ tags: http_parser.c http_parser.h test.c
ctags $^
clean:
- rm -f *.o *.a test test_fast test_g http_parser.tar tags libhttp_parser.so libhttp_parser.o
+ rm -f *.o *.a test test_fast test_g url_parser http_parser.tar tags libhttp_parser.so libhttp_parser.o
.PHONY: clean package test-run test-run-timed test-valgrind
View
292 http_parser.c
@@ -253,13 +253,9 @@ enum state
, s_req_schema
, s_req_schema_slash
, s_req_schema_slash_slash
- , s_req_host_start
- , s_req_host_v6_start
- , s_req_host_v6
- , s_req_host_v6_end
- , s_req_host
- , s_req_port_start
- , s_req_port
+ , s_req_server_start
+ , s_req_server
+ , s_req_server_with_at
, s_req_path
, s_req_query_string_start
, s_req_query_string
@@ -337,6 +333,19 @@ enum header_states
, h_connection_close
};
+enum http_host_state
+ {
+ s_http_host_dead = 1
+ , s_http_userinfo_start
+ , s_http_userinfo
+ , s_http_host_start
+ , s_http_host_v6_start
+ , s_http_host
+ , s_http_host_v6
+ , s_http_host_v6_end
+ , s_http_host_port_start
+ , s_http_host_port
+};
/* Macros for character classes; depends on strict-mode */
#define CR '\r'
@@ -346,6 +355,12 @@ enum header_states
#define IS_NUM(c) ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c) (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
+#define IS_MARK(c) ((c) == '-' || (c) == '_' || (c) == '.' || \
@pgriess Collaborator
pgriess added a note

These are starting to get long. At some point, we might want to replace this with lookups into an array.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \
+ (c) == ')')
+#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
+ (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
+ (c) == '$' || (c) == ',')
#if HTTP_PARSER_STRICT
#define TOKEN(c) (tokens[(unsigned char)c])
@@ -450,67 +465,33 @@ parse_url_char(enum state s, const char ch)
case s_req_schema_slash_slash:
if (ch == '/') {
- return s_req_host_start;
+ return s_req_server_start;
}
break;
- case s_req_host_start:
- if (ch == '[') {
- return s_req_host_v6_start;
- }
-
- if (IS_HOST_CHAR(ch)) {
- return s_req_host;
- }
-
- break;
-
- case s_req_host:
- if (IS_HOST_CHAR(ch)) {
- return s_req_host;
- }
-
- /* FALLTHROUGH */
- case s_req_host_v6_end:
- switch (ch) {
- case ':':
- return s_req_port_start;
-
- case '/':
- return s_req_path;
-
- case '?':
- return s_req_query_string_start;
+ case s_req_server_with_at:
+ if (ch == '@') {
+ return s_dead;
}
- break;
-
- case s_req_host_v6:
- if (ch == ']') {
- return s_req_host_v6_end;
+ /* FALLTHROUGH */
+ case s_req_server_start:
+ case s_req_server:
+ if (ch == '/') {
+ return s_req_path;
}
- /* FALLTHROUGH */
- case s_req_host_v6_start:
- if (IS_HEX(ch) || ch == ':') {
- return s_req_host_v6;
+ if (ch == '?') {
+ return s_req_query_string_start;
}
- break;
-
- case s_req_port:
- switch (ch) {
- case '/':
- return s_req_path;
- case '?':
- return s_req_query_string_start;
+ if (ch == '@') {
+ return s_req_server_with_at;
}
- /* FALLTHROUGH */
- case s_req_port_start:
- if (IS_NUM(ch)) {
- return s_req_port;
+ if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
+ return s_req_server;
}
break;
@@ -632,13 +613,9 @@ size_t http_parser_execute (http_parser *parser,
case s_req_schema:
case s_req_schema_slash:
case s_req_schema_slash_slash:
- case s_req_host_start:
- case s_req_host_v6_start:
- case s_req_host_v6:
- case s_req_host_v6_end:
- case s_req_host:
- case s_req_port_start:
- case s_req_port:
+ case s_req_server_start:
+ case s_req_server:
+ case s_req_server_with_at:
case s_req_query_string_start:
case s_req_query_string:
case s_req_fragment_start:
@@ -999,7 +976,7 @@ size_t http_parser_execute (http_parser *parser,
MARK(url);
if (parser->method == HTTP_CONNECT) {
- parser->state = s_req_host_start;
+ parser->state = s_req_server_start;
}
parser->state = parse_url_char((enum state)parser->state, ch);
@@ -1014,10 +991,7 @@ size_t http_parser_execute (http_parser *parser,
case s_req_schema:
case s_req_schema_slash:
case s_req_schema_slash_slash:
- case s_req_host_start:
- case s_req_host_v6_start:
- case s_req_host_v6:
- case s_req_port_start:
+ case s_req_server_start:
{
switch (ch) {
/* No whitespace allowed here */
@@ -1037,9 +1011,8 @@ size_t http_parser_execute (http_parser *parser,
break;
}
- case s_req_host:
- case s_req_host_v6_end:
- case s_req_port:
+ case s_req_server:
+ case s_req_server_with_at:
case s_req_path:
case s_req_query_string_start:
case s_req_query_string:
@@ -1938,6 +1911,144 @@ http_errno_description(enum http_errno err) {
return http_strerror_tab[err].description;
}
+static enum http_host_state
+http_parse_host_char(enum http_host_state s, const char ch) {
+ switch(s) {
+ case s_http_userinfo:
+ case s_http_userinfo_start:
+ if (ch == '@') {
@pgriess Collaborator
pgriess added a note

Why do we need the s_http_userinfo_start state at all? It sees to behave the same as s_http_userinfo.

@bpaquet
bpaquet added a note
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ return s_http_host_start;
+ }
+
+ if (IS_USERINFO_CHAR(ch)) {
+ return s_http_userinfo;
+ }
+ break;
+
+ case s_http_host_start:
+ if (ch == '[') {
+ return s_http_host_v6_start;
+ }
+
+ if (IS_HOST_CHAR(ch)) {
+ return s_http_host;
+ }
+
+ break;
+
+ case s_http_host:
+ if (IS_HOST_CHAR(ch)) {

Superfluous blank line.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ return s_http_host;
+ }
+
+ /* FALLTHROUGH */
+ case s_http_host_v6_end:
+ if (ch == ':') {
+ return s_http_host_port_start;
+ }
+
+ break;
+
+ case s_http_host_v6:
+ if (ch == ']') {
+ return s_http_host_v6_end;
+ }
+
+ /* FALLTHROUGH */
+ case s_http_host_v6_start:
+ if (IS_HEX(ch) || ch == ':') {
+ return s_http_host_v6;
+ }
+
+ break;
+
+ case s_http_host_port:
+ case s_http_host_port_start:
+ if (IS_NUM(ch)) {
+ return s_http_host_port;
+ }
+
+ break;
+
+ default:
+ break;
+ }
+ return s_http_host_dead;
+}
+
+static int
+http_parse_host(const char * buf, struct http_parser_url *u, int found_at) {
+ enum http_host_state s;
+
+ const char *p;
+ size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
+
+ u->field_data[UF_HOST].len = 0;
+
+ s = found_at ? s_http_userinfo_start : s_http_host_start;
+
+ for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
+ enum http_host_state new_s = http_parse_host_char(s, *p);
+
+ if (new_s == s_http_host_dead) {
+ return 1;
+ }
+
+ switch(new_s) {
+ case s_http_host:
+ if (s != s_http_host) {
@pgriess Collaborator
pgriess added a note

Some day it'd be nice to unify the switch statement/logic from here and in http_parser_parse_url(), both of which do basically the same thing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ u->field_data[UF_HOST].off = p - buf;
+ }
+ u->field_data[UF_HOST].len++;
+ break;
+
+ case s_http_host_v6:
+ if (s != s_http_host_v6) {
+ u->field_data[UF_HOST].off = p - buf;
+ }
+ u->field_data[UF_HOST].len++;
+ break;
+
+ case s_http_host_port:
+ if (s != s_http_host_port) {
+ u->field_data[UF_PORT].off = p - buf;
+ u->field_data[UF_PORT].len = 0;
+ u->field_set |= (1 << UF_PORT);
+ }
+ u->field_data[UF_PORT].len++;
+ break;
+
+ case s_http_userinfo:
+ if (s != s_http_userinfo) {
+ u->field_data[UF_USERINFO].off = p - buf ;
+ u->field_data[UF_USERINFO].len = 0;
+ u->field_set |= (1 << UF_USERINFO);
+ }
+ u->field_data[UF_USERINFO].len++;
+ break;
+
+ default:
+ break;
+ }
+ s = new_s;
+ }
+
+ /* Make sure we don't end somewhere unexpected */
+ switch (s) {
+ case s_http_host_start:
+ case s_http_host_v6_start:
+ case s_http_host_v6:
+ case s_http_host_port_start:
+ case s_http_userinfo:
+ case s_http_userinfo_start:
+ return 1;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
int
http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
struct http_parser_url *u)
@@ -1945,9 +2056,10 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
enum state s;
const char *p;
enum http_parser_url_fields uf, old_uf;
+ int found_at = 0;
u->port = u->field_set = 0;
- s = is_connect ? s_req_host_start : s_req_spaces_before_url;
+ s = is_connect ? s_req_server_start : s_req_spaces_before_url;
uf = old_uf = UF_MAX;
for (p = buf; p < buf + buflen; p++) {
@@ -1961,10 +2073,7 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
/* Skip delimeters */
case s_req_schema_slash:
case s_req_schema_slash_slash:
- case s_req_host_start:
- case s_req_host_v6_start:
- case s_req_host_v6_end:
- case s_req_port_start:
+ case s_req_server_start:
case s_req_query_string_start:
case s_req_fragment_start:
continue;
@@ -1973,13 +2082,12 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
uf = UF_SCHEMA;
break;
- case s_req_host:
- case s_req_host_v6:
- uf = UF_HOST;
- break;
+ case s_req_server_with_at:
+ found_at = 1;
- case s_req_port:
- uf = UF_PORT;
+ /* FALLTROUGH */
+ case s_req_server:
+ uf = UF_HOST;
break;
case s_req_path:
@@ -2012,21 +2120,17 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
old_uf = uf;
}
- /* CONNECT requests can only contain "hostname:port" */
- if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
- return 1;
+ /* host must be present if there is a schema */
@pgriess Collaborator
pgriess added a note

Maybe a comment here about how http_parse_host() will fail for empty hosts? Otherwise it's not clear why we think we're going to be able to correctly fail this condition.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ /* parsing http:///toto will fail */
+ if ((u->field_set & ((1 << UF_SCHEMA) | (1 << UF_HOST))) != 0) {
+ if (http_parse_host(buf, u, found_at) != 0) {
+ return 1;
+ }
}
- /* Make sure we don't end somewhere unexpected */
- switch (s) {
- case s_req_host_v6_start:
- case s_req_host_v6:
- case s_req_host_v6_end:
- case s_req_host:
- case s_req_port_start:
+ /* CONNECT requests can only contain "hostname:port" */
+ if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
return 1;
- default:
- break;
}
if (u->field_set & (1 << UF_PORT)) {
View
3  http_parser.h
@@ -256,7 +256,8 @@ enum http_parser_url_fields
, UF_PATH = 3
, UF_QUERY = 4
, UF_FRAGMENT = 5
- , UF_MAX = 6
+ , UF_USERINFO = 6
+ , UF_MAX = 7
};
View
325 test.c
@@ -50,6 +50,8 @@ struct message {
char query_string[MAX_ELEMENT_SIZE];
char body[MAX_ELEMENT_SIZE];
size_t body_size;
+ const char *host;
+ const char *userinfo;
uint16_t port;
int num_headers;
enum { NONE=0, FIELD, VALUE } last_header_element;
@@ -630,6 +632,7 @@ const struct message requests[] =
,.fragment= ""
,.request_path= ""
,.request_url= "http://hypnotoad.org?hail=all"
+ ,.host= "hypnotoad.org"
,.num_headers= 0
,.headers= { }
,.body= ""
@@ -649,6 +652,7 @@ const struct message requests[] =
,.fragment= ""
,.request_path= ""
,.request_url= "http://hypnotoad.org:1234?hail=all"
+ ,.host= "hypnotoad.org"
,.port= 1234
,.num_headers= 0
,.headers= { }
@@ -669,6 +673,7 @@ const struct message requests[] =
,.fragment= ""
,.request_path= ""
,.request_url= "http://hypnotoad.org:1234"
+ ,.host= "hypnotoad.org"
,.port= 1234
,.num_headers= 0
,.headers= { }
@@ -870,6 +875,28 @@ const struct message requests[] =
,.body= ""
}
+#define PROXY_WITH_BASIC_AUTH 33
+, {.name= "host:port and basic_auth"
+ ,.type= HTTP_REQUEST
+ ,.raw= "GET http://a%12:b!&*$@hypnotoad.org:1234/toto HTTP/1.1\r\n"
+ "\r\n"
+ ,.should_keep_alive= TRUE
+ ,.message_complete_on_eof= FALSE
+ ,.http_major= 1
+ ,.http_minor= 1
+ ,.method= HTTP_GET
+ ,.fragment= ""
+ ,.request_path= "/toto"
+ ,.request_url= "http://a%12:b!&*$@hypnotoad.org:1234/toto"
+ ,.host= "hypnotoad.org"
+ ,.userinfo= "a%12:b!&*$"
+ ,.port= 1234
+ ,.num_headers= 0
+ ,.headers= { }
+ ,.body= ""
+ }
+
+
, {.name= NULL } /* sentinel */
};
@@ -1794,6 +1821,14 @@ message_eq (int index, const struct message *expected)
abort();
}
+ if (expected->host) {
+ MESSAGE_CHECK_URL_EQ(&u, expected, m, host, UF_HOST);
+ }
+
+ if (expected->userinfo) {
+ MESSAGE_CHECK_URL_EQ(&u, expected, m, userinfo, UF_USERINFO);
+ }
+
m->port = (u.field_set & (1 << UF_PORT)) ?
u.port : 0;
@@ -1966,6 +2001,26 @@ const struct url_test url_tests[] =
,{ 15, 1 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="proxy request with port"
+ ,.url="http://hostname:444/"
+ ,.is_connect=0
+ ,.u=
+ {.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PORT) | (1 << UF_PATH)
+ ,.port=444
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 7, 8 } /* UF_HOST */
+ ,{ 16, 3 } /* UF_PORT */
+ ,{ 19, 1 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
}
}
,.rv=0
@@ -1984,11 +2039,18 @@ const struct url_test url_tests[] =
,{ 0, 0 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
}
}
,.rv=0
}
+, {.name="CONNECT request but not connect"
+ ,.url="hostname:443"
+ ,.is_connect=0
+ ,.rv=1
+ }
+
, {.name="proxy ipv6 request"
,.url="http://[1:2::3:4]/"
,.is_connect=0
@@ -2002,6 +2064,26 @@ const struct url_test url_tests[] =
,{ 17, 1 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="proxy ipv6 request with port"
+ ,.url="http://[1:2::3:4]:67/"
+ ,.is_connect=0
+ ,.u=
+ {.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PORT) | (1 << UF_PATH)
+ ,.port=67
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 8, 8 } /* UF_HOST */
+ ,{ 18, 2 } /* UF_PORT */
+ ,{ 20, 1 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
}
}
,.rv=0
@@ -2020,13 +2102,16 @@ const struct url_test url_tests[] =
,{ 0, 0 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
}
}
,.rv=0
}
, {.name="extra ? in query string"
- ,.url="http://a.tbcdn.cn/p/fp/2010c/??fp-header-min.css,fp-base-min.css,fp-channel-min.css,fp-product-min.css,fp-mall-min.css,fp-category-min.css,fp-sub-min.css,fp-gdp4p-min.css,fp-css3-min.css,fp-misc-min.css?t=20101022.css"
+ ,.url="http://a.tbcdn.cn/p/fp/2010c/??fp-header-min.css,fp-base-min.css,"
+ "fp-channel-min.css,fp-product-min.css,fp-mall-min.css,fp-category-min.css,"
+ "fp-sub-min.css,fp-gdp4p-min.css,fp-css3-min.css,fp-misc-min.css?t=20101022.css"
,.is_connect=0
,.u=
{.field_set=(1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PATH) | (1<<UF_QUERY)
@@ -2038,11 +2123,118 @@ const struct url_test url_tests[] =
,{ 17, 12 } /* UF_PATH */
,{ 30,187 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="space URL encoded"
+ ,.url="/toto.html?toto=a%20b"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_PATH) | (1<<UF_QUERY)
+ ,.port=0
+ ,.field_data=
+ {{ 0, 0 } /* UF_SCHEMA */
+ ,{ 0, 0 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 0, 10 } /* UF_PATH */
+ ,{ 11, 10 } /* UF_QUERY */
+ ,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+
+, {.name="URL fragment"
+ ,.url="/toto.html#titi"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_PATH) | (1<<UF_FRAGMENT)
+ ,.port=0
+ ,.field_data=
+ {{ 0, 0 } /* UF_SCHEMA */
+ ,{ 0, 0 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 0, 10 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 11, 4 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="complex URL fragment"
+ ,.url="http://www.webmasterworld.com/r.cgi?f=21&d=8405&url="
+ "http://www.example.com/index.html?foo=bar&hello=world#midpage"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PATH) | (1<<UF_QUERY) |\
+ (1<<UF_FRAGMENT)
+ ,.port=0
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 7, 22 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 29, 6 } /* UF_PATH */
+ ,{ 36, 69 } /* UF_QUERY */
+ ,{106, 7 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="complex URL from node js url parser doc"
+ ,.url="http://host.com:8080/p/a/t/h?query=string#hash"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PORT) | (1<<UF_PATH) |\
+ (1<<UF_QUERY) | (1<<UF_FRAGMENT)
+ ,.port=8080
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 7, 8 } /* UF_HOST */
+ ,{ 16, 4 } /* UF_PORT */
+ ,{ 20, 8 } /* UF_PATH */
+ ,{ 29, 12 } /* UF_QUERY */
+ ,{ 42, 4 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="complex URL with basic auth from node js url parser doc"
+ ,.url="http://a:b@host.com:8080/p/a/t/h?query=string#hash"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PORT) | (1<<UF_PATH) |\
+ (1<<UF_QUERY) | (1<<UF_FRAGMENT) | (1<<UF_USERINFO)
+ ,.port=8080
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 11, 8 } /* UF_HOST */
+ ,{ 20, 4 } /* UF_PORT */
+ ,{ 24, 8 } /* UF_PATH */
+ ,{ 33, 12 } /* UF_QUERY */
+ ,{ 46, 4 } /* UF_FRAGMENT */
+ ,{ 7, 3 } /* UF_USERINFO */
}
}
,.rv=0
}
+, {.name="double @"
+ ,.url="http://a:b@@hostname:443/"
+ ,.is_connect=0
+ ,.rv=1
+ }
+
, {.name="proxy empty host"
,.url="http://:443/"
,.is_connect=0
@@ -2055,6 +2247,12 @@ const struct url_test url_tests[] =
,.rv=1
}
+, {.name="CONNECT with basic auth"
+ ,.url="a:b@hostname:443"
+ ,.is_connect=1
+ ,.rv=1
+ }
+
, {.name="CONNECT empty host"
,.url=":443"
,.is_connect=1
@@ -2078,16 +2276,130 @@ const struct url_test url_tests[] =
,.rv=1 /* s_dead */
}
+, {.name="proxy basic auth with space url encoded"
+ ,.url="http://a%20:b@host.com/"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PATH) | (1<<UF_USERINFO)
+ ,.port=0
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 14, 8 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 22, 1 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 7, 6 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
, {.name="carriage return in URL"
,.url="/foo\rbar/"
,.rv=1 /* s_dead */
}
+, {.name="proxy double : in URL"
+ ,.url="http://hostname::443/"
+ ,.rv=1 /* s_dead */
+ }
+
+, {.name="proxy basic auth with double :"
+ ,.url="http://a::b@host.com/"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PATH) | (1<<UF_USERINFO)
+ ,.port=0
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 12, 8 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 20, 1 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 7, 4 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
, {.name="line feed in URL"
,.url="/foo\nbar/"
,.rv=1 /* s_dead */
}
+, {.name="proxy empty basic auth"
+ ,.url="http://@hostname/fo"
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PATH)
+ ,.port=0
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 8, 8 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 16, 3 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+, {.name="proxy line feed in hostname"
+ ,.url="http://host\name/fo"
+ ,.rv=1 /* s_dead */
+ }
+
+, {.name="proxy % in hostname"
+ ,.url="http://host%name/fo"
+ ,.rv=1 /* s_dead */
+ }
+
+, {.name="proxy ; in hostname"
+ ,.url="http://host;ame/fo"
+ ,.rv=1 /* s_dead */
+ }
+
+, {.name="proxy basic auth with unreservedchars"
+ ,.url="http://a!;-_!=+$@host.com/"
+ ,.is_connect=0
+ ,.u=
+ {.field_set= (1<<UF_SCHEMA) | (1<<UF_HOST) | (1<<UF_PATH) | (1<<UF_USERINFO)
+ ,.port=0
+ ,.field_data=
+ {{ 0, 4 } /* UF_SCHEMA */
+ ,{ 17, 8 } /* UF_HOST */
+ ,{ 0, 0 } /* UF_PORT */
+ ,{ 25, 1 } /* UF_PATH */
+ ,{ 0, 0 } /* UF_QUERY */
+ ,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 7, 9 } /* UF_USERINFO */
+ }
+ }
+ ,.rv=0
+ }
+
+, {.name="proxy only empty basic auth"
+ ,.url="http://@/fo"
+ ,.rv=1 /* s_dead */
+ }
+
+, {.name="proxy only basic auth"
+ ,.url="http://toto@/fo"
+ ,.rv=1 /* s_dead */
+ }
+
+, {.name="proxy emtpy hostname"
+ ,.url="http:///fo"
+ ,.rv=1 /* s_dead */
+ }
+
+, {.name="proxy = in URL"
+ ,.url="http://host=ame/fo"
+ ,.rv=1 /* s_dead */
+ }
+
#if HTTP_PARSER_STRICT
, {.name="tab in URL"
@@ -2113,6 +2425,7 @@ const struct url_test url_tests[] =
,{ 0, 9 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
}
}
,.rv=0
@@ -2129,6 +2442,7 @@ const struct url_test url_tests[] =
,{ 0, 9 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
+ ,{ 0, 0 } /* UF_USERINFO */
}
}
,.rv=0
@@ -2139,7 +2453,6 @@ const struct url_test url_tests[] =
void
dump_url (const char *url, const struct http_parser_url *u)
{
- char part[512];
unsigned int i;
printf("\tfield_set: 0x%x, port: %u\n", u->field_set, u->port);
@@ -2149,14 +2462,12 @@ dump_url (const char *url, const struct http_parser_url *u)
continue;
}
- memcpy(part, url + u->field_data[i].off, u->field_data[i].len);
- part[u->field_data[i].len] = '\0';
-
- printf("\tfield_data[%u]: off: %u len: %u part: \"%s\"\n",
+ printf("\tfield_data[%u]: off: %u len: %u part: \"%.*s\n",
i,
u->field_data[i].off,
u->field_data[i].len,
- part);
+ u->field_data[i].len,
+ url + u->field_data[i].off);
}
}
View
44 url_parser.c
@@ -0,0 +1,44 @@
+#include "http_parser.h"
+#include <stdio.h>
+#include <string.h>
+
+void
+dump_url (const char *url, const struct http_parser_url *u)
+{
+ unsigned int i;
+
+ printf("\tfield_set: 0x%x, port: %u\n", u->field_set, u->port);
+ for (i = 0; i < UF_MAX; i++) {
+ if ((u->field_set & (1 << i)) == 0) {
+ printf("\tfield_data[%u]: unset\n", i);
+ continue;
+ }
+
+ printf("\tfield_data[%u]: off: %u len: %u part: \"%.*s\n",
+ i,
+ u->field_data[i].off,
+ u->field_data[i].len,
+ u->field_data[i].len,
+ url + u->field_data[i].off);
+ }
+}
+
+int main(int argc, char ** argv) {
+ if (argc != 3) {
+ printf("Syntax : %s connect|get url\n", argv[0]);
+ return 1;
+ }
+ struct http_parser_url u;
+ int len = strlen(argv[2]);
+ int connect = strcmp("connect", argv[1]) == 0 ? 1 : 0;
+ printf("Parsing %s, connect %d\n", argv[2], connect);
+
+ int result = http_parser_parse_url(argv[2], len, connect, &u);
+ if (result != 0) {
+ printf("Parse error : %d\n", result);
+ return result;
+ }
+ printf("Parse ok, result : \n");
+ dump_url(argv[2], &u);
+ return 0;
+}
Something went wrong with that request. Please try again.