Skip to content

Commit

Permalink
Merge branch 'master' into form-login-extra-inputs
Browse files Browse the repository at this point in the history
* also fix new FormLoginProcessorTest.testFormLoginExtraInputs() in context of form-login-multipart changes

* master:
  fix NullPointerException when using old HtmlFormCredential login
  to make this branch only about multipart/form-data, undo the changes to handle more complicated forms with extra input fields; add unit test; store CrawlURI submit data in http-library agnostic way and remove httpcomponents-specific code from HTMLForm, refactor creation of post data accordingly
  Be sure to return at least one of the candidates
  Remove unneeded constant
  Make sure to check form encoding type when adding form to curi data
  More changes for https://webarchive.jira.com/browse/ARI-4656
  More changes
  Save enctype of form in submission curi
  Add ability to submit forms with enctype="multipart/form-data"
  If multiple username inputs in login form, favor the one with the word 'login' in it.
  • Loading branch information
nlevitt committed Jan 15, 2016
2 parents c1abbfa + 02e3a55 commit c51c3df
Show file tree
Hide file tree
Showing 10 changed files with 241 additions and 52 deletions.
4 changes: 4 additions & 0 deletions commons/pom.xml
Expand Up @@ -27,6 +27,10 @@
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
<dependency>
<groupId>com.sleepycat</groupId>
<artifactId>je</artifactId>
Expand Down
Expand Up @@ -118,10 +118,11 @@ public interface CoreAttributeConstants {
public static final String A_HTTP_RESPONSE_HEADERS = "http-response-headers";

public static final String A_HTTP_AUTH_CHALLENGES = "http-auth-challenges";
// FORMS support - a persistent member (survives frontier enqueue/dequeue/retries)

// FORMS support - persistent members (survive frontier enqueue/dequeue/retries)
// key for the form fields to submit; FetchHTTPRequest reads the value as a List of HTMLForm.NameValue
public static final String A_SUBMIT_DATA = "submit-data";

// key for the enctype of the form being submitted; when no value is stored,
// FetchHTTPRequest falls back to application/x-www-form-urlencoded
public static final String A_SUBMIT_ENCTYPE = "submit-enctype";

// arbitrary additions to WARC response record headers
public static final String A_WARC_RESPONSE_HEADERS = "warc-response-headers";

Expand Down
3 changes: 2 additions & 1 deletion modules/src/main/java/org/archive/modules/CrawlURI.java
Expand Up @@ -32,6 +32,7 @@
import static org.archive.modules.CoreAttributeConstants.A_PREREQUISITE_URI;
import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;
import static org.archive.modules.CoreAttributeConstants.A_SUBMIT_DATA;
import static org.archive.modules.CoreAttributeConstants.A_SUBMIT_ENCTYPE;
import static org.archive.modules.CoreAttributeConstants.A_WARC_RESPONSE_HEADERS;
import static org.archive.modules.SchedulingConstants.NORMAL;
import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_CUSTOM_PROCESSOR;
Expand Down Expand Up @@ -256,7 +257,7 @@ public static enum FetchType { HTTP_GET, HTTP_POST, UNKNOWN };
*/
private static final Collection<String> persistentKeys
= new CopyOnWriteArrayList<String>(
new String [] {A_CREDENTIALS_KEY, A_HTTP_AUTH_CHALLENGES, A_SUBMIT_DATA, A_WARC_RESPONSE_HEADERS, A_ANNOTATIONS});
new String [] {A_CREDENTIALS_KEY, A_HTTP_AUTH_CHALLENGES, A_SUBMIT_DATA, A_WARC_RESPONSE_HEADERS, A_ANNOTATIONS, A_SUBMIT_ENCTYPE});

/** maximum length for pathFromSeed/hopsPath; longer truncated with leading counter **/
private static final int MAX_HOPS_DISPLAYED = 50;
Expand Down
Expand Up @@ -25,6 +25,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.Socket;
import java.net.UnknownHostException;
Expand All @@ -34,6 +35,7 @@
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
Expand All @@ -45,6 +47,7 @@

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpException;
import org.apache.http.HttpHeaders;
Expand All @@ -65,7 +68,6 @@
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.AbstractExecutionAwareRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.config.ConnectionConfig;
import org.apache.http.config.MessageConstraints;
import org.apache.http.config.Registry;
Expand All @@ -82,7 +84,8 @@
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.entity.ContentLengthStrategy;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.impl.DefaultBHttpClientConnection;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
Expand All @@ -105,6 +108,7 @@
import org.archive.modules.credential.HtmlFormCredential;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.extractor.LinkContext;
import org.archive.modules.forms.HTMLForm.NameValue;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.modules.net.ServerCache;
Expand All @@ -113,7 +117,7 @@
/**
* @contributor nlevitt
*/
class FetchHTTPRequest {
public class FetchHTTPRequest {

/**
* Implementation of {@link DnsResolver} that uses the server cache which is
Expand Down Expand Up @@ -183,12 +187,9 @@ public FetchHTTPRequest(FetchHTTP fetcher, CrawlURI curi) throws URIException {
BasicExecutionAwareEntityEnclosingRequest postRequest = new BasicExecutionAwareEntityEnclosingRequest(
"POST", requestLineUri, httpVersion);
this.request = postRequest;
String submitData = (String) curi.getData().get(CoreAttributeConstants.A_SUBMIT_DATA);
if (submitData != null) {
// XXX brittle, doesn't support multipart form data etc
ContentType contentType = ContentType.create(URLEncodedUtils.CONTENT_TYPE, "UTF-8");
StringEntity formEntity = new StringEntity(submitData, contentType);
postRequest.setEntity(formEntity);
if (curi.containsDataKey(CoreAttributeConstants.A_SUBMIT_DATA)) {
HttpEntity entity = buildPostRequestEntity(curi);
postRequest.setEntity(entity);
}
} else {
this.request = new BasicExecutionAwareRequest("GET",
Expand All @@ -210,6 +211,79 @@ public FetchHTTPRequest(FetchHTTP fetcher, CrawlURI curi) throws URIException {
populateHttpProxyCredential();
}

/**
 * Converts {@code str} to pure ASCII: every code point above U+007F is
 * replaced by its decimal HTML numeric character reference (for example
 * {@code &#12345;}), while ASCII code points are copied through unchanged.
 *
 * <p>
 * The goal is a multipart/form-data submission that any server should be
 * able to handle. The approach is based on experiments with a modern
 * browser (chromium 47.0.2526.106 for mac). What chromium posts depends on
 * what it considers the character encoding of the form page, and maybe on
 * other factors; simulating all of that in heritrix would be too
 * complicated.
 *
 * <p>
 * So this does approximately what the browser does when the form page is
 * plain ascii: characters outside of the latin1/cp1252 range are
 * html-escaped. The one difference from chromium is the U+0080-U+00FF
 * range, which chromium sends encoded as latin1/cp1252 and which is
 * html-escaped here as well. As a result the http post is plain ascii and
 * should work regardless of which encoding the server expects.
 *
 * <p>
 * N.b. chromium does not indicate the encoding of the request in any way
 * (no charset in the content-type or anything like that), and when it
 * considers the form page to be utf-8 it submits in utf-8. That is part of
 * the complicated behavior deliberately not simulated here.
 *
 * @param str text to escape; must not be null
 * @return the escaped, ascii-only copy of {@code str}
 */
public static String escapeForMultipart(String str) {
    StringBuilder escaped = new StringBuilder();
    int pos = 0;
    while (pos < str.length()) {
        // work on whole code points so surrogate pairs yield one reference
        int cp = str.codePointAt(pos);
        pos += Character.charCount(cp);
        if (cp > 0x7f) {
            escaped.append("&#").append(cp).append(';');
        } else {
            escaped.appendCodePoint(cp);
        }
    }
    return escaped.toString();
}

/**
 * Builds the body of a form-submission POST from the data stored on
 * {@code curi}.
 *
 * <p>
 * The form fields are read from {@link CoreAttributeConstants#A_SUBMIT_DATA}
 * and the encoding type from {@link CoreAttributeConstants#A_SUBMIT_ENCTYPE}.
 * A missing or blank enctype is treated as
 * application/x-www-form-urlencoded. The enctype is matched ignoring case
 * and surrounding whitespace, because the keyword comes from an html
 * attribute whose values are case-insensitive.
 *
 * <p>
 * Url-encoded bodies are encoded as UTF-8. Multipart bodies are built in
 * browser-compatible mode with every name and value passed through
 * {@link #escapeForMultipart(String)}.
 *
 * @param curi uri being fetched; the caller is expected to have checked
 *        that it carries A_SUBMIT_DATA
 * @return the entity to attach to the POST request
 * @throws IllegalStateException if the enctype is neither
 *         application/x-www-form-urlencoded nor multipart/form-data
 */
protected HttpEntity buildPostRequestEntity(CrawlURI curi) {
    String enctype = (String) curi.getData().get(CoreAttributeConstants.A_SUBMIT_ENCTYPE);
    if (enctype == null || enctype.trim().isEmpty()) {
        enctype = ContentType.APPLICATION_FORM_URLENCODED.getMimeType();
    } else {
        enctype = enctype.trim();
    }

    @SuppressWarnings("unchecked")
    List<NameValue> submitData = (List<NameValue>) curi.getData().get(CoreAttributeConstants.A_SUBMIT_DATA);

    if (ContentType.APPLICATION_FORM_URLENCODED.getMimeType().equalsIgnoreCase(enctype)) {
        List<NameValuePair> nvps = new ArrayList<NameValuePair>(submitData.size());
        for (NameValue nv : submitData) {
            nvps.add(new BasicNameValuePair(nv.name, nv.value));
        }
        try {
            return new UrlEncodedFormEntity(nvps, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is always available, so this indicates a broken runtime
            throw new IllegalStateException(e);
        }
    } else if (ContentType.MULTIPART_FORM_DATA.getMimeType().equalsIgnoreCase(enctype)) {
        MultipartEntityBuilder entityBuilder = MultipartEntityBuilder.create();
        entityBuilder.setMode(HttpMultipartMode.BROWSER_COMPATIBLE);
        for (NameValue nv : submitData) {
            // keep the posted body plain ascii, see escapeForMultipart
            entityBuilder.addTextBody(escapeForMultipart(nv.name),
                    escapeForMultipart(nv.value));
        }
        return entityBuilder.build();
    } else {
        throw new IllegalStateException("unsupported form submission enctype='" + enctype + "'");
    }
}

protected void configureRequestHeaders() {
if (fetcher.getAcceptCompression()) {
request.addHeader("Accept-Encoding", "gzip,deflate");
Expand Down
Expand Up @@ -145,9 +145,11 @@ protected void analyze(CrawlURI curi, CharSequence cs) {
CharSequence relevantSequence = cs.subSequence(offsetInt, cs.length());
String method = findAttributeValueGroup("(?i)^[^>]*\\smethod\\s*=\\s*([^>\\s]+)[^>]*>",1,relevantSequence);
String action = findAttributeValueGroup("(?i)^[^>]*\\saction\\s*=\\s*([^>\\s]+)[^>]*>",1,relevantSequence);
String enctype = findAttributeValueGroup("(?i)^[^>]*\\senctype\\s*=\\s*([^>\\s]+)[^>]*>",1,relevantSequence);
HTMLForm form = new HTMLForm();
form.setMethod(method);
form.setAction(action);
form.setAction(action);
form.setEnctype(enctype);
for(CharSequence input : findGroups("(?i)(<input\\s[^>]*>)|(</?form>)",1,relevantSequence)) {
String type = findAttributeValueGroup("(?i)^[^>]*\\stype\\s*=\\s*([^>\\s]+)[^>]*>",1,input);
String name = findAttributeValueGroup("(?i)^[^>]*\\sname\\s*=\\s*([^>\\s]+)[^>]*>",1,input);
Expand Down
Expand Up @@ -266,10 +266,12 @@ protected void createFormSubmissionAttempt(CrawlURI curi, HTMLForm templateForm,
CrawlURI submitCuri = curi.createCrawlURI(submitUrl, lc, Hop.SUBMIT);
submitCuri.setFetchType(FetchType.HTTP_POST);
submitCuri.getData().put(
CoreAttributeConstants.A_SUBMIT_DATA,
templateForm.asFormDataString(
CoreAttributeConstants.A_SUBMIT_DATA,
templateForm.formData(
getLoginUsername(),
getLoginPassword()));
submitCuri.getData().put(CoreAttributeConstants.A_SUBMIT_ENCTYPE,
templateForm.getEnctype());
submitCuri.setSchedulingDirective(SchedulingConstants.HIGH);
submitCuri.setForceFetch(true);
curi.getOutLinks().add(submitCuri);
Expand Down
50 changes: 32 additions & 18 deletions modules/src/main/java/org/archive/modules/forms/HTMLForm.java
Expand Up @@ -24,7 +24,6 @@
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.archive.util.TextUtils;

/**
* Simple representation of a discovered HTML Form.
Expand All @@ -46,13 +45,14 @@ public String toString() {
return str;
}
}

String method;
String action;

List<FormInput> allInputs = new ArrayList<FormInput>();
List<FormInput> candidateUsernameInputs = new ArrayList<FormInput>();
List<FormInput> candidatePasswordInputs = new ArrayList<FormInput>();

protected String method;
protected String action;
protected String enctype;

protected List<FormInput> allInputs = new ArrayList<FormInput>();
protected List<FormInput> candidateUsernameInputs = new ArrayList<FormInput>();
protected List<FormInput> candidatePasswordInputs = new ArrayList<FormInput>();

/**
* Add a discovered INPUT, tracking it as potential
Expand Down Expand Up @@ -99,11 +99,19 @@ public void setMethod(String method) {
/**
 * @return the action value last passed to {@link #setAction(String)}, or
 *         null if none has been set
 */
public String getAction() {
return action;
}

/**
 * Records the form's action value.
 *
 * @param action value to store; stored as given, without validation
 */
public void setAction(String action) {
this.action = action;
}

/**
 * @return the enctype value last passed to {@link #setEnctype(String)}, or
 *         null if none has been set
 */
public String getEnctype() {
return enctype;
}

/**
 * Records the form's enctype value.
 *
 * @param enctype value to store; stored as given, without validation
 */
public void setEnctype(String enctype) {
this.enctype = enctype;
}

/**
* For now, we consider a POST form with only 1 password
* field and 1 potential username field (type text or email)
Expand All @@ -117,6 +125,7 @@ public boolean seemsLoginForm() {
&& presumedUsernameInput() != null;
}

protected FormInput presumedUsernameInput() {
if (candidateUsernameInputs.size() < 1) {
return null;
Expand All @@ -139,24 +148,29 @@ protected FormInput presumedUsernameInput() {
}
}

public String asFormDataString(String username, String password) {
List<String> nameVals = new LinkedList<String>();
/**
 * Simple holder pairing the name of a form field with the value to submit
 * for it. Both members are public and may be null.
 */
public static class NameValue {
    public String name;
    public String value;

    /**
     * @param name field name, stored as given
     * @param value field value, stored as given
     */
    public NameValue(String name, String value) {
        this.value = value;
        this.name = name;
    }
}

/**
 * Builds the list of name/value pairs to submit for this form, in the
 * order the inputs were added.
 *
 * <p>
 * The presumed username input is given {@code username} and the first
 * candidate password input is given {@code password}. Any other input is
 * included with its own value only when both its name and value are
 * non-empty; radio and checkbox inputs are additionally included only when
 * checked. Input types are compared ignoring case.
 *
 * @param username value to submit for the presumed username input
 * @param password value to submit for the first candidate password input
 * @return the pairs to submit; never null
 */
public LinkedList<NameValue> formData(String username, String password) {
    // Resolve the two special inputs once rather than on every iteration.
    // NOTE(review): assumes presumedUsernameInput() has no side effects;
    // its body is only partly visible here.
    FormInput usernameInput = presumedUsernameInput();
    // a form without any password candidate simply has no password pair
    FormInput passwordInput = candidatePasswordInputs.isEmpty()
            ? null
            : candidatePasswordInputs.get(0);

    LinkedList<NameValue> nameVals = new LinkedList<NameValue>();
    for (FormInput input : allInputs) {
        if (input == usernameInput) {
            nameVals.add(new NameValue(input.name, username));
        } else if (input == passwordInput) {
            nameVals.add(new NameValue(input.name, password));
        } else if (StringUtils.isNotEmpty(input.name)
                && StringUtils.isNotEmpty(input.value)) {
            // radio buttons and checkboxes only count when checked
            boolean toggle = "radio".equalsIgnoreCase(input.type)
                    || "checkbox".equalsIgnoreCase(input.type);
            if (!toggle || input.checked) {
                nameVals.add(new NameValue(input.name, input.value));
            }
        }
    }

    return nameVals;
}

public String toString() {
Expand Down
Expand Up @@ -53,10 +53,7 @@

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.apache.http.NameValuePair;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicNameValuePair;
import org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlMetadata;
Expand All @@ -65,6 +62,7 @@
import org.archive.modules.ProcessorTestBase;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.deciderules.RejectDecideRule;
import org.archive.modules.forms.HTMLForm.NameValue;
import org.archive.modules.recrawl.FetchHistoryProcessor;
import org.archive.modules.revisit.ServerNotModifiedRevisit;
import org.archive.net.UURI;
Expand Down Expand Up @@ -856,16 +854,14 @@ public void testHttpPost() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
curi.setFetchType(FetchType.HTTP_POST);

List<NameValuePair> params = new LinkedList<NameValuePair>();
params.add(new BasicNameValuePair("name1", "value1"));
params.add(new BasicNameValuePair("name1", "value2"));
params.add(new BasicNameValuePair("funky name 2", "whoa crazy\t && 🍺 🍻 \n crazier \rooo"));
String submitData = URLEncodedUtils.format(params, "UTF-8");
assertEquals("name1=value1&name1=value2&funky+name+2=whoa+crazy%09+%26%26+%F0%9F%8D%BA+%F0%9F%8D%BB+%0A+crazier+%0Dooo", submitData);
List<NameValue> params = new LinkedList<NameValue>();
params.add(new NameValue("name1", "value1"));
params.add(new NameValue("name1", "value2"));
params.add(new NameValue("funky name 2", "whoa crazy\t && 🍺 🍻 \n crazier \rooo"));
curi.getData().put(CoreAttributeConstants.A_SUBMIT_DATA, params);

curi.getData().put(CoreAttributeConstants.A_SUBMIT_DATA, submitData);
fetcher().process(curi);

assertTrue(httpRequestString(curi).startsWith("POST / HTTP/1.0\r\n"));
assertTrue(httpRequestString(curi).endsWith("\r\n\r\nname1=value1&name1=value2&funky+name+2=whoa+crazy%09+%26%26+%F0%9F%8D%BA+%F0%9F%8D%BB+%0A+crazier+%0Dooo"));
assertEquals(FetchType.HTTP_POST, curi.getFetchType());
Expand Down

0 comments on commit c51c3df

Please sign in to comment.