Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix form login #53

Merged
merged 7 commits into from
Apr 1, 2014
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.archive.crawler.selftest;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.mortbay.jetty.Handler;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.bio.SocketConnector;
import org.mortbay.jetty.handler.DefaultHandler;
import org.mortbay.jetty.handler.HandlerList;
import org.mortbay.jetty.handler.ResourceHandler;
import org.mortbay.jetty.servlet.ServletHandler;
import org.mortbay.jetty.servlet.ServletHolder;

/**
 * Self-test for form-based (HTML form POST) authentication.
 *
 * <p>Starts a local Jetty server that serves the test's static documents and
 * maps a {@link FormAuthServlet} under <code>/login/*</code>, splices the
 * form-extraction and form-login beans into the crawl configuration, and
 * afterwards checks which files ended up in the ARCs.
 *
 * @contributor stack
 * @contributor gojomo
 */
public class FormLoginSelfTest extends SelfTestBase {

    /**
     * The exact set of files the ARCs are expected to contain.
     */
    private static final Set<String> EXPECTED =
            Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
                    "index.html",
                    "login/login.html",
                    "success.html",
                    "robots.txt",
                    "favicon.ico")));

    /**
     * Asserts that the files found in the ARCs equal {@link #EXPECTED}.
     */
    @Override
    protected void verify() throws Exception {
        assertEquals("wrong files in ARCs", EXPECTED, filesInArcs());
    }

    /**
     * Builds and starts the test web server on 127.0.0.1:7777.
     *
     * <p>Handlers are placed in a {@link HandlerList} in this order: static
     * resources, the servlet handler, then Jetty's {@link DefaultHandler}.
     */
    @Override
    protected void startHttpServer() throws Exception {
        // static documents come from the test's source htdocs directory
        ResourceHandler staticFiles = new ResourceHandler();
        staticFiles.setResourceBase(getSrcHtdocs().getAbsolutePath());

        ServletHandler servlets = new ServletHandler();

        HandlerList chain = new HandlerList();
        chain.setHandlers(
                new Handler[] { staticFiles, servlets, new DefaultHandler() });

        SocketConnector connector = new SocketConnector();
        connector.setHost("127.0.0.1");
        connector.setPort(7777);

        Server jetty = new Server();
        jetty.addConnector(connector);
        jetty.setHandler(chain);

        // the login servlet is mapped only after the handler chain is installed
        servlets.addServletWithMapping(
                new ServletHolder(new FormAuthServlet()), "/login/*");

        httpServer = jetty;
        httpServer.start();
    }

    /**
     * @return the single seed URL: the index page on the local test server
     */
    protected String getSeedsString() {
        return "http://127.0.0.1:7777/index.html";
    }

    /**
     * Replaces the <code>&lt;!--@@MORE_EXTRACTORS@@--&gt;</code> placeholder in
     * the supplied configuration with the forms extractor and form-login
     * processor bean definitions, then passes the result to the superclass.
     *
     * @param config crawl configuration text
     * @return the configuration as returned by the superclass
     */
    @Override
    protected String changeGlobalConfig(String config) {
        final String extractorBean =
                " <bean id='extractorForms' class='org.archive.modules.forms.ExtractorHTMLForms'/>\n";
        final String loginBean =
                " <bean id='formFiller' class='org.archive.modules.forms.FormLoginProcessor'>\n"
                + " <property name='loginUsername' value='Mr. Happy Pants' />\n"
                + " <property name='loginPassword' value='xyzzy' />\n"
                + " </bean>\n";

        String withFormLogin =
                config.replace("<!--@@MORE_EXTRACTORS@@-->", extractorBean + loginBean);
        return super.changeGlobalConfig(withFormLogin);
    }

}

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<a href="index.html">index</a>

<a href="link1.html">Link 1</a>

<a href="link2.html"> Link 2</a>

<a href="link3.html"> Link 3</a>

<a href="basic/index.html">Secure</a>
5 changes: 5 additions & 0 deletions engine/testdata/selftest/FormLoginSelfTest/htdocs/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<form action="login/login.html" method="post">
<input type="text" name="username"/>
<input type="password" name="password"/>
</form>

7 changes: 7 additions & 0 deletions engine/testdata/selftest/FormLoginSelfTest/htdocs/link1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<a href="index.html">index</a>

<a href="link1.html">Link 1</a>

<a href="link2.html"> Link 2</a>

<a href="link3.html"> Link 3</a>
7 changes: 7 additions & 0 deletions engine/testdata/selftest/FormLoginSelfTest/htdocs/link2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<a href="index.html">index</a>

<a href="link1.html">Link 1</a>

<a href="link2.html"> Link 2</a>

<a href="link3.html"> Link 3</a>
7 changes: 7 additions & 0 deletions engine/testdata/selftest/FormLoginSelfTest/htdocs/link3.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<a href="index.html">index</a>

<a href="link1.html">Link 1</a>

<a href="link2.html"> Link 2</a>

<a href="link3.html"> Link 3</a>
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<a href="index.html">index</a>

1 change: 1 addition & 0 deletions engine/testdata/selftest/conf/selftest-crawler-beans.cxml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ crawlController.pauseAtStart=false
<ref bean="extractorCss"/>
<ref bean="extractorJs"/>
<ref bean="extractorSwf"/>
<!--@@MORE_EXTRACTORS@@-->
<ref bean="arcWriterProcessor"/>
</list>
</property>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.AbstractExecutionAwareRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.config.ConnectionConfig;
import org.apache.http.config.MessageConstraints;
import org.apache.http.config.Registry;
Expand All @@ -79,6 +80,8 @@
import org.apache.http.conn.ssl.AllowAllHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.entity.ContentLengthStrategy;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.DefaultBHttpClientConnection;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
Expand All @@ -93,6 +96,7 @@
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.Args;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.CrawlURI.FetchType;
import org.archive.modules.Processor;
Expand Down Expand Up @@ -175,8 +179,16 @@ public FetchHTTPRequest(FetchHTTP fetcher, CrawlURI curi) throws URIException {
}

if (curi.getFetchType() == FetchType.HTTP_POST) {
this.request = new BasicExecutionAwareEntityEnclosingRequest("POST",
requestLineUri, httpVersion);
BasicExecutionAwareEntityEnclosingRequest postRequest = new BasicExecutionAwareEntityEnclosingRequest(
"POST", requestLineUri, httpVersion);
this.request = postRequest;
String submitData = (String) curi.getData().get(CoreAttributeConstants.A_SUBMIT_DATA);
if (submitData != null) {
// XXX brittle: submit data is always sent as URL-encoded form data; multipart form data etc. is not supported
ContentType contentType = ContentType.create(URLEncodedUtils.CONTENT_TYPE, "UTF-8");
StringEntity formEntity = new StringEntity(submitData, contentType);
postRequest.setEntity(formEntity);
}
} else {
this.request = new BasicExecutionAwareRequest("GET",
requestLineUri, httpVersion);
Expand All @@ -196,7 +208,7 @@ public FetchHTTPRequest(FetchHTTP fetcher, CrawlURI curi) throws URIException {
this.addedCredentials = populateTargetCredential();
populateHttpProxyCredential();
}

protected void configureRequestHeaders() {
if (fetcher.getAcceptCompression()) {
request.addHeader("Accept-Encoding", "gzip,deflate");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,11 @@

import static org.archive.modules.CoreAttributeConstants.A_WARC_RESPONSE_HEADERS;

import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringUtils;
import org.archive.checkpointing.Checkpointable;
Expand All @@ -45,6 +42,10 @@
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;

/**
* A step, post-ExtractorHTMLForms, where a followup CrawlURI to
* attempt a form submission may be synthesized.
Expand Down Expand Up @@ -119,24 +120,24 @@ public class FormLoginProcessor extends Processor implements Checkpointable {
Logger.getLogger(FormLoginProcessor.class.getName());

// formProvince (String) -> count
ConcurrentMap<String, AtomicLong> eligibleFormsSeenCount =
protected LoadingCache<String, AtomicLong> eligibleFormsSeenCount =
CacheBuilder.newBuilder()
.<String, AtomicLong>build(
new CacheLoader<String, AtomicLong>() {
public AtomicLong load(String arg0) {
return new AtomicLong(0L);
}
}).asMap();
});

// formProvince (String) -> count
ConcurrentMap<String, AtomicLong> eligibleFormsAttemptsCount =
protected LoadingCache<String, AtomicLong> eligibleFormsAttemptsCount =
CacheBuilder.newBuilder()
.<String, AtomicLong>build(
new CacheLoader<String, AtomicLong>() {
public AtomicLong load(String arg0) {
return new AtomicLong(0L);
}
}).asMap();
});

/**
* SURT prefix against which configured username/password is
Expand Down Expand Up @@ -219,13 +220,17 @@ protected void innerProcess(CrawlURI curi) {
for( Object formObject : curi.getDataList(ExtractorHTMLForms.A_HTML_FORM_OBJECTS)) {
HTMLForm form = (HTMLForm) formObject;
if(form.seemsLoginForm()) {
eligibleFormsSeenCount.get(formProvince).incrementAndGet();
if(eligibleFormsAttemptsCount.get(formProvince).get()<1) {
eligibleFormsAttemptsCount.get(formProvince).incrementAndGet();
createFormSubmissionAttempt(curi,form,formProvince);
} else {
// note decline-to-submit: in volume, may be signal of failed first login
curi.getAnnotations().add("nosubmit:"+submitStatusFor(formProvince));
try {
eligibleFormsSeenCount.get(formProvince).incrementAndGet();
if(eligibleFormsAttemptsCount.get(formProvince).get()<1) {
eligibleFormsAttemptsCount.get(formProvince).incrementAndGet();
createFormSubmissionAttempt(curi,form,formProvince);
} else {
// note decline-to-submit: in volume, may be signal of failed first login
curi.getAnnotations().add("nosubmit:"+submitStatusFor(formProvince));
}
} catch (ExecutionException e) {
throw new RuntimeException(e); // can't happen?
}
return;
}
Expand Down Expand Up @@ -260,10 +265,9 @@ protected void createFormSubmissionAttempt(CrawlURI curi, HTMLForm templateForm,
submitCuri.setFetchType(FetchType.HTTP_POST);
submitCuri.getData().put(
CoreAttributeConstants.A_SUBMIT_DATA,
templateForm.asHttpClientDataWith(
templateForm.asFormDataString(
getLoginUsername(),
getLoginPassword()));
//submitCuri.setSchedulingDirective(Math.max(curi.getSchedulingDirective()-1, 0));
submitCuri.setSchedulingDirective(SchedulingConstants.HIGH);
submitCuri.setForceFetch(true);
curi.getOutCandidates().add(submitCuri);
Expand All @@ -278,9 +282,13 @@ protected String warcHeaderFor(String formProvince) {
}

protected String submitStatusFor(String formProvince) {
return eligibleFormsAttemptsCount.get(formProvince).get()
+","+eligibleFormsSeenCount.get(formProvince).get()
+","+formProvince;
try {
return eligibleFormsAttemptsCount.get(formProvince).get()
+","+eligibleFormsSeenCount.get(formProvince).get()
+","+formProvince;
} catch (ExecutionException e) {
throw new RuntimeException(e);
}
}

@Override
Expand All @@ -295,10 +303,10 @@ protected JSONObject toCheckpointJson() throws JSONException {
protected void fromCheckpointJson(JSONObject json) throws JSONException {
super.fromCheckpointJson(json);
JSONUtils.putAllAtomicLongs(
eligibleFormsAttemptsCount,
eligibleFormsAttemptsCount.asMap(),
json.getJSONObject("eligibleFormsAttemptsCount"));
JSONUtils.putAllAtomicLongs(
eligibleFormsSeenCount,
eligibleFormsSeenCount.asMap(),
json.getJSONObject("eligibleFormsSeenCount"));
}
}
19 changes: 19 additions & 0 deletions modules/src/main/java/org/archive/modules/forms/HTMLForm.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@
package org.archive.modules.forms;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.lang.StringUtils;
import org.archive.util.TextUtils;

/**
* Simple representation of a discovered HTML Form.
Expand Down Expand Up @@ -104,6 +106,7 @@ public boolean seemsLoginForm() {
* @param username
* @param password
* @return
* @deprecated Specific to a particular FetchHTTP implementation based on commons-httpclient; use {@link #asFormDataString(String, String)} instead.
*/
public NameValuePair[] asHttpClientDataWith(String username, String password) {
ArrayList<NameValuePair> data = new ArrayList<NameValuePair>(allInputs.size());
Expand All @@ -119,6 +122,22 @@ public NameValuePair[] asHttpClientDataWith(String username, String password) {
}
return data.toArray(new NameValuePair[data.size()]);
}

/**
 * Renders this form's inputs as a single <code>name=value&amp;name=value</code>
 * string, with names and values passed through {@link TextUtils#urlEscape}.
 *
 * <p>The first candidate username input receives <code>username</code> and the
 * first candidate password input receives <code>password</code>. Any other
 * input is included only when both its name and its value are non-empty.
 *
 * @param username value to submit in the first candidate username input
 * @param password value to submit in the first candidate password input
 * @return the escaped pairs joined with <code>&amp;</code>
 */
public String asFormDataString(String username, String password) {
    List<String> pairs = new LinkedList<String>();

    for (FormInput field : allInputs) {
        String submitted;
        if (field == candidateUsernameInputs.get(0)) {
            submitted = username;
        } else if (field == candidatePasswordInputs.get(0)) {
            submitted = password;
        } else if (StringUtils.isNotEmpty(field.name) && StringUtils.isNotEmpty(field.value)) {
            submitted = field.value;
        } else {
            // nameless or valueless non-credential inputs are left out
            continue;
        }
        pairs.add(TextUtils.urlEscape(field.name) + "=" + TextUtils.urlEscape(submitted));
    }

    return StringUtils.join(pairs, '&');
}

public String toString() {
StringBuilder sb = new StringBuilder();
Expand Down
Loading