Permalink
Browse files

Use HttpPageReader for better charset handling.

  • Loading branch information...
1 parent 586bb7a commit 5f95c55876b46becba4e49852cd6012e917b55d9 Herb Jiang committed Jun 20, 2012
View
@@ -11,6 +11,11 @@
<classpathentry kind="lib" path="lib/slf4j-log4j12-1.6.1.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.16.jar"/>
<classpathentry kind="lib" path="lib/json-20080701.jar"/>
+ <classpathentry kind="lib" path="lib/commons-io-1.4.jar"/>
+ <classpathentry kind="lib" path="lib/commons-lang-2.4.jar"/>
+ <classpathentry kind="lib" path="lib/httpclient-4.1.3.jar"/>
+ <classpathentry kind="lib" path="lib/httpcore-4.1.4.jar"/>
+ <classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="output" path="bin"/>
</classpath>
View
Binary file not shown.
View
Binary file not shown.
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
@@ -0,0 +1,96 @@
+package de.jetwick.snacktory;
+/******************************************************************************
+ * Copyright (c) 2010 Basis Technology Corp.
+ *
+ * Basis Technology Corp. licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.io.IOUtils;
+
+/**
+ * Base class for page readers: converts the raw bytes of a response into text.
+ * <p>
+ * The character set is chosen from, in order: the encoding declared by the server
+ * (passed to {@link #readContent} as {@code forceEncoding}), the optional
+ * {@link PageCharsetDetector}, and finally UTF-8. {@code readContent} records the
+ * outcome of the most recent call in instance fields without any synchronization.
+ */
+public class AbstractPageReader {
+    static final Logger LOG = LoggerFactory.getLogger(AbstractPageReader.class);
+    static final Charset UTF8 = Charset.forName("utf-8");
+
+    private PageCharsetDetector charsetDetector;
+    private Charset charset;
+    private boolean serverReturnedEncoding;
+    private boolean respectServerEncoding;
+    private String detectedEncoding;
+
+    /**
+     * Reads the whole stream and decodes it to a String.
+     *
+     * @param response      stream holding the page bytes; it is read fully but not closed here
+     * @param forceEncoding charset name declared by the server, or {@code null} if none
+     * @return the decoded page content
+     * @throws IOException if reading the stream fails
+     */
+    protected String readContent(InputStream response, String forceEncoding) throws IOException {
+        byte[] bytes = IOUtils.toByteArray(response);
+
+        // Reset the per-read state so the getters never report values from an earlier page.
+        charset = null;
+        detectedEncoding = null;
+        serverReturnedEncoding = forceEncoding != null;
+
+        String hint = null;
+        if (forceEncoding != null) {
+            try {
+                charset = Charset.forName(forceEncoding);
+                hint = charset.name();
+            } catch (IllegalArgumentException e) {
+                // Illegal or unsupported name: fall through to detection / UTF-8.
+                LOG.warn("Server declared character set " + forceEncoding + " not supported");
+            }
+        }
+
+        // Run the detector when the server value may be overridden, or when there is none usable.
+        if (charsetDetector != null && (!respectServerEncoding || charset == null)) {
+            String charsetName = charsetDetector.detect(bytes, hint);
+            if (charsetName != null) {
+                try {
+                    charset = Charset.forName(charsetName);
+                    detectedEncoding = charset.name();
+                } catch (IllegalArgumentException e) {
+                    LOG.warn("Detected character set " + charsetName + " not supported");
+                }
+            }
+        }
+
+        if (charset == null) {
+            LOG.warn("Defaulting to utf-8");
+            charset = UTF8;
+        }
+        return new String(bytes, charset);
+    }
+
+    /** @return the detector consulted by {@link #readContent}, or {@code null} if none is set */
+    public PageCharsetDetector getCharsetDetector() {
+        return charsetDetector;
+    }
+
+    /** @param charsetDetector detector to consult when decoding; {@code null} disables detection */
+    public void setCharsetDetector(PageCharsetDetector charsetDetector) {
+        this.charsetDetector = charsetDetector;
+    }
+
+    /** @return the charset used by the most recent {@link #readContent} call, or {@code null} before any read */
+    public Charset getCharset() {
+        return charset;
+    }
+
+    /** @return whether the most recent {@link #readContent} call was given a server-declared encoding */
+    public boolean isServerReturnedEncoding() {
+        return serverReturnedEncoding;
+    }
+
+    /** @param respectServerEncoding if {@code true}, a usable server-declared charset is not overridden by the detector */
+    public void setRespectServerEncoding(boolean respectServerEncoding) {
+        this.respectServerEncoding = respectServerEncoding;
+    }
+
+    /** @return whether a usable server-declared charset takes precedence over detection */
+    public boolean isRespectServerEncoding() {
+        return respectServerEncoding;
+    }
+
+    /** @return the charset name accepted from the detector during the most recent read, or {@code null} if none */
+    public String getDetectedEncoding() {
+        return detectedEncoding;
+    }
+
+}
@@ -44,7 +44,7 @@
private static final Pattern NEGATIVE_STYLE = Pattern.compile("hidden|display: ?none");
private static final Pattern IGNORE_IMAGE_PATTERN =
- Pattern.compile("ico(/|n|\\.)|spacer|blank|zoom");
+ Pattern.compile("(?i)ico(/|n|\\.)|spacer|blank|zoom|logo|temp");
private static final String IMAGE_CAPTION = "caption";
private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
@@ -53,7 +53,7 @@
add("facebook");
}
};
- private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
+ private static final OutputFormatter DEFAULT_FORMATTER = new MyOutputFormatter();
/**
* @param html extracts article text from given html string.
@@ -105,7 +105,7 @@ public JResult extractContent(JResult res, Document doc, OutputFormatter formatt
if (bestMatchElement != null) {
Element imgEl = determineImageSource(bestMatchElement);
if (imgEl != null) {
- res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
+ res.setImageUrl(SHelper.replaceSpaces(SHelper.getSrcOrRelFromImageElement(imgEl)));
// TODO remove parent container of image if it is contained in bestMatchElement
// to avoid image subtitles flooding in
}
@@ -122,14 +122,14 @@ public JResult extractContent(JResult res, Document doc, OutputFormatter formatt
if (res.getImageUrl().isEmpty()) {
res.setImageUrl(extractImageUrl(doc));
-
- if(res.getImageUrl().isEmpty()){
- Element imgEl = determineImageSource(doc.select("body").first());
- if (imgEl != null) {
- res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
- }
- }
+ if(res.getImageUrl().isEmpty()){
+ Element imgEl = determineImageSource((bestMatchElement != null && bestMatchElement.parent() != null) ? bestMatchElement.parent() : doc.select("body").first());
+
+ if (imgEl != null) {
+ res.setImageUrl(SHelper.replaceSpaces(SHelper.getSrcOrRelFromImageElement(imgEl)));
+ }
+ }
}
res.setRssUrl(extractRssUrl(doc));
@@ -353,7 +353,7 @@ public Element determineImageSource(Element el) {
double score = 1;
for (Element e : els) {
- String sourceUrl = e.attr("src");
+ String sourceUrl = SHelper.getSrcOrRelFromImageElement(e);
if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
continue;
@@ -365,17 +365,21 @@ public Element determineImageSource(Element el) {
try {
int height = Integer.parseInt(e.attr("height"));
- if (height > 50)
- weight += 20;
+ if (height > 50) {
+ if (height > 150) weight += 41;
+ else weight += 20;
+ }
else if (height < 50)
weight -= 20;
} catch (Exception ex) {
}
try {
int width = Integer.parseInt(e.attr("width"));
- if (width > 50)
- weight += 20;
+ if (width > 50) {
+ if (width > 150) weight += 41;
+ else weight += 20;
+ }
else if (width < 50)
weight -= 20;
} catch (Exception ex) {
@@ -406,6 +410,8 @@ else if (width < 50)
}
private boolean isIconImage(String imageUrl) {
+ if ("http://pg.udn.com/2010/images/udnlogo.png".equals(imageUrl))
+ return true;
return IGNORE_IMAGE_PATTERN.matcher(imageUrl).find();
}
@@ -0,0 +1,111 @@
+package de.jetwick.snacktory;
+/******************************************************************************
+ * Copyright (c) 2010 Basis Technology Corp.
+ *
+ * Basis Technology Corp. licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.params.BasicHttpParams;
+import org.apache.http.params.HttpConnectionParams;
+import org.apache.http.params.HttpParams;
+import org.apache.http.protocol.BasicHttpContext;
+import org.apache.http.protocol.HttpContext;
+import org.apache.http.util.EntityUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Warning, not thread safe!
+ */
+/**
+ * An {@link HttpPageReader} that can be handed an already fetched page, which it
+ * returns instead of downloading the same URL again.
+ * <p>
+ * Warning, not thread safe!
+ */
+public class HijackableHttpPageReader extends HttpPageReader implements PageReader {
+    static final Logger LOG = LoggerFactory.getLogger(HijackableHttpPageReader.class);
+
+    /** Socket timeout (SO_TIMEOUT) in milliseconds: the maximum wait for data. */
+    private static final int SOCKET_TIMEOUT_MS = 10000;
+
+    String url = null;
+    String pageHtml = null;
+
+    /**
+     * Registers content that was fetched elsewhere so the next {@link #readPage}
+     * call for the same URL can return it without a network request.
+     *
+     * @param url      the URL the content belongs to
+     * @param pageHtml the page content for that URL
+     */
+    public void setPreFetchedPage(String url, String pageHtml) {
+        this.url = url;
+        this.pageHtml = pageHtml;
+    }
+
+    /** {@inheritDoc}*/
+    @SuppressWarnings("deprecation")
+    @Override
+    public String readPage(String url) throws PageReadException {
+        if (url.equals(this.url) && StringUtils.isNotEmpty(pageHtml)) {
+            LOG.info("Use already fetched content of " + url);
+            return pageHtml;
+        }
+
+        LOG.info("Reading " + url);
+        this.url = url;
+        this.pageHtml = null;
+
+        HttpParams httpParameters = new BasicHttpParams();
+        // Set the timeout in milliseconds until a connection is established.
+        HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutConnection);
+        // Set the default socket timeout (SO_TIMEOUT)
+        // in milliseconds which is the timeout for waiting for data.
+        HttpConnectionParams.setSoTimeout(httpParameters, SOCKET_TIMEOUT_MS);
+
+        DefaultHttpClient httpclient = new DefaultHttpClient(httpParameters);
+        try {
+            HttpContext localContext = new BasicHttpContext();
+            HttpGet get = new HttpGet(url);
+            get.setHeader("User-Agent", userAgent);
+
+            HttpResponse httpResponse = httpclient.execute(get, localContext);
+            int resp = httpResponse.getStatusLine().getStatusCode();
+            if (HttpStatus.SC_OK != resp) {
+                LOG.error("Download failed of " + url + " status " + resp + " " + httpResponse.getStatusLine().getReasonPhrase());
+                return null;
+            }
+            String respCharset = EntityUtils.getContentCharSet(httpResponse.getEntity());
+
+            pageHtml = readContent(httpResponse.getEntity().getContent(), respCharset);
+            return pageHtml;
+        } catch (IOException e) {
+            LOG.error("Download failed of " + url, e);
+            throw new PageReadException("Failed to read " + url, e);
+        } finally {
+            // The client is created per call, so release its connections on every path;
+            // shutting down the connection manager also discards any unread response content.
+            httpclient.getConnectionManager().shutdown();
+        }
+    }
+
+}
@@ -81,6 +81,7 @@ public static void main(String[] args) throws Exception {
private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
private String charset = "UTF-8";
private SCache cache;
+ private HttpPageReader pageReader = null;
private boolean enableCompress = false;
private AtomicInteger cacheCounter = new AtomicInteger(0);
private int maxTextLength = -1;
@@ -176,7 +177,17 @@ public String getReferrer() {
return referrer;
}
- public HtmlFetcher setReferrer(String referrer) {
+    /**
+     * Returns the page reader used for fetching, creating a
+     * {@link HijackableHttpPageReader} on first use when none has been set.
+     */
+    public HttpPageReader getHttpPageReader() {
+        if (pageReader != null) {
+            return pageReader;
+        }
+        pageReader = new HijackableHttpPageReader();
+        return pageReader;
+    }
+
+    /**
+     * Replaces the page reader used for fetching.
+     *
+     * @param reader the reader to use from now on
+     */
+    public void setHttpPageReader(HttpPageReader reader) {
+        this.pageReader = reader;
+    }
+
+ public HtmlFetcher setReferrer(String referrer) {
this.referrer = referrer;
return this;
}
@@ -208,7 +219,7 @@ public boolean isEnableCompress() {
public void setEnableCompress(boolean enableCompress) {
this.enableCompress = enableCompress;
}
-
+
public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception {
String originalUrl = url;
url = SHelper.removeHashbang(url);
@@ -317,8 +328,22 @@ public String fetchAsString(String urlAsString, int timeout)
public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions)
throws MalformedURLException, IOException {
- HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
- hConn.setInstanceFollowRedirects(true);
+ /*HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
+ // hConn.setInstanceFollowRedirects(true);
+
+ hConn.setInstanceFollowRedirects(false);
+ hConn.connect();
+ final int responseCode = hConn.getResponseCode();
+
+ if (responseCode == 301) {
+ java.lang.System.out.println("Got redirection:" + responseCode);
+ final String location = hConn.getHeaderField("Location");
+
+ if (location != null && !location.isEmpty()) {
+ urlAsString = location;
+ hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
+ }
+ }
InputStream is = null;
@@ -338,8 +363,19 @@ public String fetchAsString(String urlAsString, int timeout, boolean includeSome
String enc = Converter.extractEncoding(hConn.getContentType());
String res = createConverter(urlAsString).streamToString(is, enc);
if (logger.isDebugEnabled())
- logger.debug(res.length() + " FetchAsString:" + urlAsString);
- return res;
+ logger.debug(res.length() + " FetchAsString:" + urlAsString);*/
+
+ HttpPageReader pr = getHttpPageReader();
+ pr.setTimeoutConnection(timeout);
+
+ try {
+ return pr.readPage(urlAsString);
+ }
+ catch (PageReadException e) {
+ e.printStackTrace();
+ }
+
+ return null;
}
public Converter createConverter(String url) {
Oops, something went wrong.

0 comments on commit 5f95c55

Please sign in to comment.