Permalink
Browse files

bug fix: MWEB-15

  • Loading branch information...
1 parent f2c87f7 commit feb89e1fa054929dbff22fa34d0f4c3375f91ccd @javasoze committed Mar 7, 2011
@@ -237,9 +237,10 @@ public MeaningfulWebObject extractFromUrl(String url){
public static void main(String[] args) throws Exception{
MetaContentExtractor extractor = new MetaContentExtractor();
- String url = "http://twitpic.com/3sryl9";
-
- OGObject obj = extractor.extractFromUrl(url);
+ //String url = "http://twitpic.com/3sryl9";
+ //String url = "http://www.seobook.com/google-kills-ehows-competitors";
+ String url ="http://techcrunch.com/2011/03/06/apples-jointventure-for-business-gets-official/";
+ MeaningfulWebObject obj = extractor.extractFromUrl(url);
System.out.println(obj);
}
@@ -22,6 +22,7 @@
private Set<String> names = new LinkedHashSet<String>();
private boolean includeAll = false;
+ private boolean skipUnescapingHtml = false;
public Collection<String> getNames() {
return names;
@@ -48,6 +49,14 @@ public void setIncludeAll(boolean includeAll) {
this.includeAll = includeAll;
}
+ /**
+  * Reports whether HTML unescaping of the parsed Open Graph fields is switched off.
+  *
+  * @return {@code true} when {@code processContent} passes {@code null} to the parser
+  *         instead of {@code OpenGraphParser.UNESCAPE_HTML_FIELDS}
+  */
+ public boolean isSkipUnescapingHtml() {
+ return this.skipUnescapingHtml;
+ }
+
+ /**
+  * Switches HTML unescaping of the parsed Open Graph fields off or back on.
+  *
+  * @param skipUnescapingHtml {@code true} to make {@code processContent} hand the parser
+  *        {@code null} rather than {@code OpenGraphParser.UNESCAPE_HTML_FIELDS}
+  */
+ public void setSkipUnescapingHtml(final boolean skipUnescapingHtml) {
+ this.skipUnescapingHtml = skipUnescapingHtml;
+ }
+
@Override
public boolean processContent(Document document) {
@@ -81,7 +90,8 @@ public boolean processContent(Document document) {
}
// parse the elements with opengraph
- OGObject ogObj = OpenGraphParser.parse(datamap);
+ Set<String> fieldsToUnescape = skipUnescapingHtml ? null : OpenGraphParser.UNESCAPE_HTML_FIELDS;
+ OGObject ogObj = OpenGraphParser.parse(datamap,fieldsToUnescape);
if (!ogObj.isEmpty()) {
Map<String, String> metaMap = ogObj.getMeta();
@@ -64,6 +64,11 @@
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
</dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.5</version>
+ </dependency>
<dependency>
<groupId>net.htmlparser.jericho</groupId>
<artifactId>jericho-html</artifactId>
@@ -27,7 +27,7 @@ Licensed to the Apache Software Foundation (ASF) under one
private static final long serialVersionUID = 1L;
- public final static String[] REQUIRED_META = new String[]{"title", "type", "image", "url" };
+ public final static String[] REQUIRED_META = new String[]{OpenGraphVocabulary.TITLE, OpenGraphVocabulary.TYPE, OpenGraphVocabulary.IMAGE, OpenGraphVocabulary.URL };
public static final String MEDIA_URL = "url";
private final Map<String,String> _meta = new HashMap<String,String>();
@@ -68,53 +68,53 @@ public boolean isValid(){
}
public String getTitle(){
- return _meta.get("title");
+ return _meta.get(OpenGraphVocabulary.TITLE);
}
public String getImage(){
- return _meta.get("image");
+ return _meta.get(OpenGraphVocabulary.IMAGE);
}
public String getType(){
- return _meta.get("type");
+ return _meta.get(OpenGraphVocabulary.TYPE);
}
public String getUrl(){
- return _meta.get("url");
+ return _meta.get(OpenGraphVocabulary.URL);
}
public String getDescription(){
- return _meta.get("description");
+ return _meta.get(OpenGraphVocabulary.DESCRIPTION);
}
public String getSiteName(){
- return _meta.get("site_name");
+ return _meta.get(OpenGraphVocabulary.SITE_NAME);
}
public String getStreetAddress(){
- return _meta.get("street-address");
+ return _meta.get(OpenGraphVocabulary.STREET_ADDRESS);
}
public String getLocality(){
- return _meta.get("locality");
+ return _meta.get(OpenGraphVocabulary.LOCALITY);
}
public String getRegion(){
- return _meta.get("region");
+ return _meta.get(OpenGraphVocabulary.REGION);
}
public String getPostalCode(){
- return _meta.get("postal-code");
+ return _meta.get(OpenGraphVocabulary.POSTAL_CODE);
}
public String getCountryName(){
- return _meta.get("country-name");
+ return _meta.get(OpenGraphVocabulary.COUNTRY_NAME);
}
public float getLatitude(){
float lat;
try{
- lat = Float.parseFloat(_meta.get("latitude"));
+ lat = Float.parseFloat(_meta.get(OpenGraphVocabulary.LATITUDE));
}
catch(Exception e){
lat = Float.NaN;
@@ -125,7 +125,7 @@ public float getLatitude(){
public float getLongitude(){
float lon;
try{
- lon = Float.parseFloat(_meta.get("longitude"));
+ lon = Float.parseFloat(_meta.get(OpenGraphVocabulary.LONGITUDE));
}
catch(Exception e){
lon = Float.NaN;
@@ -134,15 +134,15 @@ public float getLongitude(){
}
public String getEmail(){
- return _meta.get("email");
+ return _meta.get(OpenGraphVocabulary.EMAIL);
}
public String getPhoneNumber(){
- return _meta.get("phone_number");
+ return _meta.get(OpenGraphVocabulary.PHONE_NUMBER);
}
public String getFaxNumber(){
- return _meta.get("fax_number");
+ return _meta.get(OpenGraphVocabulary.FAX_NUMBER);
}
public String getAudioProp(String prop){
@@ -4,6 +4,7 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -18,6 +19,7 @@
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.lang.StringEscapeUtils;
/*
@@ -46,11 +48,27 @@ Licensed to the Apache Software Foundation (ASF) under one
private static final int OG_VIDEO_PREFIX_CHAR_COUNT = 6; // length of "video:"
private static final int OG_AUDIO_PREFIX_CHAR_COUNT = 6; // length of "audio:"
+ /**
+  * Names of the meta fields whose content is HTML-unescaped by default
+  * (the Open Graph title and description).
+  * The set is unmodifiable: it is a shared public constant, so callers must
+  * build their own set if they need a different selection.
+  */
+ public static final Set<String> UNESCAPE_HTML_FIELDS;
+
+ static{
+ Set<String> fields = new HashSet<String>();
+ fields.add(OpenGraphVocabulary.TITLE);
+ fields.add(OpenGraphVocabulary.DESCRIPTION);
+ // Publish a read-only view so no caller can alter the defaults used by every parse.
+ UNESCAPE_HTML_FIELDS = java.util.Collections.unmodifiableSet(fields);
+ }
+
+ public static OGObject parse(String html,Set<String> unescapeHtml){
+ return parse(new Source(html),unescapeHtml);
+ }
+
public static OGObject parse(String html){
- return parse(new Source(html));
+ return parse(html,null);
}
+
public static OGObject parse(Map<String,String> ogmap){
+ return parse(ogmap,null);
+ }
+
+ public static OGObject parse(Map<String,String> ogmap,Set<String> unescapeHtml){
OGObject obj = new OGObject();
Map<String,String> meta= obj.getMeta();
Map<String,String> video= obj.getVideo();
@@ -79,13 +97,19 @@ else if (name.startsWith(OpenGraphVocabulary.AUDIO)){
}
}
else{
+ if (unescapeHtml!=null && unescapeHtml.size() > 0 && unescapeHtml.contains(name)){
+ content = StringEscapeUtils.unescapeHtml(content);
+ }
meta.put(name, content);
}
}
return obj;
}
public static OGObject parse(Source source){
+ return parse(source,null);
+ }
+ public static OGObject parse(Source source,Set<String> unescapeHtml){
Element htmlTag = source.getFirstElement(HTMLElementName.HTML);
List<Element> elementList = source.getAllElements(HTMLElementName.META);
Map<String,String> datamap = new HashMap<String,String>();
@@ -100,7 +124,7 @@ public static OGObject parse(Source source){
datamap.put(name, content);
}
}
- return parse(datamap);
+ return parse(datamap,unescapeHtml);
}
private static final String XMLNS_PREFIX = "xmlns:";
@@ -132,6 +156,10 @@ private static String findOpenGraphNamespacePrefix(Element htmlTag)
private static final int BUFFER_SIZE = 4*1024; // 4k buffer
public static OGObject parse(Reader reader) throws IOException{
+ return parse(reader,null);
+ }
+
+ public static OGObject parse(Reader reader,Set<String> unescapeHtml) throws IOException{
StringBuffer buffer = new StringBuffer();
char[] buf = new char[BUFFER_SIZE];
@@ -142,17 +170,18 @@ public static OGObject parse(Reader reader) throws IOException{
buffer.append(buf, 0, len);
}
- return parse(buffer.toString());
+ return parse(buffer.toString(),unescapeHtml);
}
public static OGObject fetchAndParse(HttpClient httpClient,String uri) throws IOException{
+
GetMethod get = null;
try{
get = new GetMethod(uri);
int status = httpClient.executeMethod(get);
if (status==HttpStatus.SC_OK){
InputStreamReader reader = new InputStreamReader(get.getResponseBodyAsStream(),get.getResponseCharSet());
- return parse(reader);
+ return parse(reader,UNESCAPE_HTML_FIELDS);
}
else{
return null;
@@ -166,7 +195,7 @@ public static OGObject fetchAndParse(HttpClient httpClient,String uri) throws IO
}
public static void main(String[] args) throws Exception{
- String url = "http://techcrunch.com/2011/01/18/microsoft-kinect-developer-johnny-chung-lee-jumps-ship-and-lands-at-google/";
+ String url = "http://techcrunch.com/2011/03/06/apples-jointventure-for-business-gets-official/";
HttpClient client = new HttpClient();
OGObject obj = OpenGraphParser.fetchAndParse(client, url);
System.out.println(obj);
@@ -4,18 +4,25 @@
import java.io.FileReader;
import java.io.IOException;
import java.util.Map;
+import java.util.Set;
import junit.framework.TestCase;
+
import org.meaningfulweb.opengraph.OGObject;
import org.meaningfulweb.opengraph.OpenGraphParser;
public class Og4jTestCase extends TestCase {
static final File TestDataDir = new File("src/test/test-data");
+
private static OGObject read(File f) throws IOException{
+ return read(f,null);
+ }
+
+ private static OGObject read(File f,Set<String> unscapeHtml) throws IOException{
FileReader reader = new FileReader(f);
- OGObject obj = OpenGraphParser.parse(reader);
+ OGObject obj = OpenGraphParser.parse(reader,unscapeHtml);
reader.close();
return obj;
}
@@ -27,12 +34,18 @@ public void testEmpty() throws Exception{
assertFalse(obj.isValid());
}
+ /**
+  * Parses unescapeTestHtml.html with {@code OpenGraphParser.UNESCAPE_HTML_FIELDS} and
+  * expects the title and description to come back with their entities decoded.
+  */
+ public void testUnescapeHtml() throws Exception{
+ File fixture = new File(TestDataDir,"unescapeTestHtml.html");
+ OGObject parsed = read(fixture,OpenGraphParser.UNESCAPE_HTML_FIELDS);
+ assertEquals("Shaquille O\'Neal\'", parsed.getTitle());
+ assertEquals("Shaquille O\'Neal\' ", parsed.getDescription());
+ }
+
public void testInvalid() throws Exception{
File invalidHtml = new File(TestDataDir,"invalid.html");
OGObject obj = read(invalidHtml);
- Map<String,String> metaMap = obj.getMeta();
- assertEquals("img", metaMap.get("image"));
- assertEquals("invalid", metaMap.get("title"));
+ assertEquals("img", obj.getImage());
+ assertEquals("invalid", obj.getTitle());
assertFalse(obj.isValid());
}
@@ -0,0 +1,6 @@
+<html>
+ <head>
+ <meta property="og:title" content="Shaquille O&#39;Neal&#39;"/>
+ <meta property="og:description" content="Shaquille O&#39;Neal&#39;&nbsp;"/>
+ </head>
+</html>

0 comments on commit feb89e1

Please sign in to comment.