Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

initial checkin

  • Loading branch information...
commit 2bcedb1cbf0822143a17cc21629d67fae0709738 0 parents
@javasoze authored
7 .classpath
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" output="target/classes" path="src/main/java"/>
+ <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+ <classpathentry kind="output" path="target/classes"/>
+</classpath>
23 .project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>meaningfulweb</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ <buildCommand>
+ <name>org.maven.ide.eclipse.maven2Builder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.maven.ide.eclipse.maven2Nature</nature>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ </natures>
+</projectDescription>
3  .settings/org.eclipse.core.resources.prefs
@@ -0,0 +1,3 @@
+#Sun Jan 23 16:43:26 PST 2011
+eclipse.preferences.version=1
+encoding/<project>=UTF-8
6 .settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,6 @@
+#Sun Jan 23 17:03:54 PST 2011
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.6
8 .settings/org.maven.ide.eclipse.prefs
@@ -0,0 +1,8 @@
+#Sun Jan 23 17:03:46 PST 2011
+activeProfiles=
+eclipse.preferences.version=1
+fullBuildGoals=process-test-resources
+resolveWorkspaceProjects=true
+resourceFilterGoals=process-resources resources\:testResources
+skipCompilerPlugin=true
+version=1
7 bin/mvn-install.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Installs the jars bundled under lib/ into the local Maven repository so that
+# the matching <dependency> entries in pom.xml can be resolved.
+# The jar paths are relative, so run this from the project root:
+#   bin/mvn-install.sh
+
+# Stop at the first failed install instead of carrying on with a partially
+# populated repository and a misleading exit status.
+set -e
+
+echo "installing og4j.jar..."
+mvn install:install-file -Dfile=lib/og4j-0.0.1-SNAPSHOT.jar -DgroupId=proj.og4j -DartifactId=og4j -Dversion=0.0.1-SNAPSHOT -Dpackaging=jar
+echo "installing tika core snapshot..."
+mvn install:install-file -Dfile=lib/tika-core-0.9-SNAPSHOT.jar -DgroupId=org.apache.tika -DartifactId=tika-core -Dversion=0.9-SNAPSHOT -Dpackaging=jar
+echo "installing tika parser snapshot..."
+mvn install:install-file -Dfile=lib/tika-parsers-0.9-SNAPSHOT.jar -DgroupId=org.apache.tika -DartifactId=tika-parsers -Dversion=0.9-SNAPSHOT -Dpackaging=jar
BIN  lib/og4j-0.0.1-SNAPSHOT.jar
Binary file not shown
BIN  lib/tika-core-0.9-SNAPSHOT.jar
Binary file not shown
BIN  lib/tika-parsers-0.9-SNAPSHOT.jar
Binary file not shown
112 pom.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>org.meaningfulweb</groupId>
+ <artifactId>meaningfulweb</artifactId>
+ <packaging>jar</packaging>
+ <version>0.0.1-SNAPSHOT</version>
+ <name>og4j</name>
+ <description>web content extractor</description>
+ <url>https://github.com/javasoze/meaningfulweb/</url>
+ <licenses>
+ <license>
+ <name>Apache License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ <distribution>repo</distribution>
+ </license>
+ </licenses>
+
+ <!-- Set the compiler to java6 -->
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.1</version>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.5</version>
+ <configuration>
+ </configuration>
+ </plugin>
+
+ </plugins>
+ </build>
+
+
+ <dependencies>
+ <dependency>
+ <groupId>proj.og4j</groupId>
+ <artifactId>og4j</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>0.9-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>0.9-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ <version>3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>net.htmlparser.jericho</groupId>
+ <artifactId>jericho-html</artifactId>
+ <version>3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-configuration</groupId>
+ <artifactId>commons-configuration</artifactId>
+ <version>1.6</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.5.6</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>1.5.6</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.5</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+</project>
58 src/main/java/org/meaningfulweb/client/BestImageExtraction.java
@@ -0,0 +1,58 @@
+package org.meaningfulweb.client;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.image.ExtractedContents;
+import org.apache.tika.parser.image.ImageFetcher;
+import org.apache.tika.parser.image.ImageFilter;
+import org.apache.tika.parser.image.ImageInfo;
+import org.apache.tika.parser.image.ImageMeta;
+import org.apache.tika.parser.image.ImageSelector;
+
+/**
+ * Command-line demo: downloads one hard-coded page, lets Tika's HTML parser
+ * feed an image-collecting content handler, and prints the image picked by
+ * {@link ImageSelector}.
+ */
+public class BestImageExtraction {
+
+  /**
+   * Runs the demo against a fixed URL and prints the selected image to
+   * standard output.
+   *
+   * @param args ignored
+   * @throws Exception if the HTTP request, the parse, or the image selection fails
+   */
+  public static void main(String[] args) throws Exception {
+    String url = "http://twitpic.com/3sryl9";
+
+    HtmlParser parser = new HtmlParser();
+    HttpClient httpClient = new HttpClient();
+    GetMethod get = new GetMethod(url);
+
+    try {
+      httpClient.executeMethod(get);
+
+      Metadata metadata = new Metadata();
+      metadata.add(Metadata.RESOURCE_NAME_KEY, url);
+
+      // A response is not required to carry Content-Type; getResponseHeader
+      // returns null in that case, so only record the header when present.
+      org.apache.commons.httpclient.Header contentType =
+          get.getResponseHeader(Metadata.CONTENT_TYPE);
+      if (contentType != null) {
+        metadata.add(Metadata.CONTENT_TYPE, contentType.getValue());
+      }
+
+      // The handler appends to imgInfos while the parser walks the document.
+      List<ImageMeta> imgInfos = new LinkedList<ImageMeta>();
+      org.apache.tika.parser.image.ImageExtractionContentHandler imgHandler =
+          new org.apache.tika.parser.image.ImageExtractionContentHandler(imgInfos);
+
+      parser.parse(get.getResponseBodyAsStream(), imgHandler, metadata, new ParseContext());
+
+      ImageFilter imageFilter = new ImageFilter();
+      ImageFetcher imageFetcher = new ImageFetcher();
+      ImageSelector imgSelector = new ImageSelector(imageFilter, imageFetcher);
+
+      ExtractedContents extractedContents = new ExtractedContents(url, imgInfos);
+      ImageInfo mediaContentInfo = imgSelector.getBestImage(extractedContents, url, true, true);
+
+      System.out.println("best image: " + mediaContentInfo);
+    } finally {
+      // Release the connection even when fetching or parsing throws.
+      get.releaseConnection();
+    }
+  }
+
+}
63 src/main/java/org/meaningfulweb/client/MeaningfulClient.java
@@ -0,0 +1,63 @@
+package org.meaningfulweb.client;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.TypeDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Command-line demo: downloads one hard-coded document and prints the
+ * metadata that Tika's {@link AutoDetectParser} extracts from it.
+ */
+public class MeaningfulClient {
+
+  /**
+   * Builds a detector that asks a {@link TypeDetector} first and, when that
+   * answer is {@code application/octet-stream}, asks a {@link DefaultDetector}
+   * instead. Each decision is traced on standard output.
+   *
+   * @return the two-stage detector
+   */
+  static Detector buildDetector() {
+    return new Detector() {
+      private final Detector typeDetector = new TypeDetector();
+      private final Detector defaultDetector = new DefaultDetector();
+
+      @Override
+      public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        MediaType type = typeDetector.detect(input, metadata);
+        // Compare by value: a MediaType equal to OCTET_STREAM is not
+        // guaranteed to be the same instance as the constant, and a reference
+        // comparison would then skip the fallback.
+        if (MediaType.OCTET_STREAM.equals(type)) {
+          System.out.println("fail over to default detector");
+          type = defaultDetector.detect(input, metadata);
+        }
+        System.out.println("returning type: " + type);
+        return type;
+      }
+
+    };
+  }
+
+  /**
+   * Fetches a fixed URL, parses the body with the detector from
+   * {@link #buildDetector()}, and prints the resulting metadata.
+   *
+   * @param args ignored
+   * @throws Exception if the HTTP request or the parse fails
+   */
+  public static void main(String[] args) throws Exception {
+    String url = "http://sunset.usc.edu/classes/cs572_2010/Introduction_to_Tika.ppt";
+    Detector detector = buildDetector();
+
+    HttpClient httpClient = new HttpClient();
+    GetMethod get = new GetMethod(url);
+
+    try {
+      httpClient.executeMethod(get);
+
+      Metadata meta = new Metadata();
+
+      // A response is not required to carry Content-Type; getResponseHeader
+      // returns null in that case, so only record the header when present.
+      org.apache.commons.httpclient.Header contentType =
+          get.getResponseHeader(Metadata.CONTENT_TYPE);
+      if (contentType != null) {
+        meta.set(Metadata.CONTENT_TYPE, contentType.getValue());
+      }
+
+      AutoDetectParser parser = new AutoDetectParser(detector);
+      parser.parse(get.getResponseBodyAsStream(), new DefaultHandler(), meta);
+
+      System.out.println(meta);
+    } finally {
+      // Release the connection even when fetching or parsing throws.
+      get.releaseConnection();
+    }
+  }
+
+}
42 src/main/java/org/meaningfulweb/detector/DetectorBuilder.java
@@ -0,0 +1,42 @@
+package org.meaningfulweb.detector;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.configuration.Configuration;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.TypeDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Factory for the project's two-stage Tika {@link Detector}.
+ */
+public class DetectorBuilder {
+
+  private DetectorBuilder() {
+    // Instances are handed out by getInstance only.
+  }
+
+  /**
+   * Creates a new builder.
+   *
+   * @param config currently unused; every call returns an identically
+   *     configured builder regardless of its contents
+   * @return a fresh {@code DetectorBuilder}
+   */
+  public static DetectorBuilder getInstance(Configuration config) {
+    return new DetectorBuilder();
+  }
+
+  /**
+   * Builds a detector that asks a {@link TypeDetector} first and, when that
+   * answer is {@code application/octet-stream}, asks a {@link DefaultDetector}
+   * instead. Each decision is traced on standard output.
+   *
+   * @return the two-stage detector
+   */
+  public Detector buildDetector() {
+    return new Detector() {
+      private final Detector typeDetector = new TypeDetector();
+      private final Detector defaultDetector = new DefaultDetector();
+
+      @Override
+      public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        MediaType type = typeDetector.detect(input, metadata);
+        // Compare by value: a MediaType equal to OCTET_STREAM is not
+        // guaranteed to be the same instance as the constant, and a reference
+        // comparison would then skip the fallback.
+        if (MediaType.OCTET_STREAM.equals(type)) {
+          System.out.println("fail over to default detector");
+          type = defaultDetector.detect(input, metadata);
+        }
+        System.out.println("returning type: " + type);
+        return type;
+      }
+
+    };
+  }
+}
Please sign in to comment.
Something went wrong with that request. Please try again.