Permalink
Browse files

Merge remote branch 'bcui/master'

  • Loading branch information...
2 parents b2d0cab + cc2f3c1 commit 838d72069a2fd345c04e7228588b23c98265dda4 @wonlay wonlay committed Nov 10, 2011
Showing with 3,314 additions and 7 deletions.
  1. +5 −1 .classpath
  2. +54 −5 clients/python/src/sensei_client.py
  3. +47 −0 example/hadoop-indexing/conf/JobCarDemo.job
  4. +65 −0 example/hadoop-indexing/pom.xml
  5. +36 −0 example/hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/demo/CarDemo.java
  6. +24 −0 example/hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/demo/CarMapInputConverter.java
  7. +17 −0 example/hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/demo/CarShardingStrategy.java
  8. +22 −0 example/pom.xml
  9. BIN lib/commons-httpclient-3.1.jar
  10. +2 −0 pom.xml
  11. +3 −1 sensei-core/src/main/java/com/sensei/dataprovider/http/HttpStreamDataProvider.java
  12. +64 −0 sensei-hadoop-indexing/pom.xml
  13. +252 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/job/MapReduceJob.java
  14. +215 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/keyvalueformat/IntermediateForm.java
  15. +250 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/keyvalueformat/Shard.java
  16. +21 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/map/DummyMapInputConverter.java
  17. +17 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/map/DummyShardingStrategy.java
  18. +25 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/map/MapInputConverter.java
  19. +206 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/map/SenseiMapper.java
  20. +351 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/reduce/FileSystemDirectory.java
  21. +66 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/reduce/IndexUpdateOutputFormat.java
  22. +185 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/reduce/MixedDirectory.java
  23. +119 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/reduce/RAMDirectoryUtil.java
  24. +92 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/reduce/SenseiCombiner.java
  25. +120 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/reduce/SenseiReducer.java
  26. +234 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/reduce/ShardWriter.java
  27. +55 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/util/LuceneIndexFileNameFilter.java
  28. +112 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/util/LuceneUtil.java
  29. +53 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/util/MRConfig.java
  30. +286 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/util/MRJobConfig.java
  31. +40 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/util/PropertiesLoader.java
  32. +22 −0 sensei-hadoop-indexing/src/main/java/com/sensei/indexing/hadoop/util/SenseiJobConfig.java
  33. +151 −0 sensei-hadoop-indexing/src/main/java/org/apache/lucene/store/RAMDirectorySerializer.java
  34. +103 −0 sensei-hadoop-indexing/src/main/java/org/apache/lucene/store/TestRAMDirectorySerializer.java
View
@@ -1,8 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="sensei-core/src/main/java"/>
+ <classpathentry kind="src" path="example/hadoop-indexing/src/main/java"/>
+ <classpathentry kind="src" path="sensei-hadoop-indexing/src/main/java"/>
<classpathentry kind="src" path="perf/src/main/java"/>
<classpathentry kind="src" path="sensei-core/src/test/java"/>
+ <classpathentry kind="lib" path="lib/commons-httpclient-3.1.jar"/>
<classpathentry kind="var" path="M2_REPO/javax/inject/javax.inject/1/javax.inject-1.jar"/>
<classpathentry kind="var" path="M2_REPO/javax/jms/jms/1.1/jms-1.1.jar"/>
<classpathentry kind="var" path="M2_REPO/javax/servlet/servlet-api/2.5/servlet-api-2.5.jar"/>
@@ -26,6 +29,7 @@
<classpathentry kind="var" path="M2_REPO/fastutil/fastutil/5.0.5/fastutil-5.0.5.jar"/>
<classpathentry kind="var" path="M2_REPO/com/google/inject/guice/3.0/guice-3.0.jar"/>
<classpathentry kind="var" path="M2_REPO/org/apache/httpcomponents/httpclient/4.1/httpclient-4.1.jar"/>
+ <classpathentry kind="var" path="M2_REPO/org/apache/hadoop/hadoop-core/0.20.203.0/hadoop-core-0.20.203.0.jar"/>
<classpathentry kind="var" path="M2_REPO/org/apache/httpcomponents/httpcore/4.1/httpcore-4.1.jar"/>
<classpathentry kind="var" path="M2_REPO/org/codehaus/jackson/jackson-core-asl/1.4.2/jackson-core-asl-1.4.2.jar"/>
<classpathentry kind="var" path="M2_REPO/org/codehaus/jackson/jackson-mapper-asl/1.7.5/jackson-mapper-asl-1.7.5.jar"/>
@@ -71,7 +75,7 @@
<classpathentry kind="var" path="M2_REPO/org/springframework/spring-core/2.5.6/spring-core-2.5.6.jar"/>
<classpathentry kind="var" path="M2_REPO/org/apache/velocity/velocity/1.6.4/velocity-1.6.4.jar"/>
<classpathentry kind="var" path="M2_REPO/com/linkedin/zoie/zoie-core/2.7.0-SNAPSHOT/zoie-core-2.7.0-SNAPSHOT.jar"/>
- <classpathentry kind="var" path="M2_REPO/com/browseengine/bobo/bobo-browse/3.0.0-SNAPSHOT/bobo-browse-3.0.0-SNAPSHOT.jar"/>
+ <classpathentry kind="var" path="M2_REPO/com/browseengine/bobo/bobo-browse/3.0.0-SNAPSHOT/bobo-browse-3.0.0-SNAPSHOT.jar"/>
<classpathentry kind="var" path="M2_REPO/com/linkedin/zoie/zoie-jms/2.7.0-SNAPSHOT/zoie-jms-2.7.0-SNAPSHOT.jar" sourcepath="M2_REPO/com/linkedin/zoie/zoie-jms/2.7.0-SNAPSHOT/zoie-jms-2.7.0-SNAPSHOT-sources.jar">
<attributes>
<attribute name="javadoc_location" value="jar:file:/Users/jwang/.m2/repository/com/linkedin/zoie/zoie-jms/2.7.0-SNAPSHOT/zoie-jms-2.7.0-SNAPSHOT-javadoc.jar!/"/>
@@ -158,19 +158,24 @@
<statement> ::= ( <select_stmt> | <describe_stmt> ) [';']
-<select_stmt> ::= SELECT <select_list> <from_clause> [<where_clause>] [<given_clause>] <additional_clauses>
+<select_stmt> ::= SELECT <select_list> <from_clause> [<where_clause>] [<given_clause>]
+ [<additional_clauses>]
+
<describe_stmt> ::= ( DESC | DESCRIBE ) <index_name>
<select_list> ::= '*' | <column_name_list>
+
<column_name_list> ::= <column_name> ( ',' <column_name> )*
<from_clause> ::= FROM <index_name>
<where_clause> ::= WHERE <search_condition>
+
<search_condition> ::= <predicates>
| <cumulative_predicates>
<predicates> ::= <predicate> ( AND <predicate> )*
+
<predicate> ::= <in_predicate>
| <contains_all_predicate>
| <equal_predicate>
@@ -182,92 +187,130 @@
| <same_column_or_pred>
<in_predicate> ::= <column_name> [NOT] IN <value_list> [<except_clause>] [<predicate_props>]
-<contains_all_predicate> ::= <column_name> CONTAINS ALL <value_list> [<except_clause>] [<predicate_props>]
+
+<contains_all_predicate> ::= <column_name> CONTAINS ALL <value_list> [<except_clause>]
+ [<predicate_props>]
+
<equal_predicate> ::= <column_name> '=' <value> [<predicate_props>]
+
<not_equal_predicate> ::= <column_name> '<>' <value> [<predicate_props>]
+
<query_predicate> ::= QUERY IS <quoted_string>
+
<between_predicate> ::= <column_name> [NOT] BETWEEN <value> AND <value>
+
<range_predicate> ::= <column_name> <range_op> <numeric>
+
<time_predicate> ::= <column_name> IN LAST <time_span>
| <column_name> ( SINCE | AFTER | BEFORE ) <time_expr>
-<same_column_or_pred> ::= '(' + <cumulative_predicates> + ')'
+<same_column_or_pred> ::= '(' <cumulative_predicates> ')'
<cumulative_predicates> ::= <cumulative_predicate> ( OR <cumulative_predicate> )*
+
<cumulative_predicate> ::= <in_predicate>
| <equal_predicate>
| <between_predicate>
| <range_predicate>
| <time_predicate>
<value_list> ::= '(' <value> ( ',' <value> )* ')'
+
<value> ::= <quoted_string> | <numeric>
+
<range_op> ::= '<' | '<=' | '>=' | '>'
<except_clause> ::= EXCEPT <value_list>
<predicate_props> ::= WITH <prop_list>
<prop_list> ::= '(' <key_value_pair> ( ',' <key_value_pair> )* ')'
+
<key_value_pair> ::= <quoted_string> ':' <quoted_string>
<given_clause> ::= GIVEN FACET PARAM <facet_param_list>
+
<facet_param_list> ::= <facet_param> ( ',' <facet_param> )*
+
<facet_param> ::= '(' <facet_name> <facet_param_name> <facet_param_type> <facet_param_value> ')'
+
<facet_param_name> ::= <quoted_string>
+
<facet_param_type> ::= BOOLEAN | INT | LONG | STRING | BYTEARRAY | DOUBLE
+
<facet_param_value> ::= <quoted_string>
-<additional_clauses> ::= ( <additional_clause> )*
+<additional_clauses> ::= ( <additional_clause> )+
+
<additional_clause> ::= <order_by_clause>
| <group_by_clause>
| <limit_clause>
| <browse_by_clause>
| <fetching_stored_clause>
<order_by_clause> ::= ORDER BY <sort_specs>
+
<sort_specs> ::= <sort_spec> ( ',' <sort_spec> )*
+
<sort_spec> ::= <column_name> [<ordering_spec>]
+
<ordering_spec> ::= ASC | DESC
<group_by_clause> ::= GROUP BY <group_spec>
+
<group_spec> ::= <facet_name> [TOP <max_per_group>]
<limit_clause> ::= LIMIT [<offset> ','] <count>
+
<offset> ::= ( <digit> )+
+
<count> ::= ( <digit> )+
<browse_by_clause> ::= BROWSE BY <facet_specs>
+
<facet_specs> ::= <facet_spec> ( ',' <facet_spec> )*
+
<facet_spec> ::= <facet_name> [<facet_expression>]
+
<facet_expression> ::= '(' <expand_flag> <count> <count> <facet_ordering> ')'
+
<expand_flag> ::= TRUE | FALSE
+
<facet_ordering> ::= HITS | VALUE
<fetching_stored_clause> ::= FETCHING STORED [<fetching_flag>]
+
<fetching_flag> ::= TRUE | FALSE
<quoted_string> ::= '"' ( <char> )* '"'
| "'" ( <char> )* "'"
<identifier> ::= <identifier_start> ( <identifier_part> )*
+
<identifier_start> ::= <alpha> | '-' | '_'
+
<identifier_part> ::= <identifier_start> | <digit>
<column_name> ::= <identifier>
+
<facet_name> ::= <identifier>
<alpha> ::= <alpha_lower_case> | <alpha_upper_case>
<alpha_upper_case> ::= A | B | C | D | E | F | G | H | I | J | K | L | M | N | O
| P | Q | R | S | T | U | V | W | X | Y | Z
+
<alpha_lower_case> ::= a | b | c | d | e | f | g | h | i | j | k | l | m | n | o
| p | q | r | s | t | u | v | w | x | y | z
+
<digit> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
<numeric> ::= <time_expr> | <number>
+
<number> ::= <integer> | <real>
+
<integer> ::= ( <digit> )+
+
<real> ::= ( <digit> )+ '.' ( <digit> )+
<time_expr> ::= <time_span> AGO
@@ -278,13 +321,19 @@
[<time_minute_part>] [<time_second_part>] [<time_millisecond_part>]
<time_week_part> ::= <integer> ( 'week' | 'weeks' )
+
<time_day_part> ::= <integer> ( 'day' | 'days' )
+
<time_hour_part> ::= <integer> ( 'hour' | 'hours' )
+
<time_minute_part> ::= <integer> ( 'minute' | 'minutes' | 'min' | 'mins')
+
<time_second_part> ::= <integer> ( 'second' | 'seconds' | 'sec' | 'secs')
+
<time_millisecond_part> ::= <integer> ( 'millisecond' | 'milliseconds' | 'msec' | 'msecs')
-<date_time_string> ::= <digit><digit><digit><digit> ('-' | '/' | '.') <digit><digit> ('-' | '/' | '.') <digit><digit>
+<date_time_string> ::= <digit><digit><digit><digit> ('-' | '/' | '.') <digit><digit>
+ ('-' | '/' | '.') <digit><digit>
<digit><digit> ':' <digit><digit> ':' <digit><digit>
"""
@@ -0,0 +1,47 @@
+type=java
+job.class=com.sensei.indexing.hadoop.demo.CarDemo
+
+mapreduce.job.maps=2
+sensei.num.shards=3
+
+mapred.job.name=CarDemoShardedIndexing
+
+# if the output.path already exists, delete it first
+sensei.force.output.overwrite=true
+
+# Lower this value if the number of mappers is very large. Default is 50 MB (52428800 bytes).
+sensei.max.ramsize.bytes=52428800
+
+############# path of schema for interpreter #############
+
+##### TextJSON schema Sample (car demo) absolute path ######
+sensei.schema.file.url=conf/schema.xml
+
+############ Input and Output ##################
+
+####### Text JSON data (car demo) #####
+read.lock=data/cars.json
+sensei.input.dirs=data/cars.json
+
+######## Output configuration ######
+write.lock=example/hadoop-indexing/output
+sensei.output.dir=example/hadoop-indexing/fileoutput
+
+######## Index output location ######
+sensei.index.path=example/hadoop-indexing/index
+
+############# schemas for mapper input ################
+
+sensei.input.format=org.apache.hadoop.mapred.TextInputFormat
+
+############## Sharding strategy ################
+sensei.distribution.policy=com.sensei.indexing.hadoop.demo.CarShardingStrategy
+
+############# Converter for mapper input (data conversion and filtering) ##########
+sensei.mapinput.converter=com.sensei.indexing.hadoop.demo.CarMapInputConverter
+
+############# Analyzer configuration for lucene ###############
+sensei.document.analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+sensei.document.analyzer.version=LUCENE_30
+
+
@@ -0,0 +1,65 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>com.senseidb</groupId>
+ <artifactId>sensei-parent</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ <relativePath>../../sensei-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>sensei-example-hadoop</artifactId>
+ <packaging>jar</packaging>
+ <name>sensei example hadoop indexing</name>
+ <description>sensei hadoop indexer example</description>
+
+ <!-- Set the compiler to java6 -->
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.1</version>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ <encoding>UTF-8</encoding>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>com.github.maven-hadoop.plugin</groupId>
+ <artifactId>maven-hadoop-plugin</artifactId>
+ <version>0.20.1</version>
+ <configuration>
+ <hadoopHome>/home/jwang/opensource/hadoop-0.21.0</hadoopHome>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>sensei-hadoop-indexing</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-core</artifactId>
+ <version>0.20.204.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.json</groupId>
+ <artifactId>json</artifactId>
+ <version>20080701</version>
+ </dependency>
+ </dependencies>
+
+</project>
@@ -0,0 +1,36 @@
+package com.sensei.indexing.hadoop.demo;
+
+
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import com.sensei.indexing.hadoop.job.MapReduceJob;
+import com.sensei.indexing.hadoop.util.PropertiesLoader;
+
+
+/**
+ * Command-line driver for the car-demo Hadoop indexing job.
+ *
+ * <p>{@link #main(String[])} loads the job properties file, hands control to
+ * {@link ToolRunner}, prints the elapsed wall-clock time and exits with the
+ * code returned by {@link #run(String[])}.
+ */
+public class CarDemo extends MapReduceJob implements Tool {
+
+  /**
+   * Builds the job configuration through {@code createJob}, names the job
+   * "CarDemo" and submits it with {@link JobClient#runJob(JobConf)}.
+   *
+   * @param args command-line arguments forwarded by {@link ToolRunner}; not read here
+   * @return always {@code 0}; any failure surfaces as an exception instead
+   * @throws Exception whatever {@code createJob} or {@code JobClient.runJob} throws
+   */
+  public int run(String[] args) throws Exception {
+    final JobConf jobConf = createJob(CarDemo.class);
+    jobConf.setJobName("CarDemo");
+
+    JobClient.runJob(jobConf);
+    return 0;
+  }
+
+  /**
+   * Entry point: runs the demo job and reports how long it took.
+   *
+   * @param args command-line arguments passed straight to {@link ToolRunner}
+   * @throws Exception if loading the properties or running the tool fails
+   */
+  public static void main(String[] args) throws Exception {
+    final long startedAt = System.currentTimeMillis();
+
+    final Configuration jobProperties =
+        PropertiesLoader.loadProperties("example/hadoop-indexing/conf/JobCarDemo.job");
+    final int exitCode = ToolRunner.run(jobProperties, new CarDemo(), args);
+
+    // Elapsed time is the raw difference of two currentTimeMillis() readings.
+    final long elapsed = System.currentTimeMillis() - startedAt;
+    System.out.println("Total time: " + elapsed);
+    System.exit(exitCode);
+  }
+}
@@ -0,0 +1,24 @@
+package com.sensei.indexing.hadoop.demo;
+
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+import com.sensei.indexing.hadoop.map.MapInputConverter;
+
+/**
+ * Map-input converter for the car demo: each input value is treated as a
+ * {@link Text} holding one JSON document, and no record is filtered out.
+ */
+public class CarMapInputConverter extends MapInputConverter {
+
+  /**
+   * Parses the input value as a JSON object.
+   *
+   * @param key   the input key; not used
+   * @param value the input value; cast to {@link Text} and parsed as JSON
+   * @param conf  the job configuration; not used
+   * @return the JSON object parsed from the text of {@code value}
+   * @throws JSONException if the text is not valid JSON
+   */
+  @Override
+  public JSONObject getJsonInput(Object key, Object value, Configuration conf) throws JSONException {
+    final Text record = (Text) value;
+    final String json = record.toString();
+    return new JSONObject(json);
+  }
+
+  /**
+   * Pass-through filter: returns the given object unchanged.
+   *
+   * @param data the converted JSON object
+   * @return {@code data} itself
+   */
+  @Override
+  protected JSONObject doFilter(JSONObject data) throws Exception {
+    return data;
+  }
+}
Oops, something went wrong.

0 comments on commit 838d720

Please sign in to comment.