intel-analytics · yangw1234 · Jun 14, 2017
diff --git a/.gitignore b/.gitignore
@@ -94,3 +94,10 @@ metastore_db/
 
 # Download files
 datasets/
+
+# IDE, build and runtime artifacts
+.idea/
+target/
+*.iml
+*.pyc
+*.log
diff --git a/scala/udfpredictor/pom.xml b/scala/udfpredictor/pom.xml
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.intel.analytics</groupId>
+    <artifactId>udfpredictor</artifactId>
+    <version>0.1-SNAPSHOT</version>
+
+    <repositories>
+        <repository>
+            <id>central</id>
+            <name>Maven Repository</name>
+            <url>https://repo1.maven.org/maven2</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+        <repository>
+            <id>apache-repo</id>
+            <name>Apache Repository</name>
+            <url>https://repository.apache.org/content/repositories/releases</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+        <repository>
+            <id>jboss-repo</id>
+            <name>JBoss Repository</name>
+            <url>https://repository.jboss.org/nexus/content/repositories/releases</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+        <repository>
+            <id>sonatype</id>
+            <name>sonatype repository</name>
+            <url>https://oss.sonatype.org/content/groups/public/</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+
+    <properties>
+        <java.version>1.7</java.version>
+        <javac.version>1.7</javac.version>
+        <scala.major.version>2.11</scala.major.version>
+        <scala.version>2.11.8</scala.version>
+        <scala.macros.version>2.0.1</scala.macros.version>
+        <scalatest.version>2.2.4</scalatest.version>
+        <spark.version>2.1.0</spark.version>
+        <spark-scope>provided</spark-scope>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-compiler</artifactId>
+            <version>${scala.version}</version>
+            <scope>${spark-scope}</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-reflect</artifactId>
+            <version>${scala.version}</version>
+            <scope>${spark-scope}</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-library</artifactId>
+            <version>${scala.version}</version>
+            <scope>${spark-scope}</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scalap</artifactId>
+            <version>${scala.version}</version>
+            <scope>${spark-scope}</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_${scala.major.version}</artifactId>
+            <version>${scalatest.version}</version>
+            <scope>${spark-scope}</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.intel.analytics.bigdl</groupId>
+            <artifactId>bigdl</artifactId>
+            <version>0.2.0-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_${scala.major.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>${spark-scope}</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-mllib_${scala.major.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>${spark-scope}</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>3.2.0</version>
+                <executions>
+                    <execution>
+                        <id>eclipse-add-source</id>
+                        <goals>
+                            <goal>add-source</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>process-resources</phase>
+                        <goals>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile-first</id>
+                        <phase>process-test-resources</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>attach-scaladocs</id>
+                        <phase>verify</phase>
+                        <goals>
+                            <goal>doc-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                    <recompileMode>incremental</recompileMode>
+                    <useZincServer>true</useZincServer>
+                    <args>
+                        <arg>-unchecked</arg>
+                        <arg>-deprecation</arg>
+                        <arg>-feature</arg>
+                    </args>
+                    <!-- Macro Paradise compiler plugin: needed for quasiquotes on Scala 2.10 (used by
+                         Spark SQL code generation). TODO: confirm it is still required on Scala 2.11. -->
+                    <compilerPlugins>
+                        <compilerPlugin>
+                            <groupId>org.scalamacros</groupId>
+                            <artifactId>paradise_${scala.version}</artifactId>
+                            <version>${scala.macros.version}</version>
+                        </compilerPlugin>
+                    </compilerPlugins>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <configuration>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/scala/udfpredictor/src/main/resources/example/udfpredictor/types b/scala/udfpredictor/src/main/resources/example/udfpredictor/types
@@ -0,0 +1,21 @@
+textType, textLabel
+alt.atheism, 1
+comp.graphics, 2
+comp.os.ms-windows.misc, 3
+comp.sys.ibm.pc.hardware, 4
+comp.sys.mac.hardware, 5
+comp.windows.x, 6
+misc.forsale, 7
+rec.autos, 8
+rec.motorcycles, 9
+rec.sport.baseball, 10
+rec.sport.hockey, 11
+sci.crypt, 12
+sci.electronics, 13
+sci.med, 14
+sci.space, 15
+soc.religion.christian, 16
+talk.politics.guns, 17
+talk.politics.mideast, 18
+talk.politics.misc, 19
+talk.religion.misc, 20
diff --git a/...rc/main/scala/com/intel/analytics/tutorials/example/udfpredictor/DataframePredictor.scala b/...rc/main/scala/com/intel/analytics/tutorials/example/udfpredictor/DataframePredictor.scala
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.intel.analytics.tutorials.example.udfpredictor
+
+import com.intel.analytics.bigdl.example.utils.WordMeta
+import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter}
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.functions._
+import org.apache.log4j.{Level, Logger}
+import org.apache.spark.sql.SQLContext
+
+/**
+ * Example driver that classifies text with a model wrapped as a Spark SQL UDF, using both the
+ * DataFrame API and SQL queries.
+ */
+object DataframePredictor {
+
+  LoggerFilter.redirectSparkInfoLogs()
+  Logger.getLogger("com.intel.analytics.bigdl.example").setLevel(Level.INFO)
+
+  /**
+   * Parses the arguments, obtains the model through `Utils.getModel`, turns it into a UDF
+   * and prints a few example queries over the test data.
+   * @param args command-line arguments parsed by `Utils.localParser`
+   */
+  def main(args: Array[String]): Unit = {
+
+    Utils.localParser.parse(args, TextClassificationUDFParams()).foreach { param =>
+
+      val conf = Engine.createSparkConf()
+      conf.setAppName("Text classification")
+        .set("spark.task.maxFailures", "1")
+      val sc = new SparkContext(conf)
+
+      // Stop the context even if a step below throws, so the application always shuts down.
+      try {
+        Engine.init
+
+        val spark = new SQLContext(sc)
+        import spark.implicits._
+
+        val result = Utils.getModel(sc, param)
+        val model = result._1
+        val word2Meta: Option[Map[String, WordMeta]] = result._2
+        val trainedWord2Vec: Option[Map[Float, Array[Float]]] = result._3
+        val sampleShape = result._4
+
+        // Reuse the word meta returned with the model; otherwise load the index from file.
+        val word2Index: Map[String, Int] = word2Meta match {
+          case Some(meta) =>
+            meta.map { case (word, wordMeta) => word -> wordMeta.index }
+          case None =>
+            sc.textFile(s"${param.baseDir}/word2Meta.txt").map { item =>
+              val tuple = item.stripPrefix("(").stripSuffix(")").split(",")
+              (tuple(0), tuple(1).toInt)
+            }.collect().toMap
+        }
+
+        // Build the word vectors only when the model did not come with them.
+        val word2Vec: Map[Float, Array[Float]] =
+          trainedWord2Vec.getOrElse(Utils.getWord2Vec(word2Index))
+        val predict = Utils.genUdf(sc, model, sampleShape, word2Index, word2Vec)
+
+        // wrap the prediction function as a UDF for the DataFrame API
+        val classifierUDF = udf(predict)
+        val data = Utils.loadTestData(param.testDir)
+        val df = spark.createDataFrame(data)
+
+        // static dataframe mapping each textLabel to its textType; the header line is dropped
+        val types = sc.textFile(Utils.getResourcePath("/example/udfpredictor/types"))
+          .filter(!_.contains("textType"))
+          .map { line =>
+            val words = line.split(",")
+            (words(0).trim, words(1).trim.toInt)
+          }.toDF("textType", "textLabel")
+
+        val classifyDF1 = df.withColumn("textLabel", classifierUDF($"text"))
+          .select("filename", "text", "textLabel")
+        classifyDF1.show()
+
+        val filteredDF1 = df.filter(classifierUDF($"text") === 9)
+        filteredDF1.show()
+
+        val joinedDF = classifyDF1.join(types, "textLabel")
+        joinedDF.show()
+
+        // aggregation
+        val typeCount = classifyDF1.groupBy($"textLabel").count()
+        typeCount.show()
+
+        // play with udf in sqlcontext
+        spark.udf.register("textClassifier", predict)
+        df.registerTempTable("textTable")
+
+        val classifyDF2 = spark
+          .sql("SELECT filename, textClassifier(text) AS textType_sql, text " +
+            "FROM textTable")
+        classifyDF2.show()
+
+        val filteredDF2 = spark
+          .sql("SELECT filename, textClassifier(text) AS textType_sql, text " +
+            "FROM textTable WHERE textClassifier(text) = 9")
+        filteredDF2.show()
+      } finally {
+        sc.stop()
+      }
+    }
+
+  }
+
+}