In [None]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.Row

In [None]:
/** One labelled tweet, built from a single data line of the tweet CSV files. */
case class Tweets(id: Int, label: Double, source: String, text: String)

/**
 * Reads the CSV file at `path` and converts every line except the first into a
 * [[Tweets]] row of a DataFrame.
 *
 * Columns are taken positionally: id, label, source, text. Each line is split into
 * at most four fields, so commas inside the tweet text (the last column) remain part
 * of `text`, and an empty text is kept as "" instead of being dropped by `split`.
 * A line with fewer than four fields, or a non-numeric id/label, still fails the job.
 */
def loadTweets(path: String) =
  sc.textFile(path)
    .zipWithIndex()
    // Index 0 is the first line of the file — presumably the header row; skip it.
    .filter { case (_, index) => index > 0 }
    .map { case (line, _) =>
      val fields = line.split(",", 4)
      Tweets(fields(0).toInt, fields(1).toDouble, fields(2), fields(3))
    }
    .toDF()

// NOTE(review): "training-tweet" is singular while "test-tweets" is plural — confirm
// both paths match the files actually present.
val training = loadTweets("/training-tweet.csv")
val test = loadTweets("/test-tweets.csv")

In [None]:
// Assemble the ML pipeline from its three stages, in order: tokenizer -> hashingTF -> lr.

// Stage 1: reads the "text" column and writes its tokens to "words".
val tokenizer = new Tokenizer()
  .setInputCol("text")
  .setOutputCol("words")

// Stage 2: consumes the tokenizer's output column and writes hashed term
// frequencies (1000 features) to "features".
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)

// Stage 3: logistic regression capped at 10 iterations, regularization parameter 0.01.
val lr = new LogisticRegression()
  .setRegParam(0.01)
  .setMaxIter(10)

val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

In [None]:
// Fit the pipeline to the training documents. `fit` is run on the `training`
// DataFrame and its result is kept as `model`, which is what later scores the test set.
val model = pipeline.fit(training)

In [None]:
// Score the test set and keep only the columns needed for inspection and evaluation.
// The result is cached because it feeds several actions (the collect() below and the
// counts in the accuracy cell); without caching, each action would re-read the CSV
// and re-run the whole pipeline.
val modelem = model.transform(test).select("id", "label", "text", "probability", "prediction").cache()

// Print every scored tweet. collect() brings the full result to the driver, so this is
// only appropriate while the test set is small enough to fit there.
modelem.collect().foreach {
  // `probability` is bound without a type test so the match does not depend on which
  // Vector class (mllib vs. ml linalg) this Spark version emits; a mismatched type test
  // would otherwise raise a MatchError on the first row.
  case Row(id: Int, label: Double, text: String, probability, prediction: Double) =>
    println(s"($id, $text) --> prob=$probability, prediction=$prediction, label=$label")
}

In [None]:
// Accuracy on the scored test set, as a percentage: rows whose prediction equals the
// label, divided by all rows. Wrapped in a block so the intermediates stay local.
{
  val matching = modelem.filter("label = prediction").count().toDouble
  val total = modelem.count().toDouble
  (matching / total) * 100D
}