In [None]:
import org.apache.spark.sql.SparkSession

// Obtain the Spark session used by the rest of the notebook: master "local",
// application name "spark-model". getOrCreate() reuses an existing session if one is active.
val spark = SparkSession
  .builder()
  .appName("spark-model")
  .master("local")
  .getOrCreate()

In [None]:
// Print the version reported by the active Spark session.
val sparkVersion: String = spark.version
println(s"Current spark version is $sparkVersion")

In [None]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Explicit schema for the input CSV: six columns, all nullable (the StructField default).
val dataSchema = StructType(Seq(
  StructField("target", IntegerType),
  StructField("id", LongType),
  StructField("raw_timestamp", StringType),
  StructField("query_status", StringType),
  StructField("author", StringType),
  StructField("tweet", StringType)
))

    
// Path of the training CSV file.
val dataPath = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Read the CSV (no header row) with the explicit schema, then keep only the tweet text and
// a binary label: 1 when `target` equals 4, 0 otherwise.
val raw_sentiment = {
  val csvReader = spark.read
    .schema(dataSchema)
    .option("header", false)
    .format("csv")

  csvReader
    .load(dataPath)
    .selectExpr("(case when target=4 then 1 else 0 end) as label", "tweet")
}

// Show the number of rows per label value.
// The column is referenced by name rather than with the `$"..."` interpolator, so this
// statement does not rely on `spark.implicits._`, which the visible code never imports.
raw_sentiment.groupBy("label").count().show()

In [None]:
import org.apache.spark.sql.functions._

// UDF returning the element at index 1 of an ML vector. It is applied to the `probability`
// column further down — presumably the probability of label 1; confirm against the model output.
val getProbability = udf[Double, org.apache.spark.ml.linalg.Vector] { probabilities =>
  probabilities(1)
}

In [None]:
// Randomly partition the data into a 70% training set and a 30% test set,
// using a fixed seed for the split.
val seed = 1234
val (trainingData, testingData) =
  raw_sentiment.randomSplit(Array(0.7, 0.3), seed) match {
    case Array(train, test) => (train, test)
  }
// Mark the training set to be cached.
trainingData.cache()

In [None]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

// Stage 1: tokenize the `tweet` column into a `words` column.
val tokenizer = new Tokenizer().setOutputCol("words").setInputCol("tweet")

// Stage 2: hash the tokenizer's output column into a `features` vector with 1000 features.
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)

// Stage 3: random forest classifier with 10 trees, reading `features` and learning `label`.
val rf = new RandomForestClassifier()
  .setNumTrees(10)
  .setFeaturesCol("features")
  .setLabelCol("label")

// Chain the three stages in order: tokenizer -> hashing TF -> random forest.
val pipeline = {
  val orderedStages = Array(tokenizer, hashingTF, rf)
  new Pipeline().setStages(orderedStages)
}


In [None]:
// Fit the whole pipeline on the training set.
val model: PipelineModel = pipeline.fit(trainingData)


In [None]:
// Inspect the fitted random forest stage of the pipeline model.
// The stage is located by type with a pattern match instead of a hard-coded index plus an
// unchecked cast, so a change in stage order cannot pick the wrong stage, and a missing
// stage fails with an explicit message rather than a ClassCastException.
model.stages
  .collectFirst { case forest: RandomForestClassificationModel => forest }
  .getOrElse(sys.error("No RandomForestClassificationModel stage found in the fitted pipeline"))


In [None]:
// Run the fitted pipeline over the held-out test set.
val predictionDF: org.apache.spark.sql.DataFrame = model.transform(testingData)


In [None]:
// Display the tweet, its label, element 1 of the probability vector (as `clean_probability`)
// and the predicted class.
// Columns are built with `col(...)` from the already-imported `functions._` instead of the
// `$"..."` interpolator, which would require `spark.implicits._` — never imported in the
// visible code.
predictionDF
  .select(
    col("tweet"),
    col("label"),
    getProbability(col("probability")).alias("clean_probability"),
    col("prediction")
  )
  .show()

In [None]:
// Score the predictions with the area-under-ROC metric, comparing the `probability`
// column against the `label` column.
val evaluator = new BinaryClassificationEvaluator()
  .setRawPredictionCol("probability")
  .setMetricName("areaUnderROC")
  .setLabelCol("label")

// Compute the metric on the test-set predictions.
evaluator.evaluate(predictionDF)


In [None]:
// Persist the fitted pipeline model, replacing any model already saved at this path.
val modelPath = "/home/jovyan/models/spark-ml-model"
model.write.overwrite().save(modelPath)

In [None]:
// Stop the Spark session now that the notebook's work is finished.
spark.stop()