In [0]:
import org.apache.spark.sql.types._
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.Model
import org.apache.spark.sql._

In [1]:
/**
 * Loads the CSV file at `path` into a DataFrame through the notebook's `spark` session.
 *
 * The reader is configured so that the first row is taken as the header,
 * fields are comma separated, the double quote serves both as quote and as
 * escape character, the charset option is utf-8 and quoted values may span
 * several lines.
 */
def readCsv(path: String): DataFrame = {
    val csvOptions = Map(
        "header" -> "true",
        "quote" -> "\"",
        "delimiter" -> ",",
        "charset" -> "utf-8",
        "escape" -> "\"",
        "multiline" -> "true"
    )
    spark.read.options(csvOptions).csv(path)
}

/** Returns the pair (number of rows, number of columns) of `dataset`. */
def getShape(dataset: DataFrame): (Long, Int) = {
    val rowCount = dataset.count()
    val columnCount = dataset.columns.length
    (rowCount, columnCount)
}

/**
 * Builds a boolean Column that is true where `column` carries no usable value:
 * it is NULL, equals the empty string, is NaN, or equals the text "nan".
 *
 * The equality checks are null-safe, so they evaluate to false (not NULL)
 * when the value itself is NULL.
 */
def isNullOrEmpty(column: Column): Column = {
    val missingValueChecks = Seq(
        column.isNull,
        column.eqNullSafe(""),
        column.isNaN,
        column.eqNullSafe("nan")
    )
    missingValueChecks.reduce((left, right) => left or right)
}

In [2]:
// Locations of the three input CSV files.
val submitDataPath: String = "/data/submit.csv"
val testDataPath: String = "/data/test.csv"
val trainDataPath: String = "/data/train.csv"

// Load every file with the shared CSV reader settings of readCsv.
val submitData: DataFrame = readCsv(submitDataPath)
val testData: DataFrame = readCsv(testDataPath)
val trainData: DataFrame = readCsv(trainDataPath)

In [3]:
// Preview the first 10 rows of each dataset, then print each (rows, columns) shape.
Seq(trainData, testData, submitData).foreach(_.show(10))
Seq("Train" -> trainData, "Test" -> testData, "Submit" -> submitData).foreach { case (name, dataset) =>
    println(s"$name Shape : ${getShape(dataset)}")
}

In [4]:
// Show the first two submit rows without truncating long cell values.
submitData.show(numRows = 2, truncate = false)

In [5]:
// Profile the training set: schema, per-column row counts, and for every
// column how many values are usable versus missing according to isNullOrEmpty.
println("Train data summary")
println("Schema")
trainData.printSchema()
println("Summary (all)")
trainData.summary("count").show()
Seq(
    ("Summary (not nulls)", (value: Column) => !isNullOrEmpty(value)),
    ("Summary (nulls)", (value: Column) => isNullOrEmpty(value))
).foreach { case (heading, keepRow) =>
    println(heading)
    Seq("id", "title", "author", "text", "label").foreach { name =>
        val matchingRows = trainData.select(name).filter(keepRow(col(name))).count()
        println(s"$name - $matchingRows")
    }
}

In [6]:
// Attach the labels from the submit file to the test rows by matching ids;
// the renamed helper column is dropped again after the join.
val joinedTestData = testData
    .join(submitData.withColumnRenamed("id", "id2"), col("id") === col("id2"), "inner")
    .drop("id2")

// Replace missing values (as defined by isNullOrEmpty) in the text columns with "".
val filledTrainData = Seq("title", "author", "text").foldLeft(trainData) { (frame, name) =>
    frame.withColumn(name, when(isNullOrEmpty(col(name)), "").otherwise(col(name)))
}

val filledTestData = Seq("title", "author", "text").foldLeft(joinedTestData) { (frame, name) =>
    frame.withColumn(name, when(isNullOrEmpty(col(name)), "").otherwise(col(name)))
}

// Concatenate title, author and text (space separated) into one "total" column.
val trainDataWithTotal = filledTrainData
    .withColumn("total", concat_ws(" ", col("title"), col("author"), col("text")))

val testDataWithTotal = filledTestData
    .withColumn("total", concat_ws(" ", col("title"), col("author"), col("text")))

// Keep only the integer label and the combined text for modelling.
val trainingData = trainDataWithTotal
    .select(col("label").cast("int").as("label"), col("total"))

val testingData = testDataWithTotal
    .select(col("label").cast("int").as("label"), col("total"))

In [7]:
// Preview the first 10 rows of the prepared training and testing frames.
Seq(trainingData, testingData).foreach(_.show(10))

In [8]:
// Split the combined "total" text into a "words" array.
val tokenizer = new Tokenizer()
    .setInputCol("total")
    .setOutputCol("words")

// Hash the words into a fixed-size term-frequency vector.
// NOTE(review): 30 hash buckets is very small for free text and will cause many
// collisions — presumably chosen to keep training fast; confirm before reuse.
val hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(30)

// Re-weight term frequencies by inverse document frequency (minimum document frequency 5).
val idf = new IDF()
    .setInputCol("rawFeatures")
    .setOutputCol("features")
    .setMinDocFreq(5)

// Feature-extraction pipeline: tokenizer -> hashingTF -> idf.
val pipeline = new Pipeline()
    .setStages(Array(tokenizer, hashingTF, idf))

In [9]:
// Fit the feature pipeline on the training data only, then apply the fitted
// pipeline to both the training and the testing frames.
val model: PipelineModel = pipeline.fit(trainingData)
val transformedTrainingData: DataFrame = model.transform(trainingData)
val transformedTestingData: DataFrame = model.transform(testingData)

In [10]:
// Preview the first 10 rows of the featurised training data.
transformedTrainingData.show(numRows = 10)

In [11]:
/**
 * Trains a logistic-regression classifier on `trainData` and prints its
 * quality on `testData`.
 *
 * Both frames must contain the "label" and "features" columns.
 *
 * Two metrics are printed. The area under the ROC curve is the value this
 * function has always computed (it is the default metric of
 * BinaryClassificationEvaluator), but it used to be printed under the label
 * "Model accuracy". It is now labelled correctly, and the real accuracy
 * (share of rows whose predicted label equals the true label) is reported on
 * its own line.
 *
 * @param trainData       featurised training rows
 * @param testData        featurised evaluation rows
 * @param regParam        regularisation strength
 * @param maxIter         maximum number of optimiser iterations
 * @param elasticNetParam elastic-net mixing parameter passed to the classifier
 */
def logisticRegressionModelLearning(trainData: DataFrame, testData: DataFrame, regParam: Double = 0.0, maxIter: Int = 100, elasticNetParam: Double = 0.0): Unit = {
    val classifier = new LogisticRegression()
        .setLabelCol("label")
        .setFeaturesCol("features")
        .setAggregationDepth(2)
        .setThreshold(0.5)
        .setFamily("auto")
        .setStandardization(true)
        .setFitIntercept(true)
        .setMaxIter(maxIter)
        .setTol(1E-6)
        .setRegParam(regParam)
        .setElasticNetParam(elasticNetParam)
    val model = classifier.fit(trainData)
    val predictions = model.transform(testData)

    // Ranking quality, computed from the raw prediction column. The metric
    // name is set explicitly so the printed label cannot drift from the value.
    val areaUnderROC = new BinaryClassificationEvaluator()
        .setLabelCol("label")
        .setMetricName("areaUnderROC")
        .evaluate(predictions)

    // True accuracy, computed from the thresholded "prediction" column.
    // Fully qualified because the notebook imports only the binary evaluator.
    val accuracy = new org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
        .evaluate(predictions)

    println(s"Model areaUnderROC: ${areaUnderROC}")
    println(s"Model accuracy: ${accuracy}")
}
// Baseline run: no regularisation, 100 iterations.
logisticRegressionModelLearning(
    transformedTrainingData,
    transformedTestingData,
    regParam = 0.0,
    maxIter = 100,
    elasticNetParam = 0.0
)

In [12]:
// Manual hyper-parameter sweep for logistic regression.
// Each row is (regParam, maxIter, elasticNetParam); rows run in the listed order.
Seq(
    (0.01, 1, 0.0),
    (0.01, 5, 0.5),
    (0.01, 10, 0.10),
    (0.5, 1, 0.0),
    (0.5, 5, 0.5),
    (0.5, 10, 0.10),
    (2.0, 1, 0.0),
    (2.0, 5, 0.5),
    (2.0, 10, 0.10)
).foreach { case (regParam, maxIter, elasticNetParam) =>
    logisticRegressionModelLearning(
        transformedTrainingData,
        transformedTestingData,
        regParam = regParam,
        maxIter = maxIter,
        elasticNetParam = elasticNetParam
    )
}

In [13]:
/**
 * Trains a random-forest classifier on `trainData` and prints its quality on
 * `testData`.
 *
 * Both frames must contain the "label" and "features" columns.
 *
 * Two metrics are printed. The area under the ROC curve is the value this
 * function has always computed (it is the default metric of
 * BinaryClassificationEvaluator), but it used to be printed under the label
 * "Model accuracy". It is now labelled correctly, and the real accuracy
 * (share of rows whose predicted label equals the true label) is reported on
 * its own line.
 *
 * @param trainData   featurised training rows
 * @param testData    featurised evaluation rows
 * @param maxBins     value passed to the classifier's maxBins setting
 * @param minInfoGain value passed to the classifier's minInfoGain setting
 * @param numTrees    number of trees in the forest
 * @param impurity    impurity measure name (the notebook uses "gini" and "entropy")
 */
def randomForestModelLearning(trainData: DataFrame, testData: DataFrame, maxBins: Int = 32, minInfoGain: Double = 0.0, numTrees: Int = 20, impurity: String = "gini"): Unit = {
    val classifier = new RandomForestClassifier()
        .setLabelCol("label")
        .setFeaturesCol("features")
        .setMaxBins(maxBins)
        .setMinInfoGain(minInfoGain)
        .setNumTrees(numTrees)
        .setImpurity(impurity)
    val model = classifier.fit(trainData)
    val predictions = model.transform(testData)

    // Ranking quality, computed from the raw prediction column. The metric
    // name is set explicitly so the printed label cannot drift from the value.
    val areaUnderROC = new BinaryClassificationEvaluator()
        .setLabelCol("label")
        .setMetricName("areaUnderROC")
        .evaluate(predictions)

    // True accuracy, computed from the "prediction" column.
    // Fully qualified because the notebook imports only the binary evaluator.
    val accuracy = new org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
        .evaluate(predictions)

    println(s"Model areaUnderROC: ${areaUnderROC}")
    println(s"Model accuracy: ${accuracy}")
}
// Baseline run: 20 gini trees, 32 bins, no minimum information gain.
randomForestModelLearning(
    transformedTrainingData,
    transformedTestingData,
    maxBins = 32,
    minInfoGain = 0.0,
    numTrees = 20,
    impurity = "gini"
)

In [14]:
// Manual hyper-parameter sweep for the random forest: every combination of
// impurity x numTrees x maxBins x minInfoGain, with minInfoGain varying fastest
// and impurity slowest. The 1-based run number is printed before each run.
(for {
    impurity <- Seq("gini", "entropy")
    numTrees <- Seq(20, 60)
    maxBins <- Seq(25, 31)
    minInfoGain <- Seq(0.01, 0.001)
} yield (maxBins, minInfoGain, numTrees, impurity)).zipWithIndex.foreach {
    case ((maxBins, minInfoGain, numTrees, impurity), index) =>
        println((index + 1).toString)
        randomForestModelLearning(
            transformedTrainingData,
            transformedTestingData,
            maxBins = maxBins,
            minInfoGain = minInfoGain,
            numTrees = numTrees,
            impurity = impurity
        )
}

In [15]:
/**
 * Trains a linear SVM classifier on `trainData` and prints its quality on
 * `testData`.
 *
 * Both frames must contain the "label" and "features" columns.
 *
 * Two metrics are printed. The area under the ROC curve is the value this
 * function has always computed (it is the default metric of
 * BinaryClassificationEvaluator), but it used to be printed under the label
 * "Model accuracy". It is derived from the raw prediction column, so it does
 * not react to `threshold` at all. The real accuracy (share of rows whose
 * predicted label equals the true label) is now reported on its own line;
 * that is the figure a change of `threshold` can move.
 *
 * @param trainData featurised training rows
 * @param testData  featurised evaluation rows
 * @param regParam  regularisation strength
 * @param maxIter   maximum number of optimiser iterations
 * @param threshold decision threshold; LinearSVC applies it to the raw
 *                  prediction, not to a probability
 */
def svmModelLearning(trainData: DataFrame, testData: DataFrame, regParam: Double = 0.0, maxIter: Int = 100, threshold: Double = 0.0): Unit = {
    val classifier = new LinearSVC()
        .setLabelCol("label")
        .setFeaturesCol("features")
        .setAggregationDepth(2)
        .setStandardization(true)
        .setTol(1E-6)
        .setFitIntercept(true)
        .setMaxIter(maxIter)
        .setRegParam(regParam)
        .setThreshold(threshold)
    val model = classifier.fit(trainData)
    val predictions = model.transform(testData)

    // Ranking quality, computed from the raw prediction column. The metric
    // name is set explicitly so the printed label cannot drift from the value.
    val areaUnderROC = new BinaryClassificationEvaluator()
        .setLabelCol("label")
        .setMetricName("areaUnderROC")
        .evaluate(predictions)

    // True accuracy, computed from the thresholded "prediction" column.
    // Fully qualified because the notebook imports only the binary evaluator.
    val accuracy = new org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
        .evaluate(predictions)

    println(s"Model areaUnderROC: ${areaUnderROC}")
    println(s"Model accuracy: ${accuracy}")
}
//svmModelLearning(transformedTrainingData, transformedTestingData, regParam = 0.0, maxIter = 100, threshold = 0.0)

In [16]:
// Manual hyper-parameter sweep for the linear SVM.
// Each row is (regParam, maxIter, threshold); the 1-based run number is
// printed before each run and rows run in the listed order.
Seq(
    (0.01, 1, 0.3),
    (0.01, 5, 0.5),
    (0.01, 10, 0.8),
    (0.5, 1, 0.3),
    (0.5, 5, 0.5),
    (0.5, 10, 0.8),
    (2.0, 1, 0.3),
    (2.0, 5, 0.5),
    (2.0, 10, 0.8)
).zipWithIndex.foreach { case ((regParam, maxIter, threshold), index) =>
    println((index + 1).toString)
    svmModelLearning(
        transformedTrainingData,
        transformedTestingData,
        regParam = regParam,
        maxIter = maxIter,
        threshold = threshold
    )
}