In [None]:
// Evaluate `spark` so the notebook echoes its value.
// NOTE(review): presumably this is the session pre-created by the notebook kernel — confirm.
spark

In [None]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.{col}

In [None]:
// Obtain the SparkSession for this notebook under the application name "Spark ML".
// getOrCreate() hands back an already-running session when one exists instead of building a new one.
val spark = SparkSession
    .builder()
    .appName("Spark ML")
    .getOrCreate()

In [None]:
// Storage locations used throughout the notebook.
// Input CSV holding the training rows (read into `trainDf`).
val TRAIN_DATA_PATH: String = "/user/vladimir.belov/lab05_train.csv"
// Input CSV holding the rows to be scored (read into `testDf`).
val TEST_DATA_PATH: String = "/user/vladimir.belov/lab05_test.csv"
// Destination path that receives the exported predictions.
val SAVE_PATH: String = "/user/vladimir.belov/lab05"

# Data preparation

In [None]:
// Read the training CSV (header row, comma-separated, column types inferred by Spark),
// discard the `_c0` column and cache the result for reuse.
// NOTE(review): `_c0` is presumably an unnamed index column left by the CSV export — confirm.
// Declared as `var` because a later cell reassigns it after cleaning.
var trainDf = {
    val trainReader = spark.read
        .option("header", "true")
        .option("delimiter", ",")
        .option("inferSchema", "true")
    trainReader
        .csv(TRAIN_DATA_PATH)
        .drop("_c0")
        .cache()
}

// Preview a single row, one field per line, without truncating long values.
trainDf.show(numRows = 1, truncate = 0, vertical = true)

In [None]:
// Read the test CSV (header row, comma-separated, column types inferred by Spark),
// discard the `_c0` column, replace nulls in numeric columns with 0 and cache the result.
// NOTE(review): `_c0` is presumably an unnamed index column left by the CSV export — confirm.
// Declared as `val`: the frame is never reassigned after loading.
val testDf = spark
    .read
    .format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .load(TEST_DATA_PATH)
    .drop("_c0")
    .na.fill(0)
    .cache

// Preview a single row of the test frame (one field per line, no truncation).
// The original cell previewed `trainDf` here, so the freshly loaded test data was never inspected.
testDf.show(1, 0, true)

In [None]:
// Column names that are dropped from the training frame before the feature vector is assembled.
// NOTE(review): the name suggests these are the categorical (string-valued) fields — confirm against the schema.
val categoriesCols: List[String] = List(
    "CLNT_TRUST_RELATION",
    "APP_MARITAL_STATUS",
    "APP_KIND_OF_PROP_HABITATION",
    "CLNT_JOB_POSITION_TYPE",
    "CLNT_JOB_POSITION",
    "APP_DRIVING_LICENSE",
    "APP_EDUCATION",
    "APP_TRAVEL_PASS",
    "APP_CAR",
    "APP_POSITION_TYPE",
    "APP_EMP_TYPE",
    "APP_COMP_TYPE",
    "PACK"
)

In [None]:
// Remove every column listed in `categoriesCols` from the training frame,
// then replace the remaining nulls in numeric columns with 0.
trainDf = trainDf
    .drop(categoriesCols: _*)
    .na
    .fill(0)

# Train model

In [None]:
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.feature.{VectorAssembler}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}

In [None]:
// Assembler that packs every remaining training column except the label `TARGET`
// into a single vector column named "features".
// NOTE(review): if the training frame carries an `ID` column (the test frame is later
// selected by `ID`), it is included as a feature here — confirm whether that is intended.
val vectorAssembler = {
    val featureColumns = trainDf.drop("TARGET").columns
    new VectorAssembler()
        .setInputCols(featureColumns)
        .setOutputCol("features")
}

In [None]:
// Training frame with the assembled "features" column appended; cached because
// both classifiers below are fitted on it.
val trainDfCleaned = vectorAssembler
    .transform(trainDf)
    .cache()

In [None]:
// Preview one assembled training row, one field per line, without truncating values.
trainDfCleaned.show(numRows = 1, truncate = 0, vertical = true)

In [None]:
// Fit a random-forest classifier on the assembled training data
// (label column `TARGET`, feature column "features", fixed seed 42).
// NOTE(review): the fitted model is not referenced again in the visible cells;
// predictions are produced with `gbt` — confirm whether this fit is still needed.
val randomForestModel = {
    val forest = new RandomForestClassifier()
        .setLabelCol("TARGET")
        .setFeaturesCol("features")
        .setSeed(42)
    forest.fit(trainDfCleaned)
}

In [None]:
// Fit a gradient-boosted-trees classifier on the assembled training data
// (label column `TARGET`, feature column "features", 10 boosting iterations,
// feature subset strategy "auto"). Despite the short name, `gbt` holds the
// fitted model returned by `fit`, not the estimator.
val gbt = {
    val boostedTrees = new GBTClassifier()
        .setLabelCol("TARGET")
        .setFeaturesCol("features")
        .setMaxIter(10)
        .setFeatureSubsetStrategy("auto")
    boostedTrees.fit(trainDfCleaned)
}

In [None]:
import org.apache.spark.ml.linalg.{SparseVector, Vector}
import org.apache.spark.mllib.linalg.{Vector => OldVector}

// Spark UDF that converts a vector column value into a plain array of doubles.
// Both the `ml` vector type and the legacy `mllib` vector type are accepted;
// any other value (including null) is rejected with an IllegalArgumentException
// naming the offending runtime class. asNonNullable() marks the UDF result as never null.
val vectorToArrayUdf = udf { (input: Any) =>
    input match {
      case mlVector: Vector => mlVector.toArray
      case mllibVector: OldVector => mllibVector.toArray
      case unsupported =>
        // Resolve the type name up front so the message also covers a null input.
        val actualType = if (unsupported == null) "null" else unsupported.getClass.getName
        throw new IllegalArgumentException(
          "function vector_to_array requires a non-null input argument and input type must be " +
          "`org.apache.spark.ml.linalg.Vector` or `org.apache.spark.mllib.linalg.Vector`, " +
          s"but got $actualType.")
    }
  }.asNonNullable()

In [None]:
// Assemble the feature vector for the test frame with the same assembler used for training,
// score it with the fitted GBT model and cache the scored frame.
val realPredictions = {
    val testFeatures = vectorAssembler.transform(testDf)
    gbt.transform(testFeatures).cache()
}

# Predict

In [None]:
// Export the scores: keep `ID` plus the element at index 1 of the `probability` vector
// (exposed as column `target`), collapse to a single partition so one part file is written,
// and save as tab-separated CSV with a header, overwriting any previous output at SAVE_PATH.
realPredictions
    .select(
        col("ID"),
        vectorToArrayUdf(col("probability")).getItem(1).as("target"))
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("sep", "\t")
    .csv(SAVE_PATH)

In [None]:
// Stop the SparkSession now that the predictions have been written; no Spark work can run after this call.
spark.stop()