In [None]:
// Evaluate `spark` so the notebook echoes the session object
// (presumably the kernel-provided SparkSession -- confirm).
spark

In [None]:
// Echo the version reported by the `spark` object.
spark.version

In [None]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}

In [None]:
// Reader settings shared by the train and test CSV files:
// comma-separated, first line is a header, UTF-8 encoded.
val csvReadOptions = Map(
  "delimiter" -> ",",
  "header" -> "True",
  "encoding" -> "utf-8"
)

// Training rows; rows whose TARGET is null are discarded.
val train_df = spark.read
  .options(csvReadOptions)
  .csv("/labs/slaba05/lab05_train.csv")
  .where("TARGET is not null")

// Test rows, read with the same settings and left unfiltered.
val test_df = spark.read
  .options(csvReadOptions)
  .csv("/labs/slaba05/lab05_test.csv")

In [None]:
// Columns removed from both data sets before any further processing.
val droppedColumns = Seq(
  "_c0",
  "CLNT_TRUST_RELATION",
  "APP_MARITAL_STATUS",
  "APP_KIND_OF_PROP_HABITATION",
  "CLNT_JOB_POSITION_TYPE",
  "CLNT_JOB_POSITION",
  "APP_DRIVING_LICENSE",
  "APP_EDUCATION",
  "APP_TRAVEL_PASS",
  "APP_CAR",
  "APP_POSITION_TYPE",
  "APP_EMP_TYPE",
  "APP_COMP_TYPE",
  "PACK"
)

// Declared as `var` because later cells reassign both frames.
var df_train = train_df.drop(droppedColumns: _*)
var df_test = test_df.drop(droppedColumns: _*)

In [None]:
// Every training column except ID is cast to float, in column order.
val trainColsToCast = df_train.columns.filter(_ != "ID")
df_train = trainColsToCast.foldLeft(df_train) { (frame, name) =>
  frame.withColumn(name, col(name).cast(FloatType))
}

// The cast column names, in reverse column order.
var feature_list = trainColsToCast.reverse

// Model inputs: every cast column whose name does not contain "TARGET".
val b = feature_list.filterNot(_.contains("TARGET"))

In [None]:
// Cast every test column except ID to float, one column at a time.
df_test = df_test.columns
  .filter(_ != "ID")
  .foldLeft(df_test) { (frame, name) =>
    frame.withColumn(name, col(name).cast(FloatType))
  }

In [None]:
// Replace missing values with 0.0 in both frames (the Double overload of
// `na.fill`), so the feature columns carry no nulls into the assembler.
df_train = df_train.na.fill(0.0)
df_test = df_test.na.fill(0.0)

In [None]:
// Packs the columns named in `b` into a single vector column "features".
val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(b)

In [None]:
// The label column is re-cast to an integer type on the training frame.
val targetAsInt = col("TARGET").cast(IntegerType)
df_train = df_train.withColumn("TARGET", targetAsInt)

In [None]:
// Training frame with the assembled "features" vector column appended.
val train_data: DataFrame = assembler.transform(df_train)

In [None]:
// Area-under-ROC evaluator: compares the "TARGET" label with the scores
// found in the "probability_" column.
val evaluator = new BinaryClassificationEvaluator()
  .setMetricName("areaUnderROC")
  .setRawPredictionCol("probability_")
  .setLabelCol("TARGET")

In [None]:
// Gradient-boosted tree classifier: reads "features" and "TARGET", and
// writes its outputs to underscore-suffixed columns.
val gbt = new GBTClassifier()
  .setLabelCol("TARGET")
  .setFeaturesCol("features")
  .setRawPredictionCol("raw_prediction_")
  .setProbabilityCol("probability_")
  .setPredictionCol("prediction_")

In [None]:
// Test frame with the assembled "features" vector column appended.
val test_data: DataFrame = assembler.transform(df_test)

In [None]:
// Fit the classifier on the assembled training data.
val model_gbt: GBTClassificationModel = gbt.fit(train_data)

In [None]:
// Score the training data with the fitted model.
val predictions_train: DataFrame = model_gbt.transform(train_data)

In [None]:
// Area under ROC, computed on the training predictions (not on held-out data).
val roc_auc: Double = evaluator.evaluate(predictions_train)

In [None]:
// Score the test data with the fitted model.
val predictions_test: DataFrame = model_gbt.transform(test_data)

In [None]:
// UDF that turns an ML vector into a plain array of its values, so single
// elements can be selected with `getItem`.
val sparse_to_dense = udf { (vec: Vector) => vec.toArray }

In [None]:
// Preview the submission: the row id and element 1 of the probability vector.
predictions_test
  .select(
    col("ID").as("id"),
    sparse_to_dense(col("probability_")).getItem(1).as("target")
  )
  .show()

In [None]:
// Submission frame (id, target), collapsed to a single partition.
val to_file = predictions_test
  .select(
    col("ID").as("id"),
    sparse_to_dense(col("probability_")).getItem(1).as("target")
  )
  .coalesce(1)

In [None]:
// Write the submission as tab-separated CSV with a header row, replacing
// any existing output at "lab05".
to_file.write
  .mode("overwrite")
  .option("delimiter", "\t")
  .option("header", "True")
  .option("encoding", "UTF-8")
  .csv("lab05")