In [1]:
spark

Waiting for a Spark session to start...

org.apache.spark.sql.SparkSession@5f46d224

In [2]:
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DoubleType, IntegerType}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.functions.{col, udf}

In [3]:
val train_df = spark.read
.option("delimiter", ",")
.option("header", true)
.csv("/labs/slaba05/lab05_train.csv")

Waiting for a Spark session to start...

train_df = [_c0: string, ID: string ... 115 more fields]


[_c0: string, ID: string ... 115 more fields]

In [4]:
val test_df = spark.read
.option("delimiter", ",")
.option("header", true)
.csv("/labs/slaba05/lab05_test.csv")

test_df = [_c0: string, ID: string ... 114 more fields]


[_c0: string, ID: string ... 114 more fields]

## fit model on train_df

In [5]:
val fields_X_y = train_df.schema.fieldNames.filter(x => !(x == "ID" || x == "_c0"))

val train_df_casted = train_df.schema.filter(x => x.name != "TARGET").map(x => x.name).foldLeft(train_df) {
    case (folded_df, column_name) => folded_df.withColumn(column_name, col(column_name).cast(DoubleType))}

val train_df_casted_filled = train_df_casted.na.fill(0)

val feature_names = train_df_casted_filled.drop("_c0", "ID", "TARGET").schema.fieldNames
val assembler: VectorAssembler = new VectorAssembler()
                                        .setInputCols(feature_names)
                                        .setOutputCol("features")
                                        .setHandleInvalid("error")

val model: GBTClassifier = new GBTClassifier()
                                .setLabelCol("TARGET")
                                .setFeaturesCol("features")
                                .setPredictionCol("prediction_")
                                .setProbabilityCol("probability_")
                                .setRawPredictionCol("raw_prediction_")

val train_df_casted_filled_with_features = assembler.transform(train_df_casted_filled.drop("_c0", "ID"))

val train_df_casted_filled_with_features2 = train_df_casted_filled_with_features.drop(feature_names: _*)
                                                .withColumn("TARGET", col("TARGET").cast(DoubleType))
                                                .filter(col("TARGET").isNotNull)

val model_fitted = model.fit(train_df_casted_filled_with_features2)

val train_to_check = model_fitted.transform(train_df_casted_filled_with_features2)

fields_X_y = Array(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, APP_TRAVEL_PASS, CNT_ACCEPT...


Array(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, APP_TRAVEL_PASS, CNT_ACCEPT...

## evaluate model on train_df

In [6]:
val vecToArray = udf((xs: org.apache.spark.ml.linalg.Vector) => xs.toArray)

val metrics = new BinaryClassificationMetrics(
    train_to_check.select(vecToArray(col("probability_")).getItem(1).alias("prob"), col("TARGET"))
        .rdd.map(row => (row.getAs[Double]("prob"), row.getAs[Double]("TARGET"))))

metrics.areaUnderROC

vecToArray = UserDefinedFunction(<function1>,ArrayType(DoubleType,false),Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))
metrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@49463220


0.8265829492695888

## apply model on test_df

In [7]:
val test_df_casted = test_df.schema.filter(x => x.name != "TARGET").map(x => x.name).foldLeft(test_df) {
    case (folded_df, column_name) => folded_df.withColumn(column_name, col(column_name).cast(DoubleType))}

val test_df_casted_filled = test_df_casted.na.fill(0)

val test_df_casted_filled_with_features = assembler.transform(test_df_casted_filled)

val test_df_casted_filled_with_features2 = test_df_casted_filled_with_features.drop(feature_names: _*)

val test_to_check = model_fitted.transform(test_df_casted_filled_with_features2)

test_df_casted = [_c0: double, ID: double ... 114 more fields]
test_df_casted_filled = [_c0: double, ID: double ... 114 more fields]
test_df_casted_filled_with_features = [_c0: double, ID: double ... 115 more fields]
test_df_casted_filled_with_features2 = [_c0: double, ID: double ... 1 more field]
test_to_check = [_c0: double, ID: double ... 4 more fields]


[_c0: double, ID: double ... 4 more fields]

In [8]:
test_to_check.show(5)

+--------+--------+--------------------+--------------------+--------------------+-----------+
|     _c0|      ID|            features|     raw_prediction_|        probability_|prediction_|
+--------+--------+--------------------+--------------------+--------------------+-----------+
|372289.0|519130.0|(114,[19,21,43,44...|[0.77871996434737...|[0.82598569244220...|        0.0|
| 87204.0|234045.0|(114,[1,9,21,25,2...|[1.46755700663867...|[0.94955520086183...|        0.0|
|254415.0|401256.0|(114,[1,7,8,9,11,...|[1.50742241243035...|[0.95324027872665...|        0.0|
|404229.0|551070.0|(114,[43,44,45,46...|[1.34033211650541...|[0.93587599716558...|        0.0|
|220444.0|367285.0|(114,[21,46,61,67...|[1.47093893863527...|[0.94987820736100...|        0.0|
+--------+--------+--------------------+--------------------+--------------------+-----------+
only showing top 5 rows



## save results to files

In [9]:
test_to_check.select(col("ID"),vecToArray(col("probability_")).getItem(1).alias("target"))
    .write.option("header",true).csv("lab05_test.csv")

Name: org.apache.spark.sql.AnalysisException
Message: path hdfs://spark-master-1.newprolab.com:8020/user/anastasia.shabalina/lab05_test.csv already exists.;
StackTrace:   at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:114)
  at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
  at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
  at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spa

In [10]:
val collected = test_to_check.select(col("ID").cast(IntegerType), vecToArray(col("probability_")).getItem(1).alias("target")).collect()

val string_to_write = "id\ttarget" + "\n" + collected.map(x => Array(x(0), x(1))).map(y => y.mkString("\t")).mkString("\n")

import java.io.PrintWriter
new PrintWriter("lab05.csv") { write(string_to_write); close }

collected = Array([519130,0.17401430755779135], [234045,0.05044479913816702], [401256,0.04675972127334482], [551070,0.06412400283441888], [367285,0.05012179263899408], [497998,0.04980183358732948], [413082,0.09826151101411551], [349893,0.049872318800841864], [346337,0.13800344865519498], [289979,0.04879098018686989], [510818,0.1774711476452263], [235935,0.08371084224434533], [532135,0.24896083373726408], [564760,0.05405249493615094], [277391,0.14240831920696695], [336830,0.35779679612356996], [356053,0.04672938292276185], [293302,0.08213315353638717], [322368,0.04928306710303443], [406041,0.04672938292276185], [569179,0.05713474373137284], [191405,0.05536715881150078], [489011,0.04906123933359585], [265952,0.1915450574919848], [193718,0.07066298921978975...


lastException: Throwable = null


Array([519130,0.17401430755779135], [234045,0.05044479913816702], [401256,0.04675972127334482], [551070,0.06412400283441888], [367285,0.05012179263899408], [497998,0.04980183358732948], [413082,0.09826151101411551], [349893,0.049872318800841864], [346337,0.13800344865519498], [289979,0.04879098018686989], [510818,0.1774711476452263], [235935,0.08371084224434533], [532135,0.24896083373726408], [564760,0.05405249493615094], [277391,0.14240831920696695], [336830,0.35779679612356996], [356053,0.04672938292276185], [293302,0.08213315353638717], [322368,0.04928306710303443], [406041,0.04672938292276185], [569179,0.05713474373137284], [191405,0.05536715881150078], [489011,0.04906123933359585], [265952,0.1915450574919848], [193718,0.07066298921978975...