In [2]:
import org.apache.spark.sql.types.{StringType, DoubleType, StructType, StructField, LongType}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.ml.feature.{Imputer, CountVectorizer, VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer}
import org.apache.spark.ml.classification.{GBTClassifier, GBTClassificationModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql._
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.TrainValidationSplit
import org.apache.spark.ml.linalg.Vector
import sys.process._

In [None]:
// Name of the label column; it is used for filtering, sampling, training and evaluation below.
val TARGET_COL: String = "TARGET"

In [None]:
 // Create (or reuse) the Spark session for this notebook, with
 // spark.executor.instances set to 16.
 val spark = SparkSession.builder()
   .config("spark.executor.instances", "16")
   .appName("mospan lab5")
   .getOrCreate()

In [None]:
// Training data: CSV with a header row and inferred column types. The "_c0" column
// is dropped and rows without a target value are discarded.
val trainDf = {
  val csvReader = spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
  val rawTrain = csvReader.load("lab5/lab05_train.csv")
  rawTrain.drop("_c0").filter(col(TARGET_COL).isNotNull)
}

In [None]:
// Expose the training data to Spark SQL under the temp view name "train_df".
trainDf.createOrReplaceTempView("train_df")

In [None]:
// Exploration: count the distinct 3-character prefixes of the upper-cased job position
// (a null position is treated as 'OTHER' before the prefix is taken).
spark.sql("select count(distinct substr(nvl(upper(CLNT_JOB_POSITION), 'OTHER'), 1, 3)) from train_df").show(50)

In [None]:
// Categorical columns. They are cleaned by processStringCols and excluded from the
// numeric (double) feature set.
val stringCols = List(
    "CLNT_TRUST_RELATION",
    "APP_MARITAL_STATUS",
    "APP_KIND_OF_PROP_HABITATION",
    "CLNT_JOB_POSITION_TYPE",
    "APP_DRIVING_LICENSE",
    "APP_EDUCATION",
    "APP_TRAVEL_PASS",
    "APP_CAR",
    "APP_POSITION_TYPE",
    "APP_EMP_TYPE",
    "APP_COMP_TYPE",
    "PACK",
    "CLNT_JOB_POSITION"
)

In [None]:
// Normalises the categorical columns listed in `stringCols` and passes every other
// column through unchanged.
//
//  - CLNT_TRUST_RELATION: known Russian spellings are mapped to English labels,
//    anything else is kept as is, null becomes 'OTHER'.
//  - APP_MARITAL_STATUS: upper-cased, null becomes 'OTHER'.
//  - CLNT_JOB_POSITION: upper-cased, null becomes 'OTHER', then cut to its first 3 characters.
//  - All remaining categorical columns (PACK included): null becomes 'OTHER'.
//
// Side effect: (re)registers `df` as the temp view "df".
// The result has the categorical columns first, followed by the pass-through columns
// in their original order.
def processStringCols(df: DataFrame): DataFrame = {
    // Back-quote pass-through names so that identifiers containing spaces, dots or
    // reserved words remain valid SQL.
    val passThroughCols = df.columns
        .filterNot(stringCols.contains)
        .map(name => "`" + name.replace("`", "``") + "`")
    // Emit the separating comma only when there is something to append; a DataFrame
    // made up solely of categorical columns must not yield a dangling comma.
    val passThroughSql =
        if (passThroughCols.isEmpty) "" else passThroughCols.mkString(", ", ", ", "")
    df.createOrReplaceTempView("df")
    spark.sql(s"""select case 
                    when CLNT_TRUST_RELATION in ('Мать', 'мама', 'мать') then 'MOTHER'
                    when CLNT_TRUST_RELATION = 'Сын' then 'SON'
                    when CLNT_TRUST_RELATION = 'Дочь' then 'DAUGHTER'
                    when CLNT_TRUST_RELATION = 'Друг' then 'FRIEND'
                    when CLNT_TRUST_RELATION = 'Отец' then 'FATHER'
                    when CLNT_TRUST_RELATION = 'Брат' then 'BROTHER'
                    when CLNT_TRUST_RELATION = 'Сестра' then 'SISTER'
                    when CLNT_TRUST_RELATION in ('Близкий ро', 'Дальний ро') then 'RELATIVE'
                    else  nvl(CLNT_TRUST_RELATION, 'OTHER') end as CLNT_TRUST_RELATION,
                nvl(upper(APP_MARITAL_STATUS), 'OTHER') as APP_MARITAL_STATUS,
                nvl(APP_KIND_OF_PROP_HABITATION, 'OTHER') as APP_KIND_OF_PROP_HABITATION,
                nvl(CLNT_JOB_POSITION_TYPE, 'OTHER') as CLNT_JOB_POSITION_TYPE,
                nvl(APP_DRIVING_LICENSE, 'OTHER') as APP_DRIVING_LICENSE,
                nvl(APP_EDUCATION, 'OTHER') as APP_EDUCATION,
                nvl(APP_TRAVEL_PASS, 'OTHER') as APP_TRAVEL_PASS,
                nvl(APP_CAR, 'OTHER') as APP_CAR,
                nvl(APP_POSITION_TYPE, 'OTHER') as APP_POSITION_TYPE,
                nvl(APP_EMP_TYPE, 'OTHER') as APP_EMP_TYPE,
                nvl(APP_COMP_TYPE, 'OTHER') as APP_COMP_TYPE,
                nvl(PACK, 'OTHER') as PACK,
                substr(nvl(upper(CLNT_JOB_POSITION), 'OTHER'), 1, 3) as CLNT_JOB_POSITION
                $passThroughSql
            from df""")
}

In [None]:
// Training data with the categorical columns normalised by processStringCols.
val trainHandleStringsDf = processStringCols(trainDf)

In [None]:
// val stringCols = trainProcessStringsDf.schema.fields
//     .filter(x => x.dataType == StringType && !processedStringCols.contains(x.name))
//     .map(x => x.name)

In [None]:
// val trainNoStringsDf = trainProcessStringsDf.drop(stringCols: _*)

In [None]:
// Casts every column to double except those that are already DoubleType, the "ID"
// column, the target column and the categorical columns in `stringCols`.
// Column names and column order are preserved.
def castIntColsToDouble(df: DataFrame): DataFrame = {
    val keepAsIs = df.schema.fields
        .filter(f => f.dataType == DoubleType || f.name == "ID" || f.name == TARGET_COL
                || stringCols.contains(f.name))
        .map(_.name)
        .toSet
    // Build one projection for all columns. Chaining one withColumn per column adds a
    // projection each time and can produce very large query plans on wide tables.
    val projection = df.columns.map { name =>
        if (keepAsIs.contains(name)) col(name) else col(name).cast("double").as(name)
    }
    df.select(projection: _*)
}

In [None]:
// Training data with the non-categorical feature columns cast to double.
val trainDoubleCastDf = castIntColsToDouble(trainHandleStringsDf)

In [None]:
// Names of the numeric feature columns: everything except the ID, the target and the
// categorical columns.
val doubleFeatureCols = {
    val excluded = "ID" :: TARGET_COL :: stringCols
    trainDoubleCastDf.schema.fields
        .map(_.name)
        .filterNot(excluded.contains)
}

In [None]:
// Stratified split on the target: each class is sampled with fraction 0.98 for
// training (fixed seed 41).
val finalTrainDf = trainDoubleCastDf.stat.sampleBy(
    TARGET_COL,
    Map(0 -> 0.98, 1 -> 0.98),
    41L
)
// Validation set = rows whose ID was not sampled into the training set.
// NOTE(review): assumes ID is unique per row — confirm against the data.
val valDf = trainDoubleCastDf
    .join(finalTrainDf, Seq("ID"), "leftanti")

In [None]:
// finalTrainDf.count()

In [None]:
// valDf.count()

In [None]:
// Sanity check: is "AGE" among the numeric feature columns?
doubleFeatureCols.contains("AGE")

In [None]:
// Builds a StringIndexer that reads `inputCol` and writes "<inputCol>_IND".
// handleInvalid = "keep" puts labels unseen during fit (and nulls) into an extra index
// instead of failing at transform time, which can otherwise happen on the validation
// split or on the test data.
def categoryIndexer(inputCol: String): StringIndexer = new StringIndexer()
    .setInputCol(inputCol)
    .setOutputCol(inputCol + "_IND")
    .setHandleInvalid("keep")

// Builds a OneHotEncoder that reads `indexedCol` and writes `outputCol`.
def categoryEncoder(indexedCol: String, outputCol: String): OneHotEncoder = new OneHotEncoder()
    .setInputCol(indexedCol)
    .setOutputCol(outputCol)

// Missing numeric values are replaced in place by the column mean.
val imputer = new Imputer()
    .setStrategy("mean")
    .setInputCols(doubleFeatureCols)
    .setOutputCols(doubleFeatureCols)

// One indexer/encoder pair per categorical column. The output column names are kept
// exactly as before, because the vector assembler refers to them by name.
val clntTrustRelIndexer = categoryIndexer("CLNT_TRUST_RELATION")
val clntTrustRelOhe = categoryEncoder("CLNT_TRUST_RELATION_IND", "CLNT_TRUST_RELATION_ONE_HOT")
val appMaritalStatusIndexer = categoryIndexer("APP_MARITAL_STATUS")
val appMaritalStatusOhe = categoryEncoder("APP_MARITAL_STATUS_IND", "APP_MARITAL_STATUS_ONE_HOT")
val appKindHabitIndexer = categoryIndexer("APP_KIND_OF_PROP_HABITATION")
val appKindHabitOhe = categoryEncoder("APP_KIND_OF_PROP_HABITATION_IND", "APP_KIND_OF_PROP_HABITATION_IND_ONE_HOT")
val clntJobIndexer = categoryIndexer("CLNT_JOB_POSITION_TYPE")
val clntJobOhe = categoryEncoder("CLNT_JOB_POSITION_TYPE_IND", "CLNT_JOB_POSITION_TYPE_ONE_HOT")
val appDriveLicenseIndexer = categoryIndexer("APP_DRIVING_LICENSE")
val appDriveLicenseOhe = categoryEncoder("APP_DRIVING_LICENSE_IND", "APP_DRIVING_LICENSE_ONE_HOT")
val appEducationIndexer = categoryIndexer("APP_EDUCATION")
val appEducationOhe = categoryEncoder("APP_EDUCATION_IND", "APP_EDUCATION_ONE_HOT")
val appTravelPassIndexer = categoryIndexer("APP_TRAVEL_PASS")
val appTravelPassOhe = categoryEncoder("APP_TRAVEL_PASS_IND", "APP_TRAVEL_PASS_IND_ONE_HOT")
val appCarIndexer = categoryIndexer("APP_CAR")
val appCarOhe = categoryEncoder("APP_CAR_IND", "APP_CAR_ONE_HOT")
val appPosTypeIndexer = categoryIndexer("APP_POSITION_TYPE")
val appPosTypeOhe = categoryEncoder("APP_POSITION_TYPE_IND", "APP_POSITION_TYPE_ONE_HOT")
val appEmpTypeIndexer = categoryIndexer("APP_EMP_TYPE")
val appEmpTypeOhe = categoryEncoder("APP_EMP_TYPE_IND", "APP_EMP_TYPE_ONE_HOT")
val appCompTypeIndexer = categoryIndexer("APP_COMP_TYPE")
val appCompTypeOhe = categoryEncoder("APP_COMP_TYPE_IND", "APP_COMP_TYPE_ONE_HOT")
val packIndexer = categoryIndexer("PACK")
val packOhe = categoryEncoder("PACK_IND", "PACK_ONE_HOT")
val jobPosIndexer = categoryIndexer("CLNT_JOB_POSITION")
val jobPosOhe = categoryEncoder("CLNT_JOB_POSITION_IND", "CLNT_JOB_POSITION_ONE_HOT")
// Concatenate the numeric columns and all one-hot vectors (in this order) into the
// single "features" vector.
val vectorAssembler = new VectorAssembler()
    .setInputCols(doubleFeatureCols ++ Array(
        "CLNT_TRUST_RELATION_ONE_HOT",
        "APP_MARITAL_STATUS_ONE_HOT",
        "APP_KIND_OF_PROP_HABITATION_IND_ONE_HOT",
        "CLNT_JOB_POSITION_TYPE_ONE_HOT",
        "APP_DRIVING_LICENSE_ONE_HOT",
        "APP_EDUCATION_ONE_HOT",
        "APP_TRAVEL_PASS_IND_ONE_HOT",
        "APP_CAR_ONE_HOT",
        "APP_POSITION_TYPE_ONE_HOT",
        "APP_EMP_TYPE_ONE_HOT",
        "APP_COMP_TYPE_ONE_HOT",
        "PACK_ONE_HOT",
        "CLNT_JOB_POSITION_ONE_HOT"
    ))
    .setOutputCol("features")
// val minMaxScaler = new MinMaxScaler().setMin(0).setMax(1).setInputCol("features").setOutputCol("min_max_features")
// val stdScaler = new StandardScaler().setInputCol("features").setOutputCol("norm_features")
// val layers = Array[Int](119, 32, 2)
// val mlp = new MultilayerPerceptronClassifier().setLabelCol(TARGET_COL).setFeaturesCol("norm_features")
//     .setLayers(layers)
//     .setBlockSize(32)
//     .setSeed(41L)
//     .setMaxIter(200)
// Gradient-boosted trees classifier over the assembled "features" vector.
// maxIter is not set here (a former `.setMaxIter(20)` call is disabled).
val gbt = new GBTClassifier()
  .setFeaturesCol("features")
  .setLabelCol(TARGET_COL)
  .setFeatureSubsetStrategy("auto")
// Pipeline order: imputation, one indexer/encoder pair per categorical column,
// feature assembly, classifier. The scaler stages (minMaxScaler / stdScaler) are
// currently disabled.
val stages = Array(
    imputer,
    clntTrustRelIndexer, clntTrustRelOhe,
    appMaritalStatusIndexer, appMaritalStatusOhe,
    appKindHabitIndexer, appKindHabitOhe,
    clntJobIndexer, clntJobOhe,
    appDriveLicenseIndexer, appDriveLicenseOhe,
    appEducationIndexer, appEducationOhe,
    appTravelPassIndexer, appTravelPassOhe,
    appCarIndexer, appCarOhe,
    appPosTypeIndexer, appPosTypeOhe,
    appEmpTypeIndexer, appEmpTypeOhe,
    appCompTypeIndexer, appCompTypeOhe,
    packIndexer, packOhe,
    jobPosIndexer, jobPosOhe,
    vectorAssembler,
    gbt
)
val pipeline = new Pipeline()
    .setStages(stages)

In [None]:
// val afterImputeDf = imputer.fit(finalTrainDf).transform(finalTrainDf)
// val afterIndexerDf = clntTrustRelIndexer.fit(afterImputeDf).transform(afterImputeDf)
// val afterOheDf = clntTrustRelOhe.transform(afterIndexerDf)
// val afterIndexer2Df = appMaritalStatusIndexer.fit(afterOheDf).transform(afterOheDf)
// val afterOhe2Df = appMaritalStatusOhe.transform(afterIndexer2Df)
// val afterIndexer3Df = packIndexer.fit(afterOhe2Df).transform(afterOhe2Df)
// val afterOhe3Df = packOhe.transform(afterIndexer3Df)
// val afterVecAssemblerDf = vectorAssembler.transform(afterOhe3Df)
// afterVecAssemblerDf.select("features").show(1)

In [None]:
// Hyper-parameter grid for the GBT stage. Each parameter has a single candidate, so
// the grid contains exactly one combination.
val params = {
    val grid = new ParamGridBuilder()
    grid.addGrid(gbt.maxDepth, Array(10))
    grid.addGrid(gbt.maxBins, Array(100))
    grid.addGrid(gbt.maxIter, Array(100))
    grid.addGrid(gbt.stepSize, Array(0.1))
    grid.build()
}

In [None]:
// Scores predictions by area under the ROC curve against the target column.
val evaluator = new BinaryClassificationEvaluator()
    .setLabelCol(TARGET_COL)
    .setMetricName("areaUnderROC")

In [None]:
// Train/validation split over the parameter grid: 95% of the rows given to fit are
// used for training, the rest for scoring each candidate with `evaluator`.
val tvs = new TrainValidationSplit()
    .setEstimator(pipeline)
    .setEstimatorParamMaps(params)
    .setEvaluator(evaluator)
    .setTrainRatio(0.95)

// Fit on the training portion of the data.
val fittedTvs = tvs.fit(finalTrainDf)

In [None]:
// Area under ROC of the fitted model on the held-out validation rows.
evaluator.evaluate(fittedTvs.transform(valDf))

In [None]:
// Presumably values noted from an earlier run — verify before relying on them:
// maxDepth = 10
// maxBins = 120
// maxIter = 100

// Print the hyper-parameters of the GBT model inside the best pipeline.
// The stage is located by type rather than by a fixed position, so adding or removing
// pipeline stages cannot silently select the wrong one.
val bestPipelineModel = fittedTvs.bestModel match {
    case pipelineModel: PipelineModel => pipelineModel
    case other => throw new IllegalStateException(
        s"Expected the best model to be a PipelineModel but found ${other.getClass.getName}")
}
val bestGbtStage = bestPipelineModel.stages
    .collectFirst { case gbtModel: GBTClassificationModel => gbtModel }
    .getOrElse(throw new IllegalStateException(
        "The best pipeline does not contain a GBTClassificationModel stage"))
println(s"maxDepth = ${bestGbtStage.getMaxDepth}")
println(s"maxBins = ${bestGbtStage.getMaxBins}")
println(s"maxIter = ${bestGbtStage.getMaxIter}")
println(s"stepSize = ${bestGbtStage.getStepSize}")

In [None]:
// Test data: CSV with a header row and inferred column types, read in "failfast"
// mode. The "_c0" column is dropped.
val testDf = {
  val csvReader = spark.read.format("csv")
    .option("mode", "failfast")
    .option("inferSchema", "true")
    .option("header", "true")
  csvReader.load("lab5/lab05_test.csv").drop("_c0")
}

In [None]:
// Test data with the categorical columns normalised the same way as the training data.
val testHandleStringsDf = processStringCols(testDf)
// val testNoStringsDf = testProcessStringsDf.drop(stringCols: _*)

In [None]:
// Test data with the non-categorical feature columns cast to double.
val testDoubleCastDf = castIntColsToDouble(testHandleStringsDf)

In [None]:
// Apply the fitted model to the prepared test data.
val submissionPredictions = fittedTvs.transform(testDoubleCastDf)

In [None]:
// UDF returning the element at index 1 of an ML vector.
val vectorSecondElementUdf = udf { (vector: Vector) =>
    val values = vector.toArray
    values(1)
}

In [None]:
// Submission frame: the id plus "target", taken from index 1 of the "probability" vector.
val resultDf = {
    val idWithProbability = submissionPredictions.select("id", "probability")
    val withTarget = idWithProbability
        .withColumn("target", vectorSecondElementUdf(col("probability")))
    withTarget.select("id", "target")
}

In [None]:
// Write the submission as one tab-separated CSV part file with a header row,
// replacing any previous output under lab5/results.
resultDf
    .coalesce(1)
    .write
    .mode("overwrite")
    .format("csv")
    .option("sep", "\t")
    .option("header", "true")
    .save("lab5/results")

In [None]:
// List the files written to lab5/results; evaluates to the command's exit code.
"hdfs dfs -ls lab5/results".!

In [3]:
// Copy lab05_best.csv to lab05.csv; evaluates to the command's exit code.
"cp lab05_best.csv lab05.csv".!



0

In [None]:
// Delete the local lab05.csv; evaluates to the command's exit code.
"rm lab05.csv".!

In [None]:
// Download the single part file written to lab5/results as lab05.csv.
// The part file name contains a UUID that changes on every write, so it is matched
// with a glob instead of being hard-coded. The command string is split on whitespace
// and run without a local shell, so the pattern reaches `hdfs dfs` unexpanded and is
// resolved there.
"hdfs dfs -get lab5/results/part-*.csv lab05.csv".!

In [None]:
// Shut down the Spark session at the end of the notebook.
spark.stop()