In [None]:
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StringType
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.{Pipeline, PipelineModel}

import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier, LogisticRegression}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DoubleType

import org.apache.spark.ml.feature.VectorAssembler
import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import sys.process._


// Build a Spark configuration that requests 4g of driver memory and start a SparkContext from it.
// NOTE(review): spark.driver.memory set on a SparkConf may not take effect when the driver JVM
// is already running (e.g. inside a notebook kernel) -- confirm how this kernel is launched.
val conf: SparkConf = new SparkConf().set("spark.driver.memory", "4g")
val sc: SparkContext = new SparkContext(conf)

In [None]:
// File-system handle obtained from the Hadoop configuration carried by the Spark session.
val fs: FileSystem = {
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    FileSystem.get(hadoopConf)
}
// Path of the result file that is removed (if present) before predictions are written.
val result_path: Path = new Path("lab05.csv")

In [None]:
// Both input files carry a header row; column types are inferred by Spark while reading.
val csvOptions: Map[String, String] = Map("header" -> "true", "inferSchema" -> "true")

// Reads one CSV file with the shared options above.
def loadCsv(path: String): DataFrame = spark.read.options(csvOptions).csv(path)

val train: DataFrame = loadCsv("/labs/slaba05/lab05_train.csv")
val test: DataFrame = loadCsv("/labs/slaba05/lab05_test.csv")

In [None]:
/**
 * Builds the model input for `df`: indexes the categorical string columns, casts a fixed set
 * of columns to double, and assembles every remaining column into a single `features` vector.
 *
 * The feature vector contains every column of the transformed frame except `_c0`, `ID`,
 * `TARGET` and the raw categorical columns (their `*_IND` counterparts are used instead).
 * Feature order follows the column order of `df`, followed by the `*_IND` columns in the
 * order listed in `categoricalCols`.
 *
 * NOTE(review): the indexers are fit on `df` itself, so two separate calls on different data
 * learn their label-to-index mappings independently -- confirm this is intended when the
 * results of separate calls are fed to the same model.
 *
 * @param df input rows containing the categorical and numeric columns named below
 * @return `df` with the `*_IND` columns and a `features` vector column added
 */
def create_features(df : DataFrame) : DataFrame = {

    // Categorical string columns; each gets a "<name>_IND" index column. The order is kept
    // because it determines the order of the appended columns and thus of the features.
    val categoricalCols = Array(
        "CLNT_TRUST_RELATION",
        "APP_MARITAL_STATUS",
        "APP_KIND_OF_PROP_HABITATION",
        "CLNT_JOB_POSITION_TYPE",
        "CLNT_JOB_POSITION",
        "APP_DRIVING_LICENSE",
        "APP_EDUCATION",
        "APP_TRAVEL_PASS",
        "APP_CAR",
        "APP_POSITION_TYPE",
        "APP_EMP_TYPE",
        "APP_COMP_TYPE",
        "PACK"
    )

    // Columns that are explicitly cast to double before assembling.
    val doubleCols = Seq(
        "CR_PROD_CNT_IL",
        "CR_PROD_CNT_VCU",
        "CR_PROD_CNT_TOVR",
        "CR_PROD_CNT_PIL",
        "AGE",
        "CR_PROD_CNT_CC",
        "CR_PROD_CNT_CCFP"
    )

    // Columns that must not end up in the feature vector.
    val nonFeatureCols = Set("_c0", "ID", "TARGET") ++ categoricalCols

    // handleInvalid = "keep" is set on every indexer, as in the per-column configuration
    // this replaces.
    val indexers = categoricalCols.map { name =>
        new StringIndexer()
            .setInputCol(name)
            .setOutputCol(name + "_IND")
            .setHandleInvalid("keep")
    }

    val indexed = new Pipeline().setStages(indexers).fit(df).transform(df)

    // withColumn on an existing name replaces the column in place.
    val casted = doubleCols.foldLeft(indexed) { (acc, name) =>
        acc.withColumn(name, col(name).cast(DoubleType))
    }

    val featureCols = casted.columns.filterNot(nonFeatureCols.contains)

    val assembler = new VectorAssembler()
        .setInputCols(featureCols)
        .setOutputCol("features")

    // Fill nulls with 0 before assembling.
    assembler.transform(casted.na.fill(0))
}

In [None]:
// Feature-engineered training data before any class re-balancing.
val train_faetures_non_balance = create_features(train)

// Share of rows with TARGET == 1; despite its name this is a fraction, not a count.
val sample_count = {
    val positiveRows = train_faetures_non_balance.filter(col("TARGET") === 1).count()
    val allRows = train_faetures_non_balance.count()
    positiveRows.toDouble / allRows.toDouble
}

// Keep every TARGET == 1 row and down-sample the TARGET == 0 rows by the fraction above.
val train_faetures = {
    val positives = train_faetures_non_balance.filter(col("TARGET") === 1)
    val negatives = train_faetures_non_balance.filter(col("TARGET") === 0)
    positives.union(negatives.sample(sample_count))
}

// Features for the test set, built by the same routine.
val test_faetures = create_features(test)

In [None]:
// Random forest reading the assembled "features" vector and the "TARGET" label:
// 50 trees, maximum depth 30, up to 64 bins.
val rf: RandomForestClassifier = new RandomForestClassifier()
    .setFeaturesCol("features")
    .setLabelCol("TARGET")
    .setMaxBins(64)
    .setMaxDepth(30)
    .setNumTrees(50)

// Train on the re-balanced training features.
val rfModel: RandomForestClassificationModel = rf.fit(train_faetures)

In [None]:
// Score the test features with the trained forest.
val predictions = rfModel.transform(test_faetures)

// Remove a previous result file at `result_path`, if any (recursive delete).
if (fs.exists(result_path)) {
    fs.delete(result_path, true)
}

// Write id/target pairs as a single tab-separated part file with a header row.
// The cleanup above targets `result_path`, not the location written here, so the save mode
// is set to overwrite explicitly; with the default mode a re-run fails once the output exists.
predictions.selectExpr("ID as id", "prediction as target")
    .coalesce(1)
    .write.format("csv")
    .mode("overwrite")
    .option("delimiter", "\t")
    .option("header", "true")
    .save("from_hdfs_lab05.csv")

In [None]:
// Shut down the Spark session; parentheses mark the call as side-effecting.
spark.stop()