In [1]:
import org.apache.spark.sql.functions.{col, when, count, lit, udf}
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{FloatType, StringType, IntegerType}
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.{LogisticRegression, GBTClassifier}
import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector}
import sys.process._

In [2]:
import org.apache.spark.sql.SparkSession
// Create the notebook's SparkSession (or reuse one that already exists),
// running in local mode with a single worker thread.
val spark: SparkSession =
  SparkSession.builder
    .appName("Lab05_by_sand")
    .master("local[1]")
    .getOrCreate()

spark = org.apache.spark.sql.SparkSession@14062151


org.apache.spark.sql.SparkSession@14062151

In [3]:
// Locations of the two CSV inputs read by the cells below.
val trainFileName: String = "/labs/slaba05/lab05_train.csv"
val testFileName: String = "/labs/slaba05/lab05_test.csv"

trainFileName = /labs/slaba05/lab05_train.csv
testFileName = /labs/slaba05/lab05_test.csv


/labs/slaba05/lab05_test.csv

In [4]:
// Load the training CSV, taking column names from its header line.
// Declared as `var` because later cells reassign it while cleaning.
var rawTrain: DataFrame = spark.read
  .format("csv")
  .option("header", true)
  .load(trainFileName)
rawTrain.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |--

rawTrain = [_c0: string, ID: string ... 115 more fields]


[_c0: string, ID: string ... 115 more fields]

In [5]:
// Distribution of the TARGET column in the raw training data (null values included).
rawTrain
  .groupBy(col("TARGET"))
  .count()
  .show()

+------+------+
|TARGET| count|
+------+------+
|     0|294607|
|  null|     1|
|     1| 26156|
+------+------+



In [6]:
// Load the test CSV, taking column names from its header line.
// Declared as `var` because later cells reassign it while cleaning.
var rawTest: DataFrame = spark.read
  .format("csv")
  .option("header", true)
  .load(testFileName)
rawTest.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |--

rawTest = [_c0: string, ID: string ... 114 more fields]


[_c0: string, ID: string ... 114 more fields]

In [7]:
// Number of rows in the test set.
rawTest.count()

44399

### Clean DataFrames

In [8]:
// Remove the `_c0` column from the training frame and show the resulting schema.
rawTrain = rawTrain.drop("_c0")
rawTrain.printSchema()

root
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |-- APP_MARITAL_STATUS: string (nullab

rawTrain = [ID: string, CR_PROD_CNT_IL: string ... 114 more fields]


[ID: string, CR_PROD_CNT_IL: string ... 114 more fields]

In [9]:
// Remove the `_c0` column from the test frame and show the resulting schema.
rawTest = rawTest.drop("_c0")
rawTest.printSchema()

root
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |-- APP_MARITAL_STATUS: string (nullab

rawTest = [ID: string, CR_PROD_CNT_IL: string ... 113 more fields]


[ID: string, CR_PROD_CNT_IL: string ... 113 more fields]

In [10]:
// Keep only training rows that have a label, then re-check the class balance.
rawTrain = rawTrain.where(col("TARGET").isNotNull)
rawTrain
  .groupBy(col("TARGET"))
  .count()
  .show()

+------+------+
|TARGET| count|
+------+------+
|     0|294607|
|     1| 26156|
+------+------+



rawTrain = [ID: string, CR_PROD_CNT_IL: string ... 114 more fields]


[ID: string, CR_PROD_CNT_IL: string ... 114 more fields]

In [11]:
// Applies the null-TARGET filter again (a no-op on an already filtered frame)
// and counts the labelled training rows.
rawTrain = rawTrain.filter(col("TARGET").isNotNull)
rawTrain.count()

rawTrain = [ID: string, CR_PROD_CNT_IL: string ... 114 more fields]


320763

In [12]:
// Give the test frame an all-null TARGET column so it carries a label column
// like the training frame; the count verifies that no test row has a label.
rawTest = rawTest.withColumn("TARGET", lit(null))
rawTest
  .where(col("TARGET").isNotNull)
  .count()

rawTest = [ID: string, CR_PROD_CNT_IL: string ... 114 more fields]


0

In [13]:
// Stack the labelled training rows on top of the unlabelled test rows so that
// feature encoding is fitted over both sets at once.
// `union` replaces the deprecated `unionAll`; both resolve columns by POSITION,
// not by name, so the two frames must list their columns in the same order.
// NOTE(review): if the column order of the two inputs can ever differ,
// `unionByName` would be the safer choice — confirm the column names match first.
val raw = rawTrain.union(rawTest)
raw.printSchema()

root
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |-- APP_MARITAL_STATUS: string (nullab

raw = [ID: string, CR_PROD_CNT_IL: string ... 114 more fields]




[ID: string, CR_PROD_CNT_IL: string ... 114 more fields]

In [14]:
// TARGET distribution of the combined frame; the null group corresponds to
// the rows that came from the test set.
raw
  .groupBy(col("TARGET"))
  .count()
  .show()

+------+------+
|TARGET| count|
+------+------+
|     0|294607|
|  null| 44399|
|     1| 26156|
+------+------+



### Select features

In [15]:
// Small helpers that read a column by name and cast it to the wanted type.
def intCol(name: String): Column = col(name).cast(IntegerType)
def floatCol(name: String): Column = col(name).cast(FloatType)
def stringCol(name: String): Column = col(name).cast(StringType)

// Project the combined frame onto the columns used for modelling, converting
// the all-string CSV columns to typed ones.
// The trailing numbers are carried over from the original notebook; they look
// like scores noted while features were being added, but their exact meaning
// is not recorded here.
var data = raw.select(
  intCol("ID"),
  intCol("CR_PROD_CNT_IL"),
  floatCol("TURNOVER_DYNAMIC_IL_1M"),
  floatCol("REST_DYNAMIC_FDEP_1M"),
  stringCol("PACK"),
  floatCol("REST_DYNAMIC_SAVE_3M"), // 0.5705257919277931
  floatCol("CR_PROD_CNT_VCU"),      // 0.5705257919277931
  floatCol("REST_AVG_CUR"),         // 0.6372168672441041
  floatCol("CR_PROD_CNT_TOVR"),     // 0.6348565533123227
  floatCol("CR_PROD_CNT_PIL"),      // 0.634683576729558
  floatCol("TURNOVER_CC"),          // 0.6346491289790777
  floatCol("TURNOVER_PAYM"),        // 0.6348922970177697
  floatCol("AGE"),                  // 0.6334057046431302
  floatCol("CR_PROD_CNT_CC"),       // 6339097274532639 (as written in the original; a leading "0." may be missing)
  floatCol("REST_DYNAMIC_FDEP_3M"), // 0.6341628607521619
  floatCol("REST_DYNAMIC_IL_1M"),   // 0.6342077051718181
  floatCol("CR_PROD_CNT_CCFP"),     // 0.6341087352213053
  floatCol("REST_DYNAMIC_CUR_1M"),  // 0.6341481290271073
  floatCol("REST_AVG_PAYM"),
  intCol("TARGET")
)
data.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- CR_PROD_CNT_IL: integer (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = true)
 |-- PACK: string (nullable = true)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = true)
 |-- CR_PROD_CNT_VCU: float (nullable = true)
 |-- REST_AVG_CUR: float (nullable = true)
 |-- CR_PROD_CNT_TOVR: float (nullable = true)
 |-- CR_PROD_CNT_PIL: float (nullable = true)
 |-- TURNOVER_CC: float (nullable = true)
 |-- TURNOVER_PAYM: float (nullable = true)
 |-- AGE: float (nullable = true)
 |-- CR_PROD_CNT_CC: float (nullable = true)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = true)
 |-- REST_DYNAMIC_IL_1M: float (nullable = true)
 |-- CR_PROD_CNT_CCFP: float (nullable = true)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = true)
 |-- REST_AVG_PAYM: float (nullable = true)
 |-- TARGET: integer (nullable = true)



data = [ID: int, CR_PROD_CNT_IL: int ... 18 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 18 more fields]

### Encode features

In [16]:
// Step 1: map each distinct PACK string to a numeric index (PackIndex).
// The indexer is fitted on the full frame, i.e. train and test rows together.
val packIndexer = new StringIndexer()
  .setInputCol("PACK")
  .setOutputCol("PackIndex")
val stringIndexer = packIndexer.fit(data)
val indexed = stringIndexer.transform(data)

// Step 2: one-hot encode the index into a vector column (PackVector).
val packEncoder = new OneHotEncoderEstimator()
  .setInputCols(Array("PackIndex"))
  .setOutputCols(Array("PackVector"))
val encoder = packEncoder.fit(indexed)
val encodedFinal = encoder.transform(indexed)

stringIndexer = strIdx_c666e64d823e
indexed = [ID: int, CR_PROD_CNT_IL: int ... 19 more fields]
encoder = oneHotEncoder_c04986ea2b97
encodedFinal = [ID: int, CR_PROD_CNT_IL: int ... 20 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 20 more fields]

In [17]:
// Input columns of the feature vector, in the order they are assembled.
// (The identifier's spelling is kept as in the original notebook.)
val fetaure_names: Array[String] = Array(
  "CR_PROD_CNT_IL",
  "TURNOVER_DYNAMIC_IL_1M",
  "PackVector",
  "REST_DYNAMIC_FDEP_1M",
  "REST_DYNAMIC_SAVE_3M",
  "CR_PROD_CNT_VCU",
  "REST_AVG_CUR",
  "CR_PROD_CNT_TOVR",
  "CR_PROD_CNT_PIL",
  "TURNOVER_CC",
  "TURNOVER_PAYM",
  "AGE",
  "CR_PROD_CNT_CC",
  "REST_DYNAMIC_FDEP_3M",
  "REST_DYNAMIC_IL_1M",
  "CR_PROD_CNT_CCFP",
  "REST_DYNAMIC_CUR_1M",
  "REST_AVG_PAYM"
)

// Assembler that packs the columns above into a single "features" vector column.
val assembler = new VectorAssembler()
  .setInputCols(fetaure_names)
  .setOutputCol("features")

fetaure_names = Array(CR_PROD_CNT_IL, TURNOVER_DYNAMIC_IL_1M, PackVector, REST_DYNAMIC_FDEP_1M, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CR_PROD_CNT_TOVR, CR_PROD_CNT_PIL, TURNOVER_CC, TURNOVER_PAYM, AGE, CR_PROD_CNT_CC, REST_DYNAMIC_FDEP_3M, REST_DYNAMIC_IL_1M, CR_PROD_CNT_CCFP, REST_DYNAMIC_CUR_1M, REST_AVG_PAYM)
assembler = vecAssembler_bd67d433ba9f


vecAssembler_bd67d433ba9f

In [18]:
// Rows without a label are the test rows to be scored.
val testData = encodedFinal.where(col("TARGET").isNull)

// Labelled rows are split 75% / 25% into training and validation parts,
// with a fixed seed.
val splits = encodedFinal
  .where(col("TARGET").isNotNull)
  .randomSplit(Array(0.75, 0.25), seed = 42)
val Array(trainData, validData) = splits

testData = [ID: int, CR_PROD_CNT_IL: int ... 20 more fields]
splits = Array([ID: int, CR_PROD_CNT_IL: int ... 20 more fields], [ID: int, CR_PROD_CNT_IL: int ... 20 more fields])
trainData = [ID: int, CR_PROD_CNT_IL: int ... 20 more fields]
validData = [ID: int, CR_PROD_CNT_IL: int ... 20 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 20 more fields]

In [19]:
// Add the assembled "features" vector column to each of the three data sets.
val Seq(trainSet, validSet, testSet) =
  Seq(trainData, validData, testData).map(frame => assembler.transform(frame))

trainSet = [ID: int, CR_PROD_CNT_IL: int ... 21 more fields]
validSet = [ID: int, CR_PROD_CNT_IL: int ... 21 more fields]
testSet = [ID: int, CR_PROD_CNT_IL: int ... 21 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 21 more fields]

In [20]:
// Gradient-boosted tree classifier trained on the assembled features,
// limited to 10 boosting iterations.
val gbt = new GBTClassifier()
  .setLabelCol("TARGET")
  .setFeaturesCol("features")
  .setMaxIter(10)
val gbtModel = gbt.fit(trainSet)

// Score the held-out validation split.
val predictions = gbtModel.transform(validSet)

// Area under the ROC curve, computed from the model's "probability" column.
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("TARGET")
  .setRawPredictionCol("probability")
  .setMetricName("areaUnderROC")
val ROC = evaluator.evaluate(predictions)

// The metric is computed on validSet, so the message says "validation"
// (the original text said "test data").
println(s"ROC on validation data = $ROC")

ROC on test data = 0.7784975700534695


gbt = gbtc_52ddc2f19bd3
gbtModel = GBTClassificationModel (uid=gbtc_52ddc2f19bd3) with 10 trees
predictions = [ID: int, CR_PROD_CNT_IL: int ... 24 more fields]
evaluator = binEval_50ab676415c3
ROC = 0.7784975700534695


0.7784975700534695

In [21]:
// Score the unlabelled test rows with the fitted GBT model.
val result = gbtModel.transform(testSet)

result = [ID: int, CR_PROD_CNT_IL: int ... 24 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 24 more fields]

In [22]:
// Count test rows per predicted class. show(1) prints only the first group,
// so any other predicted classes are not displayed.
result.groupBy("prediction").count().show(1)

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|44393|
+----------+-----+
only showing top 1 row



In [23]:
// Extracts element 1 of the probability vector.
// NOTE(review): presumably the probability of class 1 — confirm against the model's label order.
val probValue = udf((vec: Vector) => vec(1))

// Write a two-column (id, target) tab-separated file set with a header line.
// The output directory is concatenated with `hdfs dfs -getmerge` in a later cell;
// with header=true each non-empty part file carries its own header, so several
// part files could put repeated header lines into the merged file.
// Coalescing to one partition yields a single part file and hence a single header.
result
  .select(
    col("ID").as("id"),
    probValue(col("probability")).as("target")
  )
  .coalesce(1)
  .write
  .option("header", true)
  .option("delimiter", "\t")
  .mode("overwrite")
  .csv("lab05.csv")

probValue = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

In [24]:
// List the HDFS working directory; the expression evaluates to the command's exit code.
"hdfs dfs -ls".!

Found 10 items
drwx------   - andrey.sorokin andrey.sorokin          0 2022-10-29 03:00 .Trash
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-30 16:30 .sparkStaging
-rw-r--r--   3 andrey.sorokin andrey.sorokin        185 2022-10-05 21:33 lab01.json
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-29 12:51 lab03.csv
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-30 17:08 lab05.csv
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-29 12:36 mydata.csv
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-30 11:46 predictions_test.parquet
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-30 11:18 preprocessed_train_data.parquet
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-29 22:03 streaming
drwxr-xr-x   - andrey.sorokin andrey.sorokin          0 2022-10-28 20:51 train_data.parquet




0

In [25]:
// Concatenate the part files under lab05.csv on HDFS into one local file;
// the expression evaluates to the command's exit code.
"hdfs dfs -getmerge lab05.csv /data/home/andrey.sorokin/lab05.csv".!



0

In [26]:
// List the local working directory; the expression evaluates to the command's exit code.
"ls -l".!

total 120
-rw-rw-r-- 1 andrey.sorokin andrey.sorokin  9018 Oct 30 16:26 lab01.ipynb
-rw-rw-r-- 1 andrey.sorokin andrey.sorokin 15160 Oct 30 16:30 lab02.ipynb
-rw-rw-r-- 1 andrey.sorokin andrey.sorokin 36226 Oct 30 17:06 lab04.ipynb
-rw-rw-r-- 1 andrey.sorokin andrey.sorokin  9665 Oct 30 17:06 lab05.ipynb
-rw-rw-r-- 1 andrey.sorokin andrey.sorokin 44897 Oct 30 16:26 lab3.ipynb




0

In [27]:
// Shut down the Spark session now that the notebook is finished.
spark.stop()