## Лаба 5: Прогнозирование оттока клиентов

In [1]:
import org.apache.spark.sql.functions.{col, when, count, lit, udf}
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{FloatType, StringType, IntegerType}
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.{LogisticRegression, GBTClassifier}
import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector}
import sys.process._

In [2]:
// Load the labelled training set. The first CSV row supplies the column names;
// no schema is given, so the columns are read as strings and cast later.
val df_train = spark.read
  .option("header", true)
  .csv("/labs/slaba05/lab05_train.csv")
// Preview the first two rows.
df_train.show(2)

Waiting for a Spark session to start...

+------+------+--------------+------------------+------------------------+--------------------+-----------------+----------------+----------------------+-----------------------+-----------------------+------------------+---------------------+-----------------------+-------------------+--------------+-----------------+--------------------+-----------------------+--------------+------------------+--------------------+---------------+----------------+-----------------------+---------------------------+----------------------+------------------+-----------------+-------------------+-------------------+-------------+-----------------------+-----------------------+-----------------+---------------+-------------------+---------------+---------------+----------------+-------+---------------+-----------------------+-----------------+-----------+-------------------+------------------+-------------+---+-----------------------+--------------+-----------------------+--------------------+------------

df_train = [_c0: string, ID: string ... 115 more fields]


[_c0: string, ID: string ... 115 more fields]

In [3]:
// Load the test set to be scored, read the same way as the training file
// (header row used for column names, no explicit schema).
val df_test = spark.read
  .option("header", true)
  .csv("/labs/slaba05/lab05_test.csv")
// Preview the first two rows.
df_test.show(2)

+------+------+--------------+------------------+------------------------+--------------------+-----------------+----------------+----------------------+-----------------------+-----------------------+------------------+---------------------+-----------------------+-------------------+--------------+-----------------+--------------------+-----------------------+--------------+------------------+--------------------+---------------+----------------+-----------------------+---------------------------+----------------------+------------------+-----------------+-------------------+-------------------+-------------+-----------------------+-----------------------+-----------------+---------------+-------------------+---------------+---------------+----------------+-------+---------------+-----------------------+-----------------+-----------+-------------------+------------------+-------------+---+-----------------------+--------------+-----------------------+--------------------+------------

df_test = [_c0: string, ID: string ... 114 more fields]


[_c0: string, ID: string ... 114 more fields]

## 1. Выбираем фичи

In [4]:
// Training subset used by the model: the selected columns are cast from the
// string values produced by the CSV reader to their working types.
// Declared as a `val` because it is never reassigned. The column lists are kept
// in the original order so the resulting schema is unchanged.
val df_train_short = {
  // Float features that precede PACK in the output schema.
  val floatsBeforePack = Seq(
    "TURNOVER_DYNAMIC_IL_1M", "REST_DYNAMIC_FDEP_1M", "REST_DYNAMIC_SAVE_3M",
    "CR_PROD_CNT_VCU", "REST_AVG_CUR", "CR_PROD_CNT_TOVR", "CR_PROD_CNT_PIL",
    "TURNOVER_CC", "TURNOVER_PAYM", "AGE", "CR_PROD_CNT_CC",
    "REST_DYNAMIC_FDEP_3M", "REST_DYNAMIC_IL_1M", "CR_PROD_CNT_CCFP",
    "REST_DYNAMIC_CUR_1M", "REST_AVG_PAYM", "LDEAL_GRACE_DAYS_PCT_MED",
    "REST_DYNAMIC_CUR_3M", "TURNOVER_DYNAMIC_CUR_1M", "REST_DYNAMIC_PAYM_3M",
    "REST_DYNAMIC_IL_3M", "TURNOVER_DYNAMIC_IL_3M", "REST_DYNAMIC_PAYM_1M",
    "TURNOVER_DYNAMIC_CUR_3M")
  // Float features that follow PACK in the output schema.
  val floatsAfterPack = Seq(
    "CLNT_SETUP_TENOR", "TURNOVER_DYNAMIC_PAYM_3M", "TURNOVER_DYNAMIC_PAYM_1M",
    "REST_DYNAMIC_CC_1M", "TURNOVER_DYNAMIC_CC_1M", "REST_DYNAMIC_CC_3M",
    "TURNOVER_DYNAMIC_CC_3M")

  val selected: Seq[Column] =
    Seq(col("ID").cast(IntegerType), col("CR_PROD_CNT_IL").cast(IntegerType)) ++
      floatsBeforePack.map(name => col(name).cast(FloatType)) ++
      Seq(col("PACK").cast(StringType)) ++
      floatsAfterPack.map(name => col(name).cast(FloatType)) ++
      Seq(col("TARGET").cast(IntegerType)) // label column, present only in the training file

  df_train.select(selected: _*)
}

df_train_short.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- CR_PROD_CNT_IL: integer (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = true)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = true)
 |-- CR_PROD_CNT_VCU: float (nullable = true)
 |-- REST_AVG_CUR: float (nullable = true)
 |-- CR_PROD_CNT_TOVR: float (nullable = true)
 |-- CR_PROD_CNT_PIL: float (nullable = true)
 |-- TURNOVER_CC: float (nullable = true)
 |-- TURNOVER_PAYM: float (nullable = true)
 |-- AGE: float (nullable = true)
 |-- CR_PROD_CNT_CC: float (nullable = true)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = true)
 |-- REST_DYNAMIC_IL_1M: float (nullable = true)
 |-- CR_PROD_CNT_CCFP: float (nullable = true)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = true)
 |-- REST_AVG_PAYM: float (nullable = true)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = true)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = true)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable = true)
 |-- REST_D

df_train_short = [ID: int, CR_PROD_CNT_IL: int ... 33 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 33 more fields]

In [5]:
// Test subset with the same columns and casts as the training subset, minus
// the TARGET label. Declared as a `val` because it is never reassigned. The
// column lists are kept in the original order so the resulting schema is unchanged.
val df_test_short = {
  // Float features that precede PACK in the output schema.
  val floatsBeforePack = Seq(
    "TURNOVER_DYNAMIC_IL_1M", "REST_DYNAMIC_FDEP_1M", "REST_DYNAMIC_SAVE_3M",
    "CR_PROD_CNT_VCU", "REST_AVG_CUR", "CR_PROD_CNT_TOVR", "CR_PROD_CNT_PIL",
    "TURNOVER_CC", "TURNOVER_PAYM", "AGE", "CR_PROD_CNT_CC",
    "REST_DYNAMIC_FDEP_3M", "REST_DYNAMIC_IL_1M", "CR_PROD_CNT_CCFP",
    "REST_DYNAMIC_CUR_1M", "REST_AVG_PAYM", "LDEAL_GRACE_DAYS_PCT_MED",
    "REST_DYNAMIC_CUR_3M", "TURNOVER_DYNAMIC_CUR_1M", "REST_DYNAMIC_PAYM_3M",
    "REST_DYNAMIC_IL_3M", "TURNOVER_DYNAMIC_IL_3M", "REST_DYNAMIC_PAYM_1M",
    "TURNOVER_DYNAMIC_CUR_3M")
  // Float features that follow PACK in the output schema.
  val floatsAfterPack = Seq(
    "CLNT_SETUP_TENOR", "TURNOVER_DYNAMIC_PAYM_3M", "TURNOVER_DYNAMIC_PAYM_1M",
    "REST_DYNAMIC_CC_1M", "TURNOVER_DYNAMIC_CC_1M", "REST_DYNAMIC_CC_3M",
    "TURNOVER_DYNAMIC_CC_3M")

  val selected: Seq[Column] =
    Seq(col("ID").cast(IntegerType), col("CR_PROD_CNT_IL").cast(IntegerType)) ++
      floatsBeforePack.map(name => col(name).cast(FloatType)) ++
      Seq(col("PACK").cast(StringType)) ++
      floatsAfterPack.map(name => col(name).cast(FloatType))

  df_test.select(selected: _*)
}
df_test_short.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- CR_PROD_CNT_IL: integer (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = true)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = true)
 |-- CR_PROD_CNT_VCU: float (nullable = true)
 |-- REST_AVG_CUR: float (nullable = true)
 |-- CR_PROD_CNT_TOVR: float (nullable = true)
 |-- CR_PROD_CNT_PIL: float (nullable = true)
 |-- TURNOVER_CC: float (nullable = true)
 |-- TURNOVER_PAYM: float (nullable = true)
 |-- AGE: float (nullable = true)
 |-- CR_PROD_CNT_CC: float (nullable = true)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = true)
 |-- REST_DYNAMIC_IL_1M: float (nullable = true)
 |-- CR_PROD_CNT_CCFP: float (nullable = true)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = true)
 |-- REST_AVG_PAYM: float (nullable = true)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = true)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = true)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable = true)
 |-- REST_D

df_test_short = [ID: int, CR_PROD_CNT_IL: int ... 32 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 32 more fields]

## 2. Заполняем оставшиеся null значением -1

In [6]:
// Impute missing numeric values with the sentinel -1.
// TARGET is deliberately left out of the fill: imputing the label would turn a
// missing label into the bogus class -1 and would make the later
// `col("TARGET").isNotNull` filter (applied before the train/validation split)
// a no-op. Rows with a missing label now stay null and are removed there.
val train_short = df_train_short.na.fill(-1L, df_train_short.columns.filterNot(_ == "TARGET"))
// Preview the first two imputed rows.
train_short.show(2)

+------+--------------+----------------------+--------------------+--------------------+---------------+------------+----------------+---------------+-----------+-------------+-----+--------------+--------------------+------------------+----------------+-------------------+-------------+------------------------+-------------------+-----------------------+--------------------+------------------+----------------------+--------------------+-----------------------+----+----------------+------------------------+------------------------+------------------+----------------------+------------------+----------------------+------+
|    ID|CR_PROD_CNT_IL|TURNOVER_DYNAMIC_IL_1M|REST_DYNAMIC_FDEP_1M|REST_DYNAMIC_SAVE_3M|CR_PROD_CNT_VCU|REST_AVG_CUR|CR_PROD_CNT_TOVR|CR_PROD_CNT_PIL|TURNOVER_CC|TURNOVER_PAYM|  AGE|CR_PROD_CNT_CC|REST_DYNAMIC_FDEP_3M|REST_DYNAMIC_IL_1M|CR_PROD_CNT_CCFP|REST_DYNAMIC_CUR_1M|REST_AVG_PAYM|LDEAL_GRACE_DAYS_PCT_MED|REST_DYNAMIC_CUR_3M|TURNOVER_DYNAMIC_CUR_1M|REST_DYNAMIC_P

train_short = [ID: int, CR_PROD_CNT_IL: int ... 33 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 33 more fields]

In [7]:
// Impute missing values in the test set with the sentinel -1
// (same numeric fill as applied to the training set).
val test_short = df_test_short.na.fill(-1L)
// Preview the first two imputed rows.
test_short.show(2)

+------+--------------+----------------------+--------------------+--------------------+---------------+------------+----------------+---------------+-----------+-------------+-----+--------------+--------------------+------------------+----------------+-------------------+-------------+------------------------+-------------------+-----------------------+--------------------+------------------+----------------------+--------------------+-----------------------+----+----------------+------------------------+------------------------+------------------+----------------------+------------------+----------------------+
|    ID|CR_PROD_CNT_IL|TURNOVER_DYNAMIC_IL_1M|REST_DYNAMIC_FDEP_1M|REST_DYNAMIC_SAVE_3M|CR_PROD_CNT_VCU|REST_AVG_CUR|CR_PROD_CNT_TOVR|CR_PROD_CNT_PIL|TURNOVER_CC|TURNOVER_PAYM|  AGE|CR_PROD_CNT_CC|REST_DYNAMIC_FDEP_3M|REST_DYNAMIC_IL_1M|CR_PROD_CNT_CCFP|REST_DYNAMIC_CUR_1M|REST_AVG_PAYM|LDEAL_GRACE_DAYS_PCT_MED|REST_DYNAMIC_CUR_3M|TURNOVER_DYNAMIC_CUR_1M|REST_DYNAMIC_PAYM_3M|

test_short = [ID: int, CR_PROD_CNT_IL: int ... 32 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 32 more fields]

## 3. Работаем со стринговой переменной PACK (StringIndexer)

In [14]:
// Inspect the schema of the imputed training set.
train_short.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- CR_PROD_CNT_IL: integer (nullable = false)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = false)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = false)
 |-- CR_PROD_CNT_VCU: float (nullable = false)
 |-- REST_AVG_CUR: float (nullable = false)
 |-- CR_PROD_CNT_TOVR: float (nullable = false)
 |-- CR_PROD_CNT_PIL: float (nullable = false)
 |-- TURNOVER_CC: float (nullable = false)
 |-- TURNOVER_PAYM: float (nullable = false)
 |-- AGE: float (nullable = false)
 |-- CR_PROD_CNT_CC: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = false)
 |-- REST_DYNAMIC_IL_1M: float (nullable = false)
 |-- CR_PROD_CNT_CCFP: float (nullable = false)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = false)
 |-- REST_AVG_PAYM: float (nullable = false)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = false)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = false)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable

In [15]:
// Drop training rows whose PACK is null before the column is indexed.
val train_short1 = train_short.where(col("PACK").isNotNull)
train_short1.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- CR_PROD_CNT_IL: integer (nullable = false)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = false)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = false)
 |-- CR_PROD_CNT_VCU: float (nullable = false)
 |-- REST_AVG_CUR: float (nullable = false)
 |-- CR_PROD_CNT_TOVR: float (nullable = false)
 |-- CR_PROD_CNT_PIL: float (nullable = false)
 |-- TURNOVER_CC: float (nullable = false)
 |-- TURNOVER_PAYM: float (nullable = false)
 |-- AGE: float (nullable = false)
 |-- CR_PROD_CNT_CC: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = false)
 |-- REST_DYNAMIC_IL_1M: float (nullable = false)
 |-- CR_PROD_CNT_CCFP: float (nullable = false)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = false)
 |-- REST_AVG_PAYM: float (nullable = false)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = false)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = false)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable

train_short1 = [ID: int, CR_PROD_CNT_IL: int ... 33 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 33 more fields]

In [16]:
// List the distinct PACK codes present in the filtered training set.
// The column is selected by name so it is resolved against train_short1 itself
// rather than through a column handle taken from a different DataFrame.
train_short1.select("PACK").distinct().collect()

Array([101], [107], [K01], [M01], [104], [102], [103], [108], [105], [O01], [109], [301])

In [17]:
// List the distinct PACK codes present in the test set.
test_short.select(col("PACK")).distinct().collect()

Array([101], [107], [K01], [M01], [104], [102], [103], [105], [O01], [109], [301])

In [18]:
import org.apache.spark.ml.feature.StringIndexer

// Encode the categorical PACK code as a numeric index in `pack_encoded`.
val qualification_indexer = new StringIndexer().setInputCol("PACK").setOutputCol("pack_encoded")

// Fit the label -> index mapping ONCE, on the training data, and reuse the fitted
// model for both datasets. Fitting a second indexer on the test set (as was done
// before) builds an independent mapping, so the same PACK code could receive a
// different index in train and test and the model would score test rows with a
// mismatched encoding.
// NOTE(review): with the default handleInvalid setting, a PACK value that is null
// or absent from the training data makes the transform fail — confirm the test
// set only contains PACK codes seen in training.
val pack_indexer_model = qualification_indexer.fit(train_short1)
val train_indexed = pack_indexer_model.transform(train_short1)
val test_indexed = pack_indexer_model.transform(test_short)

qualification_indexer = strIdx_2be8f5e7f083
train_indexed = [ID: int, CR_PROD_CNT_IL: int ... 34 more fields]
test_indexed = [ID: int, CR_PROD_CNT_IL: int ... 33 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 33 more fields]

In [19]:
// Print the schema of the indexed training set as a tree.
println(train_indexed.schema.treeString)

root
 |-- ID: integer (nullable = false)
 |-- CR_PROD_CNT_IL: integer (nullable = false)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = false)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = false)
 |-- CR_PROD_CNT_VCU: float (nullable = false)
 |-- REST_AVG_CUR: float (nullable = false)
 |-- CR_PROD_CNT_TOVR: float (nullable = false)
 |-- CR_PROD_CNT_PIL: float (nullable = false)
 |-- TURNOVER_CC: float (nullable = false)
 |-- TURNOVER_PAYM: float (nullable = false)
 |-- AGE: float (nullable = false)
 |-- CR_PROD_CNT_CC: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = false)
 |-- REST_DYNAMIC_IL_1M: float (nullable = false)
 |-- CR_PROD_CNT_CCFP: float (nullable = false)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = false)
 |-- REST_AVG_PAYM: float (nullable = false)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = false)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = false)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable

In [20]:
// Print the schema of the indexed test set as a tree.
println(test_indexed.schema.treeString)

root
 |-- ID: integer (nullable = false)
 |-- CR_PROD_CNT_IL: integer (nullable = false)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = false)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = false)
 |-- CR_PROD_CNT_VCU: float (nullable = false)
 |-- REST_AVG_CUR: float (nullable = false)
 |-- CR_PROD_CNT_TOVR: float (nullable = false)
 |-- CR_PROD_CNT_PIL: float (nullable = false)
 |-- TURNOVER_CC: float (nullable = false)
 |-- TURNOVER_PAYM: float (nullable = false)
 |-- AGE: float (nullable = false)
 |-- CR_PROD_CNT_CC: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = false)
 |-- REST_DYNAMIC_IL_1M: float (nullable = false)
 |-- CR_PROD_CNT_CCFP: float (nullable = false)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = false)
 |-- REST_AVG_PAYM: float (nullable = false)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = false)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = false)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable

In [21]:
// List the distinct PACK indices produced for the training set.
train_indexed.select("pack_encoded").distinct().collect()

Array([8.0], [0.0], [7.0], [1.0], [4.0], [11.0], [3.0], [2.0], [10.0], [6.0], [5.0], [9.0])

In [22]:
// List the distinct PACK indices produced for the test set.
test_indexed.select("pack_encoded").distinct().collect()

Array([8.0], [0.0], [7.0], [1.0], [4.0], [3.0], [2.0], [10.0], [6.0], [5.0], [9.0])

## 4. Собираем признаки (VectorAssembler), разбиваем на train и validation, оцениваем модель

In [23]:
// Names of the columns that are assembled into the model's feature vector.
// The order is significant: it fixes the position of each feature in the vector.
val cols: Array[String] = Array(
  "CR_PROD_CNT_IL", "TURNOVER_DYNAMIC_IL_1M", "REST_DYNAMIC_FDEP_1M",
  "REST_DYNAMIC_SAVE_3M", "CR_PROD_CNT_VCU", "REST_AVG_CUR",
  "CR_PROD_CNT_TOVR", "CR_PROD_CNT_PIL", "TURNOVER_CC",
  "TURNOVER_PAYM", "AGE", "CR_PROD_CNT_CC",
  "REST_DYNAMIC_FDEP_3M", "REST_DYNAMIC_IL_1M", "CR_PROD_CNT_CCFP",
  "REST_DYNAMIC_CUR_1M", "REST_AVG_PAYM", "LDEAL_GRACE_DAYS_PCT_MED",
  "REST_DYNAMIC_CUR_3M", "TURNOVER_DYNAMIC_CUR_1M", "REST_DYNAMIC_PAYM_3M",
  "REST_DYNAMIC_IL_3M", "TURNOVER_DYNAMIC_IL_3M", "REST_DYNAMIC_PAYM_1M",
  "TURNOVER_DYNAMIC_CUR_3M", "CLNT_SETUP_TENOR", "TURNOVER_DYNAMIC_PAYM_3M",
  "TURNOVER_DYNAMIC_PAYM_1M", "REST_DYNAMIC_CC_1M", "TURNOVER_DYNAMIC_CC_1M",
  "REST_DYNAMIC_CC_3M", "TURNOVER_DYNAMIC_CC_3M",
  // Indexed form of the PACK category (output column of the StringIndexer).
  "pack_encoded"
)
cols

cols = Array(CR_PROD_CNT_IL, TURNOVER_DYNAMIC_IL_1M, REST_DYNAMIC_FDEP_1M, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CR_PROD_CNT_TOVR, CR_PROD_CNT_PIL, TURNOVER_CC, TURNOVER_PAYM, AGE, CR_PROD_CNT_CC, REST_DYNAMIC_FDEP_3M, REST_DYNAMIC_IL_1M, CR_PROD_CNT_CCFP, REST_DYNAMIC_CUR_1M, REST_AVG_PAYM, LDEAL_GRACE_DAYS_PCT_MED, REST_DYNAMIC_CUR_3M, TURNOVER_DYNAMIC_CUR_1M, REST_DYNAMIC_PAYM_3M, REST_DYNAMIC_IL_3M, TURNOVER_DYNAMIC_IL_3M, REST_DYNAMIC_PAYM_1M, TURNOVER_DYNAMIC_CUR_3M, CLNT_SETUP_TENOR, TURNOVER_DYNAMIC_PAYM_3M, TURNOVER_DYNAMIC_PAYM_1M, REST_DYNAMIC_CC_1M, TURNOVER_DYNAMIC_CC_1M, REST_DYNAMIC_CC_3M, TURNOVER_DYNAMIC_CC_3M, pack_encoded)


Array(CR_PROD_CNT_IL, TURNOVER_DYNAMIC_IL_1M, REST_DYNAMIC_FDEP_1M, REST_DYNAMIC_SAVE_3M, CR_P...

In [24]:
// Combine the columns listed in `cols` into a single vector column "features",
// applied identically to the training and test sets.
val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(cols)
val train_indexed1 = assembler.transform(train_indexed)
val test_indexed1 = assembler.transform(test_indexed)

assembler = vecAssembler_097dab818fc2
train_indexed1 = [ID: int, CR_PROD_CNT_IL: int ... 35 more fields]
test_indexed1 = [ID: int, CR_PROD_CNT_IL: int ... 34 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 34 more fields]

In [25]:
// Inspect the training schema after the feature vector has been added.
train_indexed1.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- CR_PROD_CNT_IL: integer (nullable = false)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = false)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = false)
 |-- CR_PROD_CNT_VCU: float (nullable = false)
 |-- REST_AVG_CUR: float (nullable = false)
 |-- CR_PROD_CNT_TOVR: float (nullable = false)
 |-- CR_PROD_CNT_PIL: float (nullable = false)
 |-- TURNOVER_CC: float (nullable = false)
 |-- TURNOVER_PAYM: float (nullable = false)
 |-- AGE: float (nullable = false)
 |-- CR_PROD_CNT_CC: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = false)
 |-- REST_DYNAMIC_IL_1M: float (nullable = false)
 |-- CR_PROD_CNT_CCFP: float (nullable = false)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = false)
 |-- REST_AVG_PAYM: float (nullable = false)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = false)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = false)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable

In [26]:
// Inspect the test schema after the feature vector has been added.
test_indexed1.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- CR_PROD_CNT_IL: integer (nullable = false)
 |-- TURNOVER_DYNAMIC_IL_1M: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_1M: float (nullable = false)
 |-- REST_DYNAMIC_SAVE_3M: float (nullable = false)
 |-- CR_PROD_CNT_VCU: float (nullable = false)
 |-- REST_AVG_CUR: float (nullable = false)
 |-- CR_PROD_CNT_TOVR: float (nullable = false)
 |-- CR_PROD_CNT_PIL: float (nullable = false)
 |-- TURNOVER_CC: float (nullable = false)
 |-- TURNOVER_PAYM: float (nullable = false)
 |-- AGE: float (nullable = false)
 |-- CR_PROD_CNT_CC: float (nullable = false)
 |-- REST_DYNAMIC_FDEP_3M: float (nullable = false)
 |-- REST_DYNAMIC_IL_1M: float (nullable = false)
 |-- CR_PROD_CNT_CCFP: float (nullable = false)
 |-- REST_DYNAMIC_CUR_1M: float (nullable = false)
 |-- REST_AVG_PAYM: float (nullable = false)
 |-- LDEAL_GRACE_DAYS_PCT_MED: float (nullable = false)
 |-- REST_DYNAMIC_CUR_3M: float (nullable = false)
 |-- TURNOVER_DYNAMIC_CUR_1M: float (nullable

In [27]:
// Keep only labelled rows, then split them 80/20 into training and validation
// parts. The fixed seed makes the split reproducible.
val train_splitting = train_indexed1
  .where(col("TARGET").isNotNull)
  .randomSplit(Array(0.8, 0.2), seed = 5757)
// First element is the 0.8 part (training), second the 0.2 part (validation).
val Array(train, valid) = train_splitting

train_splitting = Array([ID: int, CR_PROD_CNT_IL: int ... 35 more fields], [ID: int, CR_PROD_CNT_IL: int ... 35 more fields])
train = [ID: int, CR_PROD_CNT_IL: int ... 35 more fields]
valid = [ID: int, CR_PROD_CNT_IL: int ... 35 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 35 more fields]

In [31]:
// Gradient-boosted trees classifier: reads the assembled "features" vector,
// learns the TARGET label, and is limited to 10 boosting iterations.
val gbt = new GBTClassifier()
  .setFeaturesCol("features")
  .setLabelCol("TARGET")
  .setMaxIter(10)

gbt = gbtc_642527d33a40


gbtc_642527d33a40

In [32]:
// Train the classifier on the training part of the split.
val gbt_model: org.apache.spark.ml.classification.GBTClassificationModel =
  gbt.fit(train)

gbt_model = GBTClassificationModel (uid=gbtc_642527d33a40) with 10 trees


GBTClassificationModel (uid=gbtc_642527d33a40) with 10 trees

In [33]:
// Score the held-out validation part with the trained model.
val predictions_valid = gbt_model.transform(valid)

// Evaluate ranking quality as the area under the ROC curve against TARGET.
val evaluator = new BinaryClassificationEvaluator()
  .setMetricName("areaUnderROC")
  .setLabelCol("TARGET")

val score = evaluator.evaluate(predictions_valid)

println(s"areaUnderROC: $score")

areaUnderROC: 0.8043041688054996


predictions_valid = [ID: int, CR_PROD_CNT_IL: int ... 38 more fields]
evaluator = binEval_c0c789c5bfd4
score = 0.8043041688054996


0.8043041688054996

In [42]:
// Apply the trained model to the assembled test set to obtain predictions.
val test_predictions: DataFrame = gbt_model.transform(test_indexed1)

test_predictions = [ID: int, CR_PROD_CNT_IL: int ... 37 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 37 more fields]

In [43]:
// Extracts element 1 of the model's `probability` vector, i.e. the predicted
// probability of class 1.
val probValue = udf((vec: Vector) => vec(1))

// Write the submission as a tab-separated CSV with columns `id` and `target`.
// The data is coalesced to a single partition first: every part file gets its own
// header line, so merging a multi-part output with `hdfs dfs -getmerge` would
// scatter repeated header rows through the merged file.
test_predictions
  .select(col("ID").as("id"), probValue(col("probability")).as("target"))
  .coalesce(1)
  .write
  .option("header", true)
  .option("delimiter", "\t")
  .mode("overwrite")
  .csv("lab05_scala.csv")

probValue = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

In [49]:
// Merge the part files of the HDFS output directory into one local file.
// `.!` runs the command and evaluates to its exit code; the explicit method call
// replaces the postfix-operator form, which needs the postfixOps language feature.
"hdfs dfs -getmerge lab05_scala.csv /data/home/yuriy.perevezentsev/lab05_scala.csv".!



0