In [1]:
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

In [2]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}

In [3]:
import sys.process._

In [4]:
// Create (or reuse) the SparkSession for this lab, targeting the YARN cluster manager.
val spark = SparkSession
  .builder()
  .master("yarn") // builder shorthand for config("spark.master", "yarn")
  .appName("lab05")
  .getOrCreate()

spark = org.apache.spark.sql.SparkSession@3b233c94


org.apache.spark.sql.SparkSession@3b233c94

## Чтение источников
- lab05_train.csv — тренировочная выборка с известными значениями оттока.
- lab05_test.csv — проверочная выборка, значения оттока для которой вам и надо предсказать.

In [5]:
// Both lab files are read with identical settings: header row, UTF-8, comma separator.
// No schema is supplied or inferred, so every column arrives as a string.
val csvReadOptions = Map("header" -> "True", "encoding" -> "utf-8", "sep" -> ",")
val train_csv = spark.read.options(csvReadOptions).csv("/labs/slaba05/lab05_train.csv")
val test_csv = spark.read.options(csvReadOptions).csv("/labs/slaba05/lab05_test.csv")

train_csv = [_c0: string, ID: string ... 115 more fields]
test_csv = [_c0: string, ID: string ... 114 more fields]


[_c0: string, ID: string ... 114 more fields]

In [6]:
// Print the column names and types of the raw training frame.
train_csv.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |--

In [7]:
// Print the column names and types of the raw test frame.
test_csv.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |--

Непонятная колонка `_c0` — удаляем её из обеих выборок.

In [8]:
// Remove the `_c0` column from both frames; it is dropped before the feature list
// is built, so it never becomes a feature.
val train: DataFrame = train_csv.drop("_c0")
val test: DataFrame = test_csv.drop("_c0")

train = [ID: string, CR_PROD_CNT_IL: string ... 114 more fields]
test = [ID: string, CR_PROD_CNT_IL: string ... 113 more fields]


[ID: string, CR_PROD_CNT_IL: string ... 113 more fields]

##  Фичи
- в качестве фичей берём все колонки (кроме ID и TARGET)
- все фичи надо преобразовать к числовому формату

In [9]:
// Feature names: every training column except the row identifier and the label.
val feature_col_names = train.columns.filterNot(name => name == "ID" || name == "TARGET")

// Every feature is cast from string to float.
// NOTE(review): values that cannot be parsed as numbers presumably become null under
// the cast and are then replaced by 0 via na.fill - confirm this is intended for the
// categorical-looking columns.
val castedFeatures = feature_col_names.map(name => col(name).cast(FloatType))

// Training DataFrame: ID, integer label, then the float features; nulls filled with 0.
val dftrain = train
  .select((col("ID") +: col("TARGET").cast(IntegerType) +: castedFeatures): _*)
  .na.fill(0.0)

// Test DataFrame: same layout as the training one, but without the label column.
val dftest = test
  .select((col("ID") +: castedFeatures): _*)
  .na.fill(0.0)

feature_col_names = Array(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, APP_TRAVEL_PASS, CNT...


Array(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, APP_TRAVEL_PASS, CNT...

##  Обучение

### векторизация

In [10]:
// Assembler that packs all feature columns into one vector column named "features".
val vectorator = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(feature_col_names)

vectorator = vecAssembler_7da005f88083


vecAssembler_7da005f88083

In [11]:
// Add the assembled "features" column to the training data and spread it over 50 partitions.
val train_v: DataFrame = vectorator
  .transform(dftrain)
  .repartition(50)

train_v = [ID: string, TARGET: int ... 115 more fields]


[ID: string, TARGET: int ... 115 more fields]

### обучение на RandomForest

In [12]:
// Random forest estimator: label in TARGET, inputs in the assembled "features" vector.
// Hyper-parameters: 10 trees, maximum depth 10, subsampling rate 0.1.
val randomforest = new RandomForestClassifier()
  .setLabelCol("TARGET")
  .setFeaturesCol("features")
  .setMaxDepth(10)
  .setNumTrees(10)
  .setSubsamplingRate(0.1)

randomforest = rfc_0072f2118fee


rfc_0072f2118fee

In [13]:
// Train the forest on the vectorized training data.
val model_fit: RandomForestClassificationModel = randomforest.fit(train_v)

model_fit = RandomForestClassificationModel (uid=rfc_0072f2118fee) with 10 trees


RandomForestClassificationModel (uid=rfc_0072f2118fee) with 10 trees

In [14]:
// Score the same training data the model was fitted on (adds the prediction columns).
val train_prediction: DataFrame = model_fit.transform(train_v)

train_prediction = [ID: string, TARGET: int ... 118 more fields]


[ID: string, TARGET: int ... 118 more fields]

### оценка прогноза по ROC (Gini)

In [15]:
// Area under the ROC curve, computed from the "probability" column against TARGET.
// train_prediction holds scores for the data the model was trained on, so this is an
// in-sample figure rather than a hold-out estimate.
val rocauc = {
  val aucEvaluator = new BinaryClassificationEvaluator()
    .setMetricName("areaUnderROC")
    .setRawPredictionCol("probability")
    .setLabelCol("TARGET")
  aucEvaluator.evaluate(train_prediction)
}

rocauc = 0.8014185543522931


0.8014185543522931

In [16]:
// Gini coefficient derived from the ROC AUC: Gini = 2 * AUC - 1.
val gini = 2 * rocauc - 1
println(s"Gini: $gini")

Gini: 0.6028371087045863


##  Прогноз

In [17]:
// UDF that turns an ML Vector into an array of doubles, so that individual
// components can be picked out with getItem.
val vectorToArray = udf { (vec: Vector) => vec.toArray }

vectorToArray = UserDefinedFunction(<function1>,ArrayType(DoubleType,false),Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,ArrayType(DoubleType,false),Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

In [18]:
// Vectorize the test data the same way as the training data, then score it.
val test_v: DataFrame = vectorator
  .transform(dftest)
  .repartition(50)
val prediction: DataFrame = model_fit.transform(test_v)

// Preview 5 rows: the id and element 1 of the probability vector.
// show arguments: 5 rows, truncate = 0 (no truncation), vertical = false.
prediction
  .select(col("id"), vectorToArray(col("probability")).getItem(1))
  .show(5, 0, false)

+------+-------------------+
|id    |UDF(probability)[1]|
+------+-------------------+
|242788|0.06736539439310088|
|149580|0.02010444844266099|
|559729|0.07312782425076089|
|236733|0.0507243472193956 |
|584901|0.12101039092218988|
+------+-------------------+
only showing top 5 rows



test_v = [ID: string, CR_PROD_CNT_IL: float ... 114 more fields]
prediction = [ID: string, CR_PROD_CNT_IL: float ... 117 more fields]


[ID: string, CR_PROD_CNT_IL: float ... 117 more fields]

In [19]:
// Write the submission: columns id and target (element 1 of the probability vector),
// tab-separated with a header. coalesce(1) yields a single part file, and "overwrite"
// replaces any previous contents of the "lab05" output directory.
prediction
  .select(
    col("ID").alias("id"),
    vectorToArray(col("probability")).getItem(1).alias("target")
  )
  .coalesce(1)
  .write
  .mode("overwrite")
  .format("csv")
  .option("delimiter", "\t")
  .option("header", "True")
  .option("encoding", "UTF-8")
  .save("lab05")

In [20]:
// Export the single CSV part file from the "lab05" output directory to a local lab05.csv.
// Commands are given as argument lists and run without a shell; `!!` returns the
// command's stdout and throws if the exit status is non-zero.

// Give the part file a stable name inside the output directory.
// NOTE(review): with no shell involved, the `*` glob is presumably expanded by
// `hadoop fs` itself - confirm.
Seq("hadoop", "fs", "-mv", "lab05/part-00000-*.csv", "lab05/lab05.csv").!!

// Remove a stale local copy. `-f` makes rm succeed when lab05.csv does not exist yet
// (e.g. on the first run); with plain `rm -r` the non-zero exit made `!!` throw and
// aborted the cell before the copy.
Seq("rm", "-rf", "lab05.csv").!!

// Copy the renamed file to the local filesystem (no destination argument is given).
Seq("hadoop", "fs", "-copyToLocal", "lab05/lab05.csv").!!

res35: String = ""


In [21]:
// Shut down the Spark session now that the notebook's work is finished.
spark.stop()