In [1]:
import scala.util.Try
import scala.math.max
import org.apache.spark._
import org.apache.log4j._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.functions.expr
import org.apache.spark.sql.functions._
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.sql.types._
import java.net.URLDecoder

In [2]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}

In [3]:
val spark = SparkSession.builder().appName("test_lab05").config("spark.master", "yarn").getOrCreate()

spark = org.apache.spark.sql.SparkSession@32d64685


org.apache.spark.sql.SparkSession@32d64685

# 1. Импортируем данные

In [4]:
val train = spark.read
                    .option("delimiter", ",")
                    .option("header", "True")
                    .option("encoding","utf-8")
                    .csv("/labs/slaba05/lab05_train.csv")
val test = spark.read
                    .option("delimiter", ",")
                    .option("header", "True")
                    .option("encoding","utf-8")
                    .csv("/labs/slaba05/lab05_test.csv")

train = [_c0: string, ID: string ... 115 more fields]
test = [_c0: string, ID: string ... 114 more fields]


[_c0: string, ID: string ... 114 more fields]

In [5]:
train.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- CR_PROD_CNT_IL: string (nullable = true)
 |-- AMOUNT_RUB_CLO_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_EMAIL_LINK: string (nullable = true)
 |-- APP_REGISTR_RGN_CODE: string (nullable = true)
 |-- PRC_ACCEPTS_A_POS: string (nullable = true)
 |-- PRC_ACCEPTS_A_TK: string (nullable = true)
 |-- TURNOVER_DYNAMIC_IL_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY1M: string (nullable = true)
 |-- AMOUNT_RUB_SUP_PRC: string (nullable = true)
 |-- PRC_ACCEPTS_A_AMOBILE: string (nullable = true)
 |-- SUM_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CLNT_TRUST_RELATION: string (nullable = true)
 |-- PRC_ACCEPTS_TK: string (nullable = true)
 |-- PRC_ACCEPTS_A_MTP: string (nullable = true)
 |-- REST_DYNAMIC_FDEP_1M: string (nullable = true)
 |-- CNT_TRAN_AUT_TENDENCY3M: string (nullable = true)
 |-- CNT_ACCEPTS_TK: string (nullable = true)
 |--

# 2. Избавляемся от ненужных столбцов

In [6]:
var df_train = train.drop("APP_CAR", "APP_COMP_TYPE", "APP_COMP_TYPE", "APP_DRIVING_LICENSE", "APP_EMP_TYPE", 
                      "APP_KIND_OF_PROP_HABITATION", "APP_MARITAL_STATUS", "APP_POSITION_TYPE", "APP_TRAVEL_PASS",
                      "APP_REGISTR_RGN_CODE", "APP_TRAVEL_PASS", 
                      "CLNT_JOB_POSITION_TYPE", "PACK", "_c0",
                     "CLNT_TRUST_RELATION", "CLNT_JOB_POSITION")
var df_test = test.drop("APP_CAR", "APP_COMP_TYPE", "APP_COMP_TYPE", "APP_DRIVING_LICENSE", "APP_EMP_TYPE", 
                      "APP_KIND_OF_PROP_HABITATION", "APP_MARITAL_STATUS", "APP_POSITION_TYPE", "APP_TRAVEL_PASS",
                      "APP_REGISTR_RGN_CODE", "APP_TRAVEL_PASS", 
                      "CLNT_JOB_POSITION_TYPE", "PACK", "_c0",
                     "CLNT_TRUST_RELATION", "CLNT_JOB_POSITION")

df_train = [ID: string, CR_PROD_CNT_IL: string ... 101 more fields]
df_test = [ID: string, CR_PROD_CNT_IL: string ... 100 more fields]


[ID: string, CR_PROD_CNT_IL: string ... 100 more fields]

# 3. Записываем все признаки в отдельный список

In [7]:
var feature_list = Array[String]()
for (t <- df_train.columns if (t!="ID"))
{ df_train = df_train.withColumn(t, col(t).cast(FloatType))
        feature_list +:= t}
val b = feature_list.filter(! _.contains("TARGET"))

feature_list = Array(TARGET, LDEAL_ACT_DAYS_PCT_CURR, LDEAL_ACT_DAYS_PCT_TR4, LDEAL_ACT_DAYS_PCT_TR, TURNOVER_DYNAMIC_CC_3M, LDEAL_DELINQ_PER_MAXYWZ, LDEAL_ACT_DAYS_PCT_AAVG, LDEAL_ACT_DAYS_PCT_TR3, MED_DEBT_PRC_YWZ, REST_DYNAMIC_CC_3M, LDEAL_ACT_DAYS_ACC_PCT_AVG, AVG_PCT_DEBT_TO_DEAL_AMT, TURNOVER_DYNAMIC_CC_1M, LDEAL_USED_AMT_AVG_YWZ, REST_DYNAMIC_CC_1M, LDEAL_USED_AMT_AVG_YQZ, TRANS_CNT_TENDENCY3M, MED_DEBT_PRC_YQZ, TRANS_AMOUNT_TENDENCY3M, CLNT_SALARY_VALUE, TURNOVER_DYNAMIC_PAYM_1M, LDEAL_DELINQ_PER_MAXYQZ, TURNOVER_DYNAMIC_PAYM_3M, DEAL_GRACE_DAYS_ACC_MAX, CLNT_SETUP_TENOR, LDEAL_YQZ_PC, MAX_PCLOSE_DATE, TURNOVER_DYNAMIC_CUR_3M, DEAL_GRACE_DAYS_ACC_AVG, CNT_TRAN_SUP_TENDENCY1M, REST_DYNAMIC_PAYM_1M, SUM_TRAN_ATM_TENDENCY1M, DEAL_YWZ_IR_MAX, SUM_TRAN_SUP_TENDENCY1M, ...


Array(TARGET, LDEAL_ACT_DAYS_PCT_CURR, LDEAL_ACT_DAYS_PCT_TR4, LDEAL_ACT_DAYS_PCT_TR, TURNOVER_DYNAMIC_CC_3M, LDEAL_DELINQ_PER_MAXYWZ, LDEAL_ACT_DAYS_PCT_AAVG, LDEAL_ACT_DAYS_PCT_TR3, MED_DEBT_PRC_YWZ, REST_DYNAMIC_CC_3M, LDEAL_ACT_DAYS_ACC_PCT_AVG, AVG_PCT_DEBT_TO_DEAL_AMT, TURNOVER_DYNAMIC_CC_1M, LDEAL_USED_AMT_AVG_YWZ, REST_DYNAMIC_CC_1M, LDEAL_USED_AMT_AVG_YQZ, TRANS_CNT_TENDENCY3M, MED_DEBT_PRC_YQZ, TRANS_AMOUNT_TENDENCY3M, CLNT_SALARY_VALUE, TURNOVER_DYNAMIC_PAYM_1M, LDEAL_DELINQ_PER_MAXYQZ, TURNOVER_DYNAMIC_PAYM_3M, DEAL_GRACE_DAYS_ACC_MAX, CLNT_SETUP_TENOR, LDEAL_YQZ_PC, MAX_PCLOSE_DATE, TURNOVER_DYNAMIC_CUR_3M, DEAL_GRACE_DAYS_ACC_AVG, CNT_TRAN_SUP_TENDENCY1M, REST_DYNAMIC_PAYM_1M, SUM_TRAN_ATM_TENDENCY1M, DEAL_YWZ_IR_MAX, SUM_TRAN_SUP_TENDENCY1M, ...

In [8]:
for (t <- df_test.columns if (t!="ID"))
{ df_test = df_test.withColumn(t, col(t).cast(FloatType))}

# 4. Избавляемся от NULL и записываем все признаки в VectorAssembler

In [9]:
df_test = df_test.na.fill(value=0.0)
df_train = df_train.na.fill(value=0.0)

df_test = [ID: string, CR_PROD_CNT_IL: float ... 100 more fields]
df_train = [ID: string, CR_PROD_CNT_IL: float ... 101 more fields]


[ID: string, CR_PROD_CNT_IL: float ... 101 more fields]

In [10]:
val assembler = new VectorAssembler()
                    .setInputCols(b)
                    .setOutputCol("features")

assembler = vecAssembler_e270360cb6b7


vecAssembler_e270360cb6b7

# 5. Приводим колонку "TARGET" к типу INT и репартицируем датафреймы

In [11]:
df_train = df_train.withColumn("TARGET", col("TARGET").cast(IntegerType))

df_train = [ID: string, CR_PROD_CNT_IL: float ... 101 more fields]


[ID: string, CR_PROD_CNT_IL: float ... 101 more fields]

In [12]:
val train_data=assembler.transform(df_train).repartition(30)
val test_data=assembler.transform(df_test).repartition(30)

train_data = [ID: string, CR_PROD_CNT_IL: float ... 102 more fields]
test_data = [ID: string, CR_PROD_CNT_IL: float ... 101 more fields]


[ID: string, CR_PROD_CNT_IL: float ... 101 more fields]

# 6. Создаем evaluator и модель GBT

In [13]:
val evaluator = new BinaryClassificationEvaluator()
                        .setLabelCol("TARGET")
                        .setRawPredictionCol("probability")
                        .setMetricName("areaUnderROC")

evaluator = binEval_3ca4edea84a0


binEval_3ca4edea84a0

In [14]:
val gbt = new GBTClassifier()
      .setFeaturesCol("features")
      .setLabelCol("TARGET")
      .setMaxDepth(4)
      .setMaxIter(50)

gbt = gbtc_90d1bbecce77


gbtc_90d1bbecce77

# 7. Обучаем модель и смотрим ROC_AUC на train

In [15]:
val model_gbt = gbt.fit(train_data)

model_gbt = GBTClassificationModel (uid=gbtc_90d1bbecce77) with 50 trees


GBTClassificationModel (uid=gbtc_90d1bbecce77) with 50 trees

In [16]:
val predictions_train = model_gbt.transform(train_data)

predictions_train = [ID: string, CR_PROD_CNT_IL: float ... 105 more fields]


[ID: string, CR_PROD_CNT_IL: float ... 105 more fields]

In [17]:
val ROC_AUC = evaluator.evaluate(predictions_train)

ROC_AUC = 0.8293589208008176


0.8293589208008176

# 8. Predict и сохранение результатов в файл

In [18]:
val predictions_test = model_gbt.transform(test_data)

predictions_test = [ID: string, CR_PROD_CNT_IL: float ... 104 more fields]


[ID: string, CR_PROD_CNT_IL: float ... 104 more fields]

In [19]:
val sparse_to_dense = udf((v: Vector) =>  v.toArray)

sparse_to_dense = UserDefinedFunction(<function1>,ArrayType(DoubleType,false),Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,ArrayType(DoubleType,false),Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

In [20]:
val to_file = predictions_test.select(col("ID").alias("id"), sparse_to_dense(col("probability")).getItem(1).alias("target")).coalesce(1)

to_file = [id: string, target: double]


[id: string, target: double]

In [21]:
to_file.show()

+------+--------------------+
|    id|              target|
+------+--------------------+
|338743|0.024703606216359697|
|396274|0.038319445947453956|
|317805|0.024703606216359697|
|398424|0.026231297985328528|
|373496| 0.29402280706138073|
|209206| 0.24721192381115542|
|319688| 0.06191977843636676|
|345326|0.026398391232047347|
|373023|0.026510344419407028|
|225004| 0.18651991730199002|
|192599| 0.10789677089742566|
|470522| 0.12419599444675544|
|537732|0.049830804275254104|
|444496|0.041459993736681855|
|244935| 0.02567018201043081|
|485741| 0.23463243831439395|
|551227| 0.08138436723647946|
|395048| 0.12493945140515272|
|255069|0.048483783548697645|
|274320|   0.127791919027616|
+------+--------------------+
only showing top 20 rows



In [22]:
to_file.coalesce(1)
      .write
      .option("header","true")
      .option("sep","\t")
      .mode("overwrite")
      .csv("lab05.csv")

In [27]:
import scala.sys.process._
"""hdfs dfs -get lab05""".!!

get: `lab05': No such file or directory


Name: java.lang.RuntimeException
Message: Nonzero exit value: 1
StackTrace:   at scala.sys.package$.error(package.scala:27)
  at scala.sys.process.ProcessBuilderImpl$AbstractBuilder.slurp(ProcessBuilderImpl.scala:132)
  at scala.sys.process.ProcessBuilderImpl$AbstractBuilder.$bang$bang(ProcessBuilderImpl.scala:102)

In [28]:
spark.stop()

lastException: Throwable = null
