### Spark ML ランダムフォレストによる分類予測

* bank-fullのy列について分類予測する
* 特徴量は簡易化のため、数値列と文字列の'default'列のみを使用する
* パイプラインを使用せずに実行する

In [37]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, when, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#### SparkSessionの作成

In [13]:
spark = SparkSession.builder.getOrCreate()

#### データの読み込み

In [14]:
data = spark.read.load('data/bank-full.csv',
                       format = 'csv',
                       sep = ';',
                       header = True,
                       inferSchema = True)

In [15]:
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [16]:
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

#### 目的変数の作成

In [17]:
data = data.withColumn('converted_y', when(col('y') == 'yes', lit(1.0)).otherwise(lit(0.0)))
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-----------+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|converted_y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-----------+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|        0.0|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|        0.0|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|        0.0|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|        0.0|

#### default列のインデックス化

In [38]:
default_index = StringIndexer(inputCol = 'default', outputCol = 'default_index')
data = default_index.fit(data).transform(data)

IllegalArgumentException: requirement failed: Output column default_index already exists.

In [20]:
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-----------+-------------+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|converted_y|default_index|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-----------+-------------+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|        0.0|          0.0|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|        0.0|          0.0|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|        0.0|          0.0|
| 47| blue-collar| married|  unknown|     no|   1506|    y

#### 特徴量のアッセンブル化

In [31]:
use_data = data.select(['age', 'balance', 'duration', 'campaign', 'previous', 'default_index', 'converted_y'])
features = ['age', 'balance', 'duration', 'campaign', 'previous', 'default_index']
assemble = VectorAssembler(inputCols = features, outputCol = 'features')
df = assemble.transform(use_data)

In [11]:
pipeline = Pipeline(stages = [default_index, assemble, scaler, lr])

#### 訓練データ、テストデータの作成

In [33]:
train_df, test_df = df.randomSplit([0.7, 0.3], seed = 1)

#### ランダムフォレストのインスタンス化

In [39]:
clf = RandomForestClassifier(featuresCol = 'features', labelCol = 'converted_y', impurity = 'gini')

#### 学習・モデルの確認

In [40]:
model = clf.fit(train_df)

In [42]:
list(zip(features, model.featureImportances))

[('age', 0.09087071851444256),
 ('balance', 0.024584708285500398),
 ('duration', 0.7533667648196009),
 ('campaign', 0.008547053468375754),
 ('previous', 0.12224716403624954),
 ('default_index', 0.0003835908758308951)]

#### 推論

In [43]:
# 訓練データの推論
pred_train = model.transform(train_df)
pred_train.show()

+---+-------+--------+--------+--------+-------------+-----------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|default_index|converted_y|            features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+-------------+-----------+--------------------+--------------------+--------------------+----------+
| 18|    108|     167|       1|       0|          0.0|        1.0|[18.0,108.0,167.0...|[18.9573529299031...|[0.94786764649515...|       0.0|
| 18|    608|     267|       1|       0|          0.0|        1.0|[18.0,608.0,267.0...|[18.2993006871985...|[0.91496503435992...|       0.0|
| 18|   1944|     122|       3|       0|          0.0|        0.0|[18.0,1944.0,122....|[18.9724096512982...|[0.94862048256491...|       0.0|
| 19|      0|      72|       4|       0|          0.0|        0.0|[19.0,0.0,72.0,4....|[19.1300805309482...|[0.95650402654741...|       0.0|
| 19|     56|

In [44]:
# テストデータの推論
pred_test = model.transform(test_df)
pred_test.show()

+---+-------+--------+--------+--------+-------------+-----------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|default_index|converted_y|            features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+-------------+-----------+--------------------+--------------------+--------------------+----------+
| 19|      0|     123|       3|       0|          0.0|        0.0|[19.0,0.0,123.0,3...|[19.0649286371135...|[0.95324643185567...|       0.0|
| 19|     27|      86|      12|       0|          0.0|        0.0|[19.0,27.0,86.0,1...|[19.1300805309482...|[0.95650402654741...|       0.0|
| 19|    779|     184|       4|       0|          0.0|        1.0|[19.0,779.0,184.0...|[19.0008443780016...|[0.95004221890008...|       0.0|
| 19|   1169|     463|      18|       0|          0.0|        0.0|[19.0,1169.0,463....|[17.7366943716801...|[0.88683471858400...|       0.0|
| 19|   1247|

In [45]:
# predictionは予測結果
# probabilityは0である確率と、1である確率をそれぞれ表す
pred_train.select(['probability', 'prediction']).show(truncate = False)

+----------------------------------------+----------+
|probability                             |prediction|
+----------------------------------------+----------+
|[0.9478676464951595,0.05213235350484048]|0.0       |
|[0.9149650343599278,0.08503496564007215]|0.0       |
|[0.9486204825649118,0.05137951743508819]|0.0       |
|[0.9565040265474114,0.04349597345258869]|0.0       |
|[0.9442569396556861,0.05574306034431386]|0.0       |
|[0.9414167783311266,0.0585832216688734] |0.0       |
|[0.9501858000127698,0.04981419998723034]|0.0       |
|[0.9501858000127698,0.04981419998723034]|0.0       |
|[0.9175607659837561,0.08243923401624388]|0.0       |
|[0.9175037697324697,0.08249623026753022]|0.0       |
|[0.9475900683889413,0.05240993161105863]|0.0       |
|[0.9518780772566464,0.04812192274335364]|0.0       |
|[0.9565040265474114,0.04349597345258869]|0.0       |
|[0.949540775884645,0.05045922411535492] |0.0       |
|[0.9485865098848943,0.05141349011510565]|0.0       |
|[0.9442569396556861,0.05574

#### 精度評価

In [46]:
# 訓練データの精度評価
evaluator = BinaryClassificationEvaluator(labelCol = 'converted_y')
auc = evaluator.evaluate(pred_train)
auc

0.8499073601518856

In [47]:
# テストデータの精度評価
evaluator = BinaryClassificationEvaluator(labelCol = 'converted_y')
auc = evaluator.evaluate(pred_test)
auc

0.8428355659500473