### Spark ML ロジスティック回帰による分類予測

* bank-fullのy列について分類予測する
* 特徴量は簡易化のため、数値列の一部（age, balance, duration, campaign, previous）と文字列の'default'列のみを使用する
* 特徴量ベクトルはStandardScalerでスケーリングする（デフォルト設定のため標準偏差で割るのみで、平均による中心化は行わない。インデックス化した'default'列も対象となる）
* 文字列はインデックス化（ラベルエンコーディング）を行う
* 評価はAUCにて行う

In [1]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, when, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#### SparkSessionの作成

In [2]:
# Obtain the SparkSession used by every later cell (getOrCreate reuses an
# already-running session instead of starting a second one).
spark = SparkSession.builder.getOrCreate()

#### データの読み込み

In [3]:
# Read the semicolon-delimited CSV; the first row supplies the column names
# and the column types are inferred from the data.
data = (
    spark.read
    .format('csv')
    .options(sep=';', header=True, inferSchema=True)
    .load('data/bank-full.csv')
)

In [4]:
# Preview the loaded rows to sanity-check the parsed columns.
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [5]:
# List the (column name, type) pairs produced by schema inference at load time.
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

#### 目的変数の作成

In [6]:
# Build the numeric label column: 1.0 where y is 'yes', 0.0 for everything else.
is_positive = col('y') == 'yes'
label_expr = when(is_positive, lit(1.0)).otherwise(lit(0.0))
data = data.withColumn('converted_y', label_expr)
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-----------+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|converted_y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-----------+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|        0.0|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|        0.0|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|        0.0|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|        0.0|

#### ①default列のインデックス化ステージ

In [7]:
# Stage 1: encode the string column 'default' as the numeric column
# 'default_index'.
default_index = StringIndexer(
    outputCol='default_index',
    inputCol='default',
)

#### ②特徴量のアッセンブル化ステージ

In [8]:
# Stage 2: pack the selected numeric columns plus the indexed 'default'
# column into a single vector column named 'features'.
numeric_cols = ['age', 'balance', 'duration', 'campaign', 'previous']
features = numeric_cols + ['default_index']
assemble = VectorAssembler(outputCol='features', inputCols=features)

#### ③特徴量の標準化ステージ

In [9]:
# Stage 3: scale the assembled vector. The library defaults are spelled out
# on purpose: every feature is divided by its standard deviation (withStd)
# but is NOT mean-centered (withMean).
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withMean=False,
    withStd=True,
)

#### ④ロジスティック回帰のインスタンス化ステージ

In [10]:
# Stage 4: logistic regression that reads the scaled feature vector and is
# trained against the 0.0/1.0 label column 'converted_y'.
clf = LogisticRegression(
    labelCol='converted_y',
    featuresCol='scaled_features',
)

#### パイプラインの登録（①～④）

In [11]:
# Chain the four stages in execution order:
# index 'default' -> assemble vector -> scale -> logistic regression.
pipeline_stages = [default_index, assemble, scaler, clf]
pipeline = Pipeline(stages=pipeline_stages)

#### 訓練データ、テストデータの作成

In [12]:
# Keep only the model inputs and the label, then split the rows into
# train/test sets with 0.7/0.3 weights and a fixed seed.
model_cols = ['age', 'balance', 'duration', 'campaign', 'previous', 'default', 'converted_y']
df = data.select(*model_cols)
train_df, test_df = df.randomSplit(weights=[0.7, 0.3], seed=1)

#### 学習・モデルの確認

In [13]:
# Fit all pipeline stages on the training split only; the test split is
# held back for evaluation.
model = pipeline.fit(train_df)

In [14]:
# Coefficients of the fitted logistic regression, which is the last of the
# four pipeline stages (one value per entry of the `features` list).
lr_model = model.stages[-1]
lr_model.coefficients

DenseVector([0.0805, 0.1095, 0.9311, -0.4427, 0.3169, -0.0549])

In [15]:
# Intercept of the fitted logistic regression (last pipeline stage).
lr_model = model.stages[-1]
lr_model.intercept

-3.3476369699341983

#### 推論

In [16]:
# Inference on the training data: the fitted pipeline appends the
# intermediate feature columns and the prediction columns to each row.
pred_train = model.transform(train_df)
pred_train.show()

+---+-------+--------+--------+--------+-------+-----------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|default|converted_y|default_index|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+-------+-----------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| 18|    108|     167|       1|       0|     no|        1.0|          0.0|[18.0,108.0,167.0...|[1.69530409029658...|[2.74984510998285...|[0.93990460162342...|       0.0|
| 18|    608|     267|       1|       0|     no|        1.0|          0.0|[18.0,608.0,267.0...|[1.69530409029658...|[2.37190871404581...|[0.91465996723821...|       0.0|
| 18|   1944|     122|       3|       0|     no|        0.0|          0.0|[18.0,1944.0,122....|[1.69530409029658...|[3.13571448331020...|[0.9583421278

In [17]:
# Inference on the held-out test data using the pipeline fitted on the
# training split.
pred_test = model.transform(test_df)
pred_test.show()

+---+-------+--------+--------+--------+-------+-----------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|default|converted_y|default_index|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+-------+-----------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| 19|      0|     123|       3|       0|     no|        0.0|          0.0|[19.0,0.0,123.0,3...|[1.78948765086861...|[3.19306148478539...|[0.96057233208168...|       0.0|
| 19|     27|      86|      12|       0|     no|        0.0|          0.0|[19.0,27.0,86.0,1...|[1.78948765086861...|[4.62347551189535...|[0.99027685579104...|       0.0|
| 19|    779|     184|       4|       0|     no|        1.0|          0.0|[19.0,779.0,184.0...|[1.78948765086861...|[3.09003796035752...|[0.9564799452

In [18]:
# 'prediction' is the predicted class.
# 'probability' holds the probability of class 0 followed by that of class 1.
result_cols = ['probability', 'prediction']
pred_train.select(*result_cols).show(truncate=False)

+-----------------------------------------+----------+
|probability                              |prediction|
+-----------------------------------------+----------+
|[0.939904601623422,0.06009539837657796]  |0.0       |
|[0.914659967238211,0.08534003276178903]  |0.0       |
|[0.9583421278443363,0.041657872155663744]|0.0       |
|[0.971280731883753,0.028719268116247032] |0.0       |
|[0.9212498194751397,0.07875018052486027] |0.0       |
|[0.919390022147477,0.080609977852523]    |0.0       |
|[0.9627065076829667,0.03729349231703327] |0.0       |
|[0.9574527537681594,0.042547246231840585]|0.0       |
|[0.9248990528547075,0.07510094714529247] |0.0       |
|[0.946136738944358,0.05386326105564199]  |0.0       |
|[0.9480514709504603,0.05194852904953973] |0.0       |
|[0.9734610885576411,0.02653891144235887] |0.0       |
|[0.971285348429445,0.028714651570554972] |0.0       |
|[0.9414304786920291,0.058569521307970884]|0.0       |
|[0.9367473886744273,0.0632526113255727]  |0.0       |
|[0.931304

#### 精度評価

In [19]:
# AUC (area under the ROC curve) on the training data.
# metricName is written out explicitly; it is the evaluator's default.
evaluator = BinaryClassificationEvaluator(
    labelCol='converted_y',
    metricName='areaUnderROC',
)
auc = evaluator.evaluate(pred_train)
auc

0.8323554117294152

In [20]:
# AUC (area under the ROC curve) on the held-out test data.
# metricName is written out explicitly; it is the evaluator's default.
evaluator = BinaryClassificationEvaluator(
    labelCol='converted_y',
    metricName='areaUnderROC',
)
auc = evaluator.evaluate(pred_test)
auc

0.8286565521242742