## ロジスティック回帰による分類予測

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session on the "local" master for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("logistic_regression")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble


23/03/11 20:11:17 WARN Utils: Your hostname, NONAME resolves to a loopback address: 127.0.1.1; using 172.18.233.170 instead (on interface eth0)
23/03/11 20:11:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/11 20:11:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the semicolon-delimited CSV; the first row is used as the header and
# column types are inferred by Spark.
filename = "./data/bank-full.csv"
data = spark.read.csv(
    filename,
    sep=";",
    header=True,
    inferSchema=True,
)
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [4]:
# Inspect the column names and the data types assigned when the CSV was read.
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [7]:
# Create the target variable: y1 is 1.0 where y == "yes" and 0.0 otherwise.
from pyspark.sql.functions import lit, when, col

is_yes = col("y") == "yes"
y1_column = when(is_yes, lit(1.0)).otherwise(lit(0.0))
data1 = data.withColumn("y1", y1_column)
data1.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y| y1|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|0.0|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|0.0|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|0.0|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|0.0|
| 33|     unknown|  single|  unknown|     no|      1|  

In [8]:
# Handle the string column "default": index it into the "default_index" column.
from pyspark.ml.feature import StringIndexer

default_index = StringIndexer(
    inputCol="default",
    outputCol="default_index",
)

In [9]:
# Assemble the model inputs into a single "features" vector column.
from pyspark.ml.feature import VectorAssembler

feature_columns = ["age", "balance", "duration", "campaign", "previous", "default_index"]
assemble = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [10]:
# Standardization: scale the assembled "features" vector into "scaled_features".
# NOTE(review): no withMean/withStd arguments are passed, so the library
# defaults apply — confirm whether mean-centering is also wanted here.
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
)


In [12]:
# Logistic regression estimator: reads "scaled_features" and learns the label "y1".
from pyspark.ml.classification import LogisticRegression

logistic_regression = LogisticRegression(
    featuresCol="scaled_features",
    labelCol="y1",
)

In [13]:
# Register the pipeline stages in order: index -> assemble -> scale -> classify.
from pyspark.ml import Pipeline

pipeline_stages = [default_index, assemble, scaler, logistic_regression]
pipeline = Pipeline(stages=pipeline_stages)

In [15]:
# Keep only the columns the pipeline needs, plus the original (y) and
# numeric (y1) targets.
selected_columns = ["age", "balance", "duration", "campaign", "previous", "default", "y", "y1"]
df = data1.select(*selected_columns)

In [16]:
# Split into training and test data with weights 0.7 / 0.3, using a fixed seed.
train_weight, test_weight = 0.7, 0.3
train_df, test_df = df.randomSplit([train_weight, test_weight], seed=1234)

In [17]:
# Fit the whole pipeline (all registered stages) on the training data.
fit_model = pipeline.fit(train_df)

                                                                                

23/03/11 20:32:47 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/03/11 20:32:47 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [18]:
# Inspect the modeling result: coefficients of the last pipeline stage
# (the fitted logistic regression; the pipeline has four stages, so this is index 3).
lr_model = fit_model.stages[-1]
lr_model.coefficients

DenseVector([0.0461, 0.1101, 0.9227, -0.4444, 0.2843, -0.0603])

In [20]:
# Intercept of the fitted logistic regression (last of the four pipeline stages).
fit_model.stages[-1].intercept

-3.235967044154471

In [21]:
# Run inference on the training data with the fitted pipeline.
pred_train = fit_model.transform(train_df)

In [22]:
# Preview the training predictions, including the columns added by the pipeline.
pred_train.show()

+---+-------+--------+--------+--------+-------+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|default|  y| y1|default_index|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+-------+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| 18|     35|     104|       2|       0|     no| no|0.0|          0.0|[18.0,35.0,104.0,...|[1.69579509967235...|[3.06350348098031...|[0.95536194323606...|       0.0|
| 18|    108|      92|       1|       1|     no|yes|1.0|          0.0|[18.0,108.0,92.0,...|[1.69579509967235...|[2.81052152711537...|[0.94324174651528...|       0.0|
| 18|    108|     169|       1|       0|     no|yes|1.0|          0.0|[18.0,108.0,169.0...|[1.69579509967235...|[2.68263210486711...|[0.93599399206992...|       0.0|
| 18

In [23]:
# Show the raw scores and class probabilities without truncating the vectors.
pred_train.select("rawPrediction", "probability").show(truncate=False)

+----------------------------------------+-----------------------------------------+
|rawPrediction                           |probability                              |
+----------------------------------------+-----------------------------------------+
|[3.0635034809803106,-3.0635034809803106]|[0.9553619432360639,0.04463805676393606] |
|[2.810521527115374,-2.810521527115374]  |[0.9432417465152875,0.05675825348471253] |
|[2.682632104867116,-2.682632104867116]  |[0.9359939920699281,0.06400600793007194] |
|[1.7481311839187144,-1.7481311839187144]|[0.8517169345743546,0.1482830654256454]  |
|[2.105961860368695,-2.105961860368695]  |[0.8914812910541509,0.10851870894584914] |
|[1.7413142898799734,-1.7413142898799734]|[0.8508539277745952,0.14914607222540477] |
|[2.364111773608193,-2.364111773608193]  |[0.9140493896360999,0.08595061036390006] |
|[2.3088349021877366,-2.3088349021877366]|[0.9096061033526104,0.09039389664738962] |
|[3.071697801995961,-3.071697801995961]  |[0.9557100931299585,0.0

In [24]:
# Sanity check: applying the logistic function to the first rawPrediction row
# should reproduce the first probability row shown above.
z = np.array([3.0635034809803106, -3.0635034809803106])
exp_neg_z = np.exp(-z)
q = 1 / (1 + exp_neg_z)
print(q)

[0.95536194 0.04463806]


In [25]:
# Accuracy evaluation (training data): compute the AUC.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# The evaluator compares its score column against the numeric label y1.
evaluator = BinaryClassificationEvaluator(labelCol="y1")
AUC = evaluator.evaluate(pred_train)
print(AUC)

                                                                                

0.8331547859811365


In [26]:
# Preview the test data.
test_df.show()

+---+-------+--------+--------+--------+-------+---+---+
|age|balance|duration|campaign|previous|default|  y| y1|
+---+-------+--------+--------+--------+-------+---+---+
| 18|      3|     130|       2|       0|     no|yes|1.0|
| 18|      5|     143|       2|       0|     no| no|0.0|
| 18|    108|     167|       1|       0|     no|yes|1.0|
| 19|    103|      96|       2|       2|     no| no|0.0|
| 19|    108|     168|       1|       2|     no|yes|1.0|
| 19|    108|     273|       2|       1|     no|yes|1.0|
| 19|    134|     271|       2|       0|     no|yes|1.0|
| 19|    179|      62|       3|       0|     no| no|0.0|
| 19|    291|     291|       5|       0|     no| no|0.0|
| 19|    329|     169|       1|       2|     no|yes|1.0|
| 19|    329|     252|       2|       0|     no|yes|1.0|
| 19|    526|     122|       3|       0|     no| no|0.0|
| 19|   1803|     124|       1|       1|     no| no|0.0|
| 20|   -322|      73|       4|       0|     no| no|0.0|
| 20|     29|      85|       2|

In [27]:
# Number of rows in the test data.
test_df.count()

13562

In [28]:
# Run inference on the test data with the pipeline fitted on the training data.
pred_test = fit_model.transform(test_df)

In [29]:
# Preview the test predictions.
pred_test.show()

+---+-------+--------+--------+--------+-------+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|default|  y| y1|default_index|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+-------+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| 18|      3|     130|       2|       0|     no|yes|1.0|          0.0|[18.0,3.0,130.0,2...|[1.69579509967235...|[2.97025888993604...|[0.95121229283086...|       0.0|
| 18|      5|     143|       2|       0|     no| no|0.0|          0.0|[18.0,5.0,143.0,2...|[1.69579509967235...|[2.92298874908348...|[0.94897122314169...|       0.0|
| 18|    108|     167|       1|       0|     no|yes|1.0|          0.0|[18.0,108.0,167.0...|[1.69579509967235...|[2.68989335994989...|[0.93642763358003...|       0.0|
| 19

In [30]:
# Accuracy evaluation (test data): compute the AUC on the test predictions.
# This cell scores pred_test, not the training predictions.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The metric is spelled out so the printed number is unambiguous to the reader.
evaluator = BinaryClassificationEvaluator(labelCol="y1", metricName="areaUnderROC")
AUC = evaluator.evaluate(pred_test)
print(AUC)

0.8321723113642512


#### 混同行列

In [32]:

# Collect the true labels (y1) of the test predictions into a pandas DataFrame.
from sklearn.metrics import confusion_matrix

y_true = pred_test.select("y1").toPandas()
y_true

Unnamed: 0,y1
0,1.0
1,0.0
2,1.0
3,0.0
4,1.0
...,...
13557,0.0
13558,1.0
13559,1.0
13560,0.0


In [33]:
# Collect the predicted labels of the test data into a pandas DataFrame.
# NOTE(review): y_true and y_pred are produced by separate toPandas() calls, so
# the metrics computed from them rely on both calls returning rows in the same
# order — confirm, or collect both columns with a single toPandas() call.
from sklearn.metrics import confusion_matrix

y_pred = pred_test.select("prediction").toPandas()
y_pred

Unnamed: 0,prediction
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
13557,0.0
13558,1.0
13559,1.0
13560,0.0


In [35]:
# Confusion matrix on the test data, with classes ordered as listed in class_name
# (per scikit-learn's convention rows are true classes, columns predicted classes).
class_name = [0, 1]
cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_name)
cnf_matrix

array([[11749,   214],
       [ 1304,   295]])

In [37]:
# Unpack the 2x2 confusion matrix in row-major order into its four cells.
tn, fp, fn, tp = cnf_matrix.ravel()
tn, fp, fn, tp

(11749, 214, 1304, 295)

In [38]:
# Overall accuracy of the test predictions.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_true, y_pred)
print("accuracy:", accuracy)

accuracy: 0.8880696062527651


In [39]:
# Precision, recall and F1 of the test predictions, printed in that order.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, metric_fn(y_true, y_pred))


precision_score: 0.5795677799607073
recall: 0.18449030644152595
f1_score: 0.27988614800759015
