In [1]:
!pip install pyspark findspark



In [2]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Launch Spark's Python workers with the same interpreter that runs this
# kernel. When the worker interpreter cannot be resolved, Python-backed jobs
# (e.g. createDataFrame on local rows) can abort with
# "Python worker failed to connect back". setdefault keeps any value the
# user has already exported.
os.environ.setdefault('PYSPARK_PYTHON', sys.executable)
os.environ.setdefault('PYSPARK_DRIVER_PYTHON', sys.executable)

conf = (
    SparkConf()
    .setMaster('local[*]')
    .set('spark.ui.port', '4050')
    .set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .set('spark.dynamicAllocation.enabled', 'true')
    # tracker, so that resources can be returned
    .set('spark.shuffle.service.enabled', 'true')
)

# Build the session directly from the conf: master and settings live in one
# place, and getOrCreate() makes the cell safe to re-run (a second
# SparkContext(conf=conf) call would raise).
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

Анализировать будем датасет отсюда: https://www.kaggle.com/shelvigarg/credit-card-buyers

Definition

ID - Unique Identifier for a row

Gender - Gender of the Customer

Age - Age of the Customer (in Years)

Region_Code - Code of the Region for the customers

Occupation - Occupation Type for the customer

Channel_Code - Acquisition Channel Code for the Customer (Encoded)

Vintage - Vintage for the Customer (In Months)

Credit_Product - If the Customer has any active credit product (Home loan, Personal loan, Credit Card etc.)

Avg_Account_Balance - Average Account Balance for the Customer in last 12 Months

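Is_Active - If the Customer is Active in last 3 Months

Is_Lead - Target: whether the Customer is interested in the Credit Card (0/1); used below as the label column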

Загрузим данные и посмотрим, что там внутри

In [3]:
# Read the CSV: the first row holds the column names, and column types are
# inferred from the values.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('credit_card_data.csv')
)

In [4]:
# Print column names, inferred types and nullability.
data.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Region_Code: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Channel_Code: string (nullable = true)
 |-- Vintage: integer (nullable = true)
 |-- Credit_Product: string (nullable = true)
 |-- Avg_Account_Balance: integer (nullable = true)
 |-- Is_Active: string (nullable = true)
 |-- Is_Lead: integer (nullable = true)



In [5]:
# Preview the first rows of the loaded DataFrame.
data.show()

+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+
|      ID|Gender|Age|Region_Code|   Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|
+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+
|NNVBBKZB|Female| 73|      RG268|        Other|          X3|     43|            No|            1045696|       No|      0|
|IDD62UNG|Female| 30|      RG277|     Salaried|          X1|     32|            No|             581988|       No|      0|
|HD3DSEMC|Female| 56|      RG268|Self_Employed|          X3|     26|            No|            1484315|      Yes|      0|
|BF3NC7KV|  Male| 34|      RG270|     Salaried|          X1|     19|            No|             470454|       No|      0|
|TEASRWXV|Female| 30|      RG282|     Salaried|          X1|     33|            No|             886787|       No|      0|
|ACUTYTWS|  Male| 56|   

Посмотрим различные базовые вещи

In [6]:
from pyspark.sql.functions import col,isnan, when, count

In [7]:
# Count missing values per column.
# isnan() is only defined for floating-point columns; on string columns it
# relies on an implicit string -> double cast, which may fail when ANSI mode
# is enabled. So NaN is checked only for float/double columns, NULL for all.
_float_cols = {name for name, dtype in data.dtypes if dtype in ('float', 'double')}

data.select([
    count(
        when((isnan(c) | col(c).isNull()) if c in _float_cols else col(c).isNull(), c)
    ).alias(c)
    for c in data.columns
]).show()

+---+------+---+-----------+----------+------------+-------+--------------+-------------------+---------+-------+
| ID|Gender|Age|Region_Code|Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|
+---+------+---+-----------+----------+------------+-------+--------------+-------------------+---------+-------+
|  0|     0|  0|          0|         0|           0|      0|         29325|                  0|        0|      0|
+---+------+---+-----------+----------+------------+-------+--------------+-------------------+---------+-------+



Пропуски есть только в кредитном продукте; логично заменить их на значение «No», то есть считать, что кредита нет

In [8]:
# Distribution of Credit_Product values, including the NULL group.
(
    data
    .groupBy('Credit_Product')
    .count()
    .show()
)

+--------------+------+
|Credit_Product| count|
+--------------+------+
|          NULL| 29325|
|            No|144357|
|           Yes| 72043|
+--------------+------+



In [9]:
# Treat a missing Credit_Product as "no active credit product".
data = data.na.fill('No', subset=['Credit_Product'])

Проверим

In [10]:
# Re-check the distribution: the NULL group should now be merged into "No".
(
    data
    .groupBy('Credit_Product')
    .count()
    .show()
)

+--------------+------+
|Credit_Product| count|
+--------------+------+
|            No|173682|
|           Yes| 72043|
+--------------+------+



Посмотрим на данные с точки зрения дисбаланса классов

In [11]:
# Total number of rows in the dataset.
data.count()

245725

In [12]:
import pyspark.sql.functions as F

In [13]:
# Share of each target class: per-class row count divided by the total row
# count, rounded to 2 decimals (the result column is still named 'count').
total_rows = data.count()
class_counts = data.groupBy('Is_Lead').count()
class_counts.withColumn('count', F.round(F.col('count') / total_rows, 2)).show()

+-------+-----+
|Is_Lead|count|
+-------+-----+
|      1| 0.24|
|      0| 0.76|
+-------+-----+



Ладно, достаточно, мы тут сейчас говорим про MLlib, всякие анализы - тема прошлого семинара

**Некоторые преобразования данных**

Начнем с простой обработки категориальных переменных

In [14]:
from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder

In [15]:
# Fit a StringIndexer on Gender and append the numeric GenderIndex column.
# After this cell `gender_indexer` refers to the fitted model.
gender_indexer = StringIndexer(inputCol="Gender", outputCol="GenderIndex").fit(data)
data = gender_indexer.transform(data)

data.show()

+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+
|      ID|Gender|Age|Region_Code|   Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|GenderIndex|
+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+
|NNVBBKZB|Female| 73|      RG268|        Other|          X3|     43|            No|            1045696|       No|      0|        1.0|
|IDD62UNG|Female| 30|      RG277|     Salaried|          X1|     32|            No|             581988|       No|      0|        1.0|
|HD3DSEMC|Female| 56|      RG268|Self_Employed|          X3|     26|            No|            1484315|      Yes|      0|        1.0|
|BF3NC7KV|  Male| 34|      RG270|     Salaried|          X1|     19|            No|             470454|       No|      0|        0.0|
|TEASRWXV|Female| 30|      RG282|     Salaried|          X1|  

In [16]:
# Labels learned by the fitted indexer; a label's position is its index.
gender_indexer.labels

['Male', 'Female']

Обратная трансформация (из индексов обратно в строки) доступна через трансформер IndexToString

In [17]:
# Map GenderIndex back to the original string labels.
converter_params = {"inputCol": "GenderIndex", "outputCol": "originalGender"}
converter = IndexToString(**converter_params)

data = converter.transform(data)
data.show()

+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+
|      ID|Gender|Age|Region_Code|   Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|GenderIndex|originalGender|
+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+
|NNVBBKZB|Female| 73|      RG268|        Other|          X3|     43|            No|            1045696|       No|      0|        1.0|        Female|
|IDD62UNG|Female| 30|      RG277|     Salaried|          X1|     32|            No|             581988|       No|      0|        1.0|        Female|
|HD3DSEMC|Female| 56|      RG268|Self_Employed|          X3|     26|            No|            1484315|      Yes|      0|        1.0|        Female|
|BF3NC7KV|  Male| 34|      RG270|     Salaried|          X1|     19|            No|             470454|   

Давайте аналогично поступим с каналом продаж и типом занятости

In [18]:
# Fit one StringIndexer per categorical column. Each indexer reads only its
# own input column, so both can be fitted before either is applied.
occupation_indexer = StringIndexer(inputCol="Occupation", outputCol="OccupationIndex").fit(data)
channel_indexer = StringIndexer(inputCol="Channel_Code", outputCol="ChannelIndex").fit(data)

# Append OccupationIndex first, then ChannelIndex.
data = channel_indexer.transform(occupation_indexer.transform(data))

data.show()

+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+
|      ID|Gender|Age|Region_Code|   Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|GenderIndex|originalGender|OccupationIndex|ChannelIndex|
+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+
|NNVBBKZB|Female| 73|      RG268|        Other|          X3|     43|            No|            1045696|       No|      0|        1.0|        Female|            2.0|         1.0|
|IDD62UNG|Female| 30|      RG277|     Salaried|          X1|     32|            No|             581988|       No|      0|        1.0|        Female|            1.0|         0.0|
|HD3DSEMC|Female| 56|      RG268|Self_Employed|          X3|     26|            No|            1484315|      Y

In [19]:
# Number of distinct categories learned by each indexer.
n_occupations = len(occupation_indexer.labels)
n_channels = len(channel_indexer.labels)
print(f'Occupation len = {n_occupations}, Channel_code len = {n_channels}')

Occupation len = 4, Channel_code len = 4


Тут по 4 категории, что самое простое, что приходит в голову? Правильно - OHE

In [20]:
# One-hot encode both index columns with a single encoder.
# After this cell `ohe_encoder` refers to the fitted model.
ohe_encoder = OneHotEncoder(
    inputCols=["OccupationIndex", "ChannelIndex"],
    outputCols=["OccupationVector", "ChannelVec"],
).fit(data)

data = ohe_encoder.transform(data)
data.show()

+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+----------------+-------------+
|      ID|Gender|Age|Region_Code|   Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|GenderIndex|originalGender|OccupationIndex|ChannelIndex|OccupationVector|   ChannelVec|
+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+----------------+-------------+
|NNVBBKZB|Female| 73|      RG268|        Other|          X3|     43|            No|            1045696|       No|      0|        1.0|        Female|            2.0|         1.0|   (3,[2],[1.0])|(3,[1],[1.0])|
|IDD62UNG|Female| 30|      RG277|     Salaried|          X1|     32|            No|             581988|       No|      0|        1.0|        Female|            1.0|

In [21]:
# Number of categories the fitted encoder found for each input column.
ohe_encoder.categorySizes

[4, 4]

Странный формат, не правда ли? Все из-за того, что тут у нас SparseVector

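 На 4 категории достаточно вектора размерности 3: по умолчанию (dropLast=True) последняя категория отбрасывается и кодируется нулевым вектором. SparseVector хранит размер вектора, индексы ненулевых позиций и значения в них (здесь 1 на позиции нужной категории)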

In [22]:
# Look at a single encoded value to see the SparseVector representation.
data.select('OccupationVector').head()

Row(OccupationVector=SparseVector(3, {2: 1.0}))

Теперь все надо собрать в одну структуру, чтобы можно было анализировать данные и строить модели

In [23]:
from pyspark.ml.feature import VectorAssembler

In [24]:
# Preview 5 rows to see which columns are available for assembling.
data.show(5)

+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+----------------+-------------+
|      ID|Gender|Age|Region_Code|   Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|GenderIndex|originalGender|OccupationIndex|ChannelIndex|OccupationVector|   ChannelVec|
+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+----------------+-------------+
|NNVBBKZB|Female| 73|      RG268|        Other|          X3|     43|            No|            1045696|       No|      0|        1.0|        Female|            2.0|         1.0|   (3,[2],[1.0])|(3,[1],[1.0])|
|IDD62UNG|Female| 30|      RG277|     Salaried|          X1|     32|            No|             581988|       No|      0|        1.0|        Female|            1.0|

In [25]:
# Columns that go into the feature vector.
numeric_features = ['Age', 'Vintage', 'Avg_Account_Balance']
indexed_features = ['GenderIndex']
one_hot_features = ['OccupationVector', 'ChannelVec']

feature_columns = numeric_features + indexed_features + one_hot_features

In [26]:
# Concatenate the selected columns into a single 'features' vector column.
df_va = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
data = df_va.transform(data)

In [27]:
# Schema after all transformations; 'features' should be a vector column.
data.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Region_Code: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Channel_Code: string (nullable = true)
 |-- Vintage: integer (nullable = true)
 |-- Credit_Product: string (nullable = false)
 |-- Avg_Account_Balance: integer (nullable = true)
 |-- Is_Active: string (nullable = true)
 |-- Is_Lead: integer (nullable = true)
 |-- GenderIndex: double (nullable = false)
 |-- originalGender: string (nullable = true)
 |-- OccupationIndex: double (nullable = false)
 |-- ChannelIndex: double (nullable = false)
 |-- OccupationVector: vector (nullable = true)
 |-- ChannelVec: vector (nullable = true)
 |-- features: vector (nullable = true)



In [28]:
# Preview the rows together with the assembled 'features' column.
data.show()

+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+----------------+-------------+--------------------+
|      ID|Gender|Age|Region_Code|   Occupation|Channel_Code|Vintage|Credit_Product|Avg_Account_Balance|Is_Active|Is_Lead|GenderIndex|originalGender|OccupationIndex|ChannelIndex|OccupationVector|   ChannelVec|            features|
+--------+------+---+-----------+-------------+------------+-------+--------------+-------------------+---------+-------+-----------+--------------+---------------+------------+----------------+-------------+--------------------+
|NNVBBKZB|Female| 73|      RG268|        Other|          X3|     43|            No|            1045696|       No|      0|        1.0|        Female|            2.0|         1.0|   (3,[2],[1.0])|(3,[1],[1.0])|[73.0,43.0,104569...|
|IDD62UNG|Female| 30|      RG277|     Salaried|          X1|     32|            

В полученном векторе features можно автоматически проанализировать все переменные: если у какой-то из них число уникальных значений меньше заданного вами порога, она автоматически будет переведена в индексы при помощи VectorIndexer (from pyspark.ml.feature import VectorIndexer)

**Статистика**

В ml pyspark есть некоторые статистические методы, которые можно использовать для анализа

Корреляция

In [29]:
from pyspark.ml.stat import Correlation

In [30]:
# Correlation.corr returns a one-row DataFrame whose only cell is the
# correlation matrix of the 'features' vector.
pearson_row = Correlation.corr(data, 'features', method='pearson').head()
corr = pearson_row[0]

In [31]:
# Display the correlation matrix object (a DenseMatrix).
corr

DenseMatrix(10, 10, [1.0, 0.6312, 0.1452, -0.1521, 0.1527, -0.5632, 0.3948, -0.6646, ..., 0.0102, -0.116, 0.2933, -0.3238, 0.0005, -0.5272, -0.3843, 1.0], False)

In [32]:
# Convert the matrix to a NumPy array for easier reading.
corr.toArray()

array([[ 1.00000000e+00,  6.31242411e-01,  1.45232189e-01,
        -1.52075940e-01,  1.52651808e-01, -5.63226982e-01,
         3.94834177e-01, -6.64600051e-01,  4.56338440e-01,
         2.73153253e-01],
       [ 6.31242411e-01,  1.00000000e+00,  1.67433481e-01,
        -1.46379743e-01,  2.21023818e-01, -4.10109383e-01,
         1.55662661e-01, -5.71828453e-01,  5.38828562e-01,
         1.44931244e-01],
       [ 1.45232189e-01,  1.67433481e-01,  1.00000000e+00,
        -2.24772031e-02,  3.46714040e-03, -7.16906860e-02,
         6.03874569e-02, -9.81785292e-02,  1.06905544e-01,
         1.01634033e-02],
       [-1.52075940e-01, -1.46379743e-01, -2.24772031e-02,
         1.00000000e+00, -8.58626857e-02,  1.22439249e-01,
        -2.58175123e-02,  1.84372479e-01, -8.07817702e-02,
        -1.16018433e-01],
       [ 1.52651808e-01,  2.21023818e-01,  3.46714040e-03,
        -8.58626857e-02,  1.00000000e+00, -5.37283514e-01,
        -5.27660791e-01, -4.34990948e-01,  1.63662837e-01,
         2.

Можно вычислить и корреляцию Спирмена

In [33]:
# Same as above, but with rank-based (Spearman) correlation.
spearman_row = Correlation.corr(data, 'features', method='spearman').head()
corr = spearman_row[0]
corr.toArray()

array([[ 1.00000000e+00,  6.52477119e-01,  1.78395617e-01,
        -1.66276503e-01,  2.50479037e-01, -5.94192974e-01,
         3.16280765e-01, -7.14324073e-01,  4.68523607e-01,
         3.11874367e-01],
       [ 6.52477119e-01,  1.00000000e+00,  1.98039737e-01,
        -1.37241231e-01,  2.26617372e-01, -4.03612329e-01,
         1.44248315e-01, -5.43361590e-01,  4.88850315e-01,
         1.86509197e-01],
       [ 1.78395617e-01,  1.98039737e-01,  1.00000000e+00,
        -3.35726184e-02,  1.74003362e-02, -9.65169583e-02,
         6.80005979e-02, -1.34528255e-01,  1.34941462e-01,
         2.41122172e-02],
       [-1.66276503e-01, -1.37241231e-01, -3.35726184e-02,
         1.00000000e+00, -8.58626857e-02,  1.22439249e-01,
        -2.58175123e-02,  1.84372479e-01, -8.07817702e-02,
        -1.16018433e-01],
       [ 2.50479037e-01,  2.26617372e-01,  1.74003362e-02,
        -8.58626857e-02,  1.00000000e+00, -5.37283514e-01,
        -5.27660791e-01, -4.34990948e-01,  1.63662837e-01,
         2.

Можно использовать хи-квадрат тест для оценки независимости каждой переменной в features относительно целевого признака, но этот тест предназначен для категориальных переменных, поэтому для примера применим его к одной фиче

In [34]:
from pyspark.ml.stat import ChiSquareTest, KolmogorovSmirnovTest, Summarizer

In [35]:
# Chi-square independence test of every component of OccupationVector
# against the target Is_Lead.
r = ChiSquareTest.test(
    dataset=data,
    featuresCol="OccupationVector",
    labelCol="Is_Lead",
)

In [36]:
# One row with p-values, degrees of freedom and statistics per vector component.
r.show()

+--------------------+----------------+--------------------+
|             pValues|degreesOfFreedom|          statistics|
+--------------------+----------------+--------------------+
|[0.0,0.0,1.161583...|       [1, 1, 1]|[1420.86324574575...|
+--------------------+----------------+--------------------+



KS-тест

In [37]:
# Sample mean and standard deviation of Age; used as the reference
# parameters for the KS test below.
age_stats = [
    F.mean('Age').alias('mean_Age'),
    F.stddev('Age').alias('std_Age'),
]
data.select(*age_stats).collect()

[Row(mean_Age=43.85630684708516, std_Age=14.828671804648)]

In [38]:
# Kolmogorov-Smirnov test of Age against a normal distribution.
# (44, 15) are the rounded sample mean and standard deviation of Age.
normal_params = (44, 15)
ks = KolmogorovSmirnovTest.test(data, 'Age', 'norm', *normal_params).first()

In [39]:
# Row with the p-value and the KS statistic.
ks

Row(pValue=2.045950076023928e-10, statistic=0.12561207843265512)

Еще можно посчитать разные статистики

In [40]:
# Per-component mean of the feature vector plus the row count.
summarizer = Summarizer.metrics("mean", "count")
features_summary = summarizer.summary(data.features)
data.select(features_summary).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|aggregate_metrics(features, 1.0)                                                                                                                                                                         |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{[43.8563068470851,46.95914131651203,1128403.1010194335,0.4538732322718486,0.4105646556109472,0.29300640960423235,0.2855753382846678,0.42208973445925324,0.2796296673110184,0.27561705158205313], 245725}|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

**Работа с фичами**

Квантизация

In [41]:
from pyspark.ml.feature import QuantileDiscretizer

Обучаем

In [42]:
# Learn 5 quantile-based buckets for Age.
# After this cell `discretizer` refers to the fitted model.
discretizer = QuantileDiscretizer(
    numBuckets=5,
    inputCol="Age",
    outputCol="Age_quant",
).fit(data)

In [43]:
# Append the Age_quant bucket index column.
data = discretizer.transform(data)

In [44]:
# Age range and row count of every bucket, ordered by bucket index.
bucket_stats = [
    F.min('Age').alias('min_age'),
    F.max('Age').alias('max_age'),
    F.count('Age').alias('count'),
]
(
    data.select('Age', 'Age_quant')
    .groupby('Age_quant')
    .agg(*bucket_stats)
    .orderBy('Age_quant')
    .show(5)
)

+---------+-------+-------+-----+
|Age_quant|min_age|max_age|count|
+---------+-------+-------+-----+
|      0.0|     23|     28|43790|
|      1.0|     29|     35|52017|
|      2.0|     36|     46|46007|
|      3.0|     47|     55|50808|
|      4.0|     56|     85|53103|
+---------+-------+-------+-----+



Заполнить пропуски можно через Imputer

Imputer умеет заполнять пропуски только в числовых переменных, поэтому попробуем его на игрушечном примере



In [45]:
from pyspark.ml.feature import Imputer

In [47]:
# Toy frame with NaN gaps in both columns.
NAN = float("nan")
toy_rows = [
    (1.0, NAN),
    (2.0, NAN),
    (NAN, 3.0),
    (4.0, 4.0),
    (5.0, 5.0),
]
df = spark.createDataFrame(toy_rows, ["a", "b"])

# strategy can be 'mean', 'median' or 'mode'
# setMissingValue(0.0) can be used to declare that 0 marks a missing value
imputer = Imputer(
    inputCols=["a", "b"],
    outputCols=["out_a", "out_b"],
    strategy='mean',
).fit(df)
imputer.transform(df).show()

Py4JJavaError: An error occurred while calling o561.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 79.0 failed 1 times, most recent failure: Lost task 6.0 in stage 79.0 (TID 160) (DESKTOP-TS0U1T0 executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 34 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 34 more


**Pipeline**

Как и в scikit-learn можно создавать пайплайны обработки данных

Мы много делали преобразований, давайте соберем все в 1 пайплайн

In [None]:
from pyspark.ml import Pipeline

In [None]:
# strings -> indices
gender_indexer, occupation_indexer, channel_indexer = (
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in [
        ("Gender", "GenderIndex"),
        ("Occupation", "OccupationIndex"),
        ("Channel_Code", "ChannelIndex"),
    ]
)

# OHE
ohe_encoder = OneHotEncoder(
    inputCols=["OccupationIndex", "ChannelIndex"],
    outputCols=["OccupationVector", "ChannelVec"],
)

# quantization
discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Age", outputCol="Age_quant")

# assemble everything into a single vector
feature_columns = (
    ['Age', 'Vintage', 'Avg_Account_Balance']
    + ['GenderIndex']
    + ['OccupationVector', 'ChannelVec']
    + ['Age_quant']
)
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

Собираем все в пайплайн

In [None]:
# Stages run in list order: indexers -> one-hot encoder -> discretizer -> assembler.
preprocessing_stages = [
    gender_indexer,
    occupation_indexer,
    channel_indexer,
    ohe_encoder,
    discretizer,
    vector_assembler,
]
pipeline = Pipeline(stages=preprocessing_stages)

Давайте заново загрузим данные и сделаем трансформацию

---



In [None]:
# Reload the raw data, apply the same NULL -> 'No' fix, and fit every
# pipeline stage. After this cell `pipeline` refers to the fitted model.
data = (
    spark.read
    .csv('credit_card_data.csv', header=True, inferSchema=True)
    .fillna({'Credit_Product': 'No'})
)
pipeline = pipeline.fit(data)

In [None]:
# Run the raw data through all fitted stages.
transformed_data = pipeline.transform(data)

In [None]:
# Preview the transformed data with all intermediate columns.
transformed_data.show()

In [None]:
# Only the two columns a model needs: the label and the feature vector.
transformed_data.select('Is_Lead', 'features').show(5)

**Модельки**

Пора нам уже что-то обучить, начнем с логрега

In [None]:
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel

In [None]:
# Configure and fit logistic regression on the transformed data.
# After this cell `lr` refers to the fitted model.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Is_Lead',
    predictionCol='prediction',
    probabilityCol='proba',
    maxIter=100,
).fit(transformed_data)

Сохраним

In [None]:
# Overwrite a previously saved model so the cell can be re-run;
# a plain save() raises if the target path already exists.
lr.write().overwrite().save('logreg_model')

Загрузка

In [None]:
# Load the saved model back from disk into a separate variable.
lr2 = LogisticRegressionModel.load('logreg_model')

Коэффициенты и метрики

In [None]:
# Parameters of the model fitted in this session.
print(f"Coefficients: {lr.coefficients}")
print(f"Intercept: {lr.intercept}")

In [None]:
# Parameters of the model loaded from disk; they should match the ones above.
print(f"Coefficients: {lr2.coefficients}")
print(f"Intercept: {lr2.intercept}")

In [None]:
# ROC AUC from the training summary, i.e. measured on the data the model was fit on.
print(f'ROC_AUC = {lr.summary.areaUnderROC}')

In [None]:
# Recall for each label, taken from the training summary.
lr.summary.recallByLabel

In [None]:
# List the parameters defined on the fitted model.
lr.params

In [None]:
# Score the data: transform() appends the model's output columns.
lr.transform(transformed_data.select('Is_Lead', 'features')).show()

**Подбор параметров**

Тут нет всяких hyperopt, optuna...есть стандартная кросс-валидация и поиск по сетке

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

Для этого соберем все в пайплайн. Можно было "вложить" старый пайплайн в новый, но соберем все с самого начала

In [None]:
# strings -> indices
indexer_columns = {
    "Gender": "GenderIndex",
    "Occupation": "OccupationIndex",
    "Channel_Code": "ChannelIndex",
}
gender_indexer, occupation_indexer, channel_indexer = (
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in indexer_columns.items()
)

# OHE
ohe_encoder = OneHotEncoder(
    inputCols=["OccupationIndex", "ChannelIndex"],
    outputCols=["OccupationVector", "ChannelVec"],
)

# quantization
discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Age", outputCol="Age_quant")

# assemble everything into a single vector
feature_columns = [
    'Age', 'Vintage', 'Avg_Account_Balance',  # numeric
    'GenderIndex',                            # indexed
    'OccupationVector', 'ChannelVec',         # one-hot
    'Age_quant',                              # bucketed
]
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# unfitted estimator: it becomes the last pipeline stage
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Is_Lead',
    predictionCol='prediction',
    probabilityCol='proba',
    maxIter=100,
)

In [None]:
# Preprocessing stages followed by the classifier.
preprocessing_stages = [
    gender_indexer,
    occupation_indexer,
    channel_indexer,
    ohe_encoder,
    discretizer,
    vector_assembler,
]
pipeline = Pipeline(stages=preprocessing_stages + [lr])

Сетка параметров

In [None]:
# 2 x 2 grid: number of Age buckets x number of logistic regression iterations.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(discretizer.numBuckets, [5, 10])
grid_builder = grid_builder.addGrid(lr.maxIter, [10, 20])
paramGrid = grid_builder.build()

Разобьем данные на train, test

In [None]:
# 70/30 random split with a fixed seed.
train, test = data.randomSplit(weights=[0.7, 0.3], seed=7)

Описываем стратегию кросс-валидации

In [None]:
# 2-fold cross-validation over the parameter grid, scored by ROC AUC.
# A fixed seed makes the fold assignment reproducible, in line with the
# seeded train/test split; parallelism=2 evaluates two candidates at a time.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                                                  labelCol='Is_Lead', metricName='areaUnderROC'),
                          numFolds=2,
                          seed=7,
                          parallelism=2)

Прогоняем сетку. Знаю, перебор по сетке — прошлый век, но что поделать)

In [None]:
# Run the grid search with cross-validation on the training part only.
cvModel = crossval.fit(train)

In [None]:
# Average cross-validation metric for each parameter map in the grid.
cvModel.avgMetrics

Параметры

In [None]:
import numpy as np

# The parameter map with the highest average metric is the best one
# (ROC AUC: larger is better).
best_index = np.argmax(cvModel.avgMetrics)
best_param_map = cvModel.getEstimatorParamMaps()[best_index]
print(best_param_map)

Сделаем предикт

In [None]:
# Score the held-out test set with the cross-validated model.
test_pred = cvModel.transform(test)

In [None]:
# Preview the test rows together with the prediction columns.
test_pred.show()

Проверим модель

In [None]:
# Same metric as in cross-validation: ROC AUC computed from rawPrediction.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Is_Lead',
    metricName='areaUnderROC',
)

In [None]:
# ROC AUC on the held-out test set.
evaluator.evaluate(test_pred)

Сохраним пайплайн

In [None]:
# Overwrite a previously saved model so the cell can be re-run;
# without overwrite() the save raises if the target path already exists.
cvModel.write().overwrite().save('model')

Вместо кросс-валидации для подбора параметров можно взять TrainValidationSplit — это аналог train_test_split: данные один раз делятся на обучающую и валидационную части

**Ваша любимая домашка**

Те, кто проходил курс GPU, прекрасно знают этот датасет.
Данные находятся в файле Train_Set_90621.csv
Amount Defaulted - эту переменную нужно удалить=)

Что ожидается? - творчество)

    1) Начните с анализа баланса классов, пропусков, статистик при помощи DataFrame API
    2) Посмотрите статистики, заполните пропуски уже при помощи MLlib
    3) Соберите пайплайн, похожий на наш, где будет обработка данных, обучение моделей и все при помощи Spark
    4) Разбейте данные на train/test + реализуйте подбор параметров одним из способов спарка
    5) Сохраните пайплайн на диск
    6) Проверьте качество модели на отложенной test выборке