In [1]:
# Import other modules not related to PySpark
import os
import sys
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import math
from IPython.core.interactiveshell import InteractiveShell
from datetime import *
import statistics as stats
import seaborn as sns
# This helps auto print out the items without explixitly using 'print'
InteractiveShell.ast_node_interactivity = "all" 
%matplotlib inline

In [2]:
# Import PySpark related modules
import pyspark
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions
from pyspark.sql.functions import lit, desc, col, size, array_contains\
, isnan, udf, hour, array_min, array_max, countDistinct
from pyspark.sql.types import *

MAX_MEMORY = '4G'
# Initialize a spark session.
conf = pyspark.SparkConf().setMaster("local[*]") \
        .set('spark.executor.heartbeatInterval', 10000) \
        .set('spark.network.timeout', 10000) \
        .set("spark.core.connection.ack.wait.timeout", "3600") \
        .set("spark.executor.memory", MAX_MEMORY) \
        .set("spark.driver.memory", MAX_MEMORY)
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Pyspark guide") \
        .config(conf=conf) \
        .getOrCreate()
    return spark

spark = init_spark()

df = spark.read.parquet('bar.parquet')
df.show()

print('Data frame type: ' + str(type(df)))

+----------------------+------+-----------------+---------------------+-----------------------+--------------------+--------------------------------+------------------------------------+--------------------------------------+
|count_in_links_to_item| views|count_item_aliase|en_description_length|count_out_links_to_item|views_was_is_missing|count_item_aliase_was_is_missing|en_description_length_was_is_missing|count_out_links_to_item_was_is_missing|
+----------------------+------+-----------------+---------------------+-----------------------+--------------------+--------------------------------+------------------------------------+--------------------------------------+
|                    18|163515|               21|                   45|                     58|                   0|                               0|                                   0|                                     0|
|                    31| 42880|                2|                   54|                     61| 

# **Задача регрессии**
## **Линейная регрессии**

In [3]:
columns = ['count_out_links_to_item', 'count_in_links_to_item', 'views_was_is_missing', 'count_item_aliase', 'en_description_length']
df_target = df.select(columns)
df_target.show(4)

+-----------------------+----------------------+--------------------+-----------------+---------------------+
|count_out_links_to_item|count_in_links_to_item|views_was_is_missing|count_item_aliase|en_description_length|
+-----------------------+----------------------+--------------------+-----------------+---------------------+
|                     58|                    18|                   0|               21|                   45|
|                     61|                    31|                   0|                2|                   54|
|                    633|                   431|                   0|               12|                   33|
|                    122|                    34|                   0|                6|                   25|
+-----------------------+----------------------+--------------------+-----------------+---------------------+
only showing top 4 rows



In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

selected_columns = ['count_in_links_to_item', 'views_was_is_missing', 'count_item_aliase', 'en_description_length']
data_subset = df_target.select(columns)
# Создадим столбец features, который объединяет все признаки в один вектор
assembler = VectorAssembler(inputCols=selected_columns, outputCol="features")
 # = assembler.transform(df_target)
#final_data = data_subset.select("features", "count_out_links_to_item")

train_data, test_data = data_subset.randomSplit([0.7, 0.3], seed=42)

normalizer_link = Normalizer(inputCol='features', outputCol='norm_features', p=1.0)

# Создание модели LinearRegression
lr = LinearRegression(featuresCol="norm_features", labelCol="count_out_links_to_item", predictionCol="predicted_count_out_links_to_item")

# Создание конвейера
pipeline = Pipeline(stages=[assembler, normalizer_link, lr])

# Обучение модели на обучающей выборке
lr_model = pipeline.fit(train_data)

# Прогнозирование на тестовой выборке
predictions = lr_model.transform(test_data)
predictions.show(5)

+-----------------------+----------------------+--------------------+-----------------+---------------------+------------------+--------------------+---------------------------------+
|count_out_links_to_item|count_in_links_to_item|views_was_is_missing|count_item_aliase|en_description_length|          features|       norm_features|predicted_count_out_links_to_item|
+-----------------------+----------------------+--------------------+-----------------+---------------------+------------------+--------------------+---------------------------------+
|                      4|                     1|                   0|                1|                   27|[1.0,0.0,1.0,27.0]|[0.03448275862068...|                67.60023482007189|
|                      5|                     2|                   0|                1|                   83|[2.0,0.0,1.0,83.0]|[0.02325581395348...|                66.85753131120306|
|                      6|                     5|                   0|           

In [5]:
# Оценка качества модели
evaluator = RegressionEvaluator(labelCol="count_out_links_to_item", predictionCol="predicted_count_out_links_to_item", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="count_out_links_to_item", predictionCol="predicted_count_out_links_to_item", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R2) on test data: {:.3f}".format(r2))

Root Mean Squared Error (RMSE) on test data: 12.541
R-squared (R2) on test data: 0.105


In [6]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Определение сетки параметров для кросс-валидации
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()


# Создание объекта CrossValidator
cv = CrossValidator(estimator=pipeline,
                            estimatorParamMaps=param_grid,
                            evaluator=evaluator,
                            numFolds=4) 

# Обучение и подбор гиперпараметров
cv_model = cv.fit(train_data)

best_cv_model = cv_model.bestModel

# Оценка производительности на тестовом наборе
cv_prediction = best_cv_model.transform(test_data)

cv_prediction.show(8)

+-----------------------+----------------------+--------------------+-----------------+---------------------+------------------+--------------------+---------------------------------+
|count_out_links_to_item|count_in_links_to_item|views_was_is_missing|count_item_aliase|en_description_length|          features|       norm_features|predicted_count_out_links_to_item|
+-----------------------+----------------------+--------------------+-----------------+---------------------+------------------+--------------------+---------------------------------+
|                      4|                     1|                   0|                1|                   27|[1.0,0.0,1.0,27.0]|[0.03448275862068...|                67.85803511692474|
|                      5|                     2|                   0|                1|                   83|[2.0,0.0,1.0,83.0]|[0.02325581395348...|                67.09626042576396|
|                      6|                     5|                   0|           

In [7]:
# Оценка качества модели
evaluator = RegressionEvaluator(labelCol="count_out_links_to_item", predictionCol="predicted_count_out_links_to_item", metricName="rmse")
rmse = evaluator.evaluate(cv_prediction)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="count_out_links_to_item", predictionCol="predicted_count_out_links_to_item", metricName="r2")
r2 = evaluator_r2.evaluate(cv_prediction)
print("R-squared (R2) on test data: {:.3f}".format(r2))

Root Mean Squared Error (RMSE) on test data: 12.549
R-squared (R2) on test data: 0.103


## **Градиентный бустинг**

In [13]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Выберем только нужные колонки
selected_columns = ['views_was_is_missing', 'count_out_links_to_item', 'count_in_links_to_item', 'count_item_aliase', 'en_description_length']
data_subset = df_target.select(selected_columns)

# Создадим столбец features, который объединяет все признаки в один вектор
assembler = VectorAssembler(inputCols=selected_columns[1:], outputCol="features")
#data_subset = assembler.transform(data_subset)


# Разделим данные на обучающую и тестовую выборки
(training_data, test_data) = data_subset.randomSplit([0.7, 0.3], seed=42)

# Инициализация модели GBTClassifier
gbt = GBTClassifier(labelCol="views_was_is_missing", featuresCol="features", maxIter=10)

# Инициализация оценщика (evaluator)
evaluator = MulticlassClassificationEvaluator(labelCol="views_was_is_missing", predictionCol="prediction", metricName="accuracy")

# evaluator_mse = RegressionEvaluator(labelCol="count_out_links_to_item", predictionCol="prediction", metricName="mse")

# Создание конвейера
pipeline = Pipeline(stages=[assembler, gbt])

# Обучение модели на обучающей выборке
gbt_model = pipeline.fit(training_data)

prediction = gbt_model.transform(test_data)
predicted = prediction.select("features", "prediction", "views_was_is_missing")
predicted.show(10)

+-------------------+----------+--------------------+
|           features|prediction|views_was_is_missing|
+-------------------+----------+--------------------+
| [4.0,1.0,1.0,27.0]|       0.0|                   0|
| [5.0,2.0,1.0,83.0]|       0.0|                   0|
|  [6.0,5.0,7.0,5.0]|       0.0|                   0|
| [7.0,1.0,1.0,27.0]|       0.0|                   0|
| [7.0,2.0,3.0,52.0]|       0.0|                   0|
| [7.0,4.0,1.0,27.0]|       0.0|                   0|
| [8.0,1.0,1.0,20.0]|       0.0|                   0|
| [9.0,1.0,2.0,84.0]|       0.0|                   0|
| [9.0,6.0,1.0,54.0]|       0.0|                   0|
|[10.0,1.0,3.0,55.0]|       0.0|                   0|
+-------------------+----------+--------------------+
only showing top 10 rows



In [18]:
evaluator = MulticlassClassificationEvaluator(
labelCol="views_was_is_missing", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Аccuracy = %g" % accuracy)

evaluator_f1 = MulticlassClassificationEvaluator(
labelCol="views_was_is_missing", predictionCol="prediction", metricName="f1")
f1 = evaluator_f1.evaluate(predictions)
print("F1_sore = %g" % f1)

Аccuracy = 0.987486
F1_sore = 0.987302


In [20]:
# Создадим сетку параметров для подбора
param_grid = (ParamGridBuilder()
              .addGrid(gbt.maxDepth, [5, 5])
              .addGrid(gbt.maxIter, [10, 10])
              .build())

# Инициализация кросс-валидации
cross_val = CrossValidator(estimator=pipeline, 
                           estimatorParamMaps=param_grid, 
                           evaluator=evaluator, numFolds=3)

# Обучение модели на обучающей выборке с кросс-валидацией
cv_model = cross_val.fit(training_data)

# Прогнозирование на тестовой выборке
cv_predictions = cv_model.transform(test_data)

# Вывод предсказаний и выбранных признаков
cv_predictions.select("views_was_is_missing", "prediction", *selected_columns[1:]).show()


+--------------------+----------+-----------------------+----------------------+-----------------+---------------------+
|views_was_is_missing|prediction|count_out_links_to_item|count_in_links_to_item|count_item_aliase|en_description_length|
+--------------------+----------+-----------------------+----------------------+-----------------+---------------------+
|                   0|       0.0|                      4|                     1|                1|                   27|
|                   0|       0.0|                      5|                     2|                1|                   83|
|                   0|       0.0|                      6|                     5|                7|                    5|
|                   0|       0.0|                      7|                     1|                1|                   27|
|                   0|       0.0|                      7|                     2|                3|                   52|
|                   0|       0.0

In [23]:
evaluator = MulticlassClassificationEvaluator(
labelCol="views_was_is_missing", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(cv_predictions)
print("Аccuracy = %g" % accuracy)

evaluator_f1 = MulticlassClassificationEvaluator(
labelCol="views_was_is_missing", predictionCol="prediction", metricName="f1")
f1 = evaluator_f1.evaluate(cv_predictions)
print("F1_sore = %g" % f1)

Аccuracy = 0.987486
F1_sore = 0.987302
