In [1]:
import os
import sys

# Point this Python process at the Spark 2 client installation before pyspark is imported.
spark_home = '/usr/hdp/current/spark2-client'

# Arguments handed to spark-submit when the PySpark gateway is launched.
submit_args = (
    "--num-executors 5 --executor-cores 1 --executor-memory 5g --driver-memory 2g "
    "--conf spark.sql.broadcastTimeout=6000 "
    "--conf spark.sql.catalogImplementation=in-memory "
    "pyspark-shell"
)

os.environ.update({
    "SPARK_HOME": spark_home,
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "PYSPARK_SUBMIT_ARGS": submit_args,
})

# Put the bundled py4j archive first and Spark's python package second on the import path.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, HiveContext

# Set the application name and reuse an already running SparkContext if there is one.
conf = SparkConf().set("spark.app.name", "lab03")
sc = SparkContext.getOrCreate(conf)

# HiveContext has been deprecated since Spark 2.0. Its documented replacement is
# SparkSession.builder.enableHiveSupport().getOrCreate(), which attaches to the
# SparkContext created above.
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Last expression of the cell: displays the application id of this session.
sc.applicationId

'application_1667306389915_1680'

In [3]:
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier

def _vector_to_list(v):
    """Convert a vector value (anything exposing ``toArray()``) to a plain Python list."""
    return v.toArray().tolist()


# Column-level UDF: turns a vector column into an array<float> column so that
# individual components can be picked out with getItem().
to_array = F.udf(_vector_to_list, T.ArrayType(T.FloatType()))

In [4]:
# Training interactions: (user_id, item_id) pairs with an integer `purchase` column.
user_train_schema = (
    T.StructType()
    .add("user_id", T.IntegerType())
    .add("item_id", T.IntegerType())
    .add("purchase", T.IntegerType())
)

# The file has a header row; the explicit schema avoids type inference.
df_user_train = spark.read.csv(
    "/labs/slaba03/laba03_train.csv",
    schema=user_train_schema,
    header=True,
)

# Column name / type pairs of the items file, in file order.
# The datetime_* columns are read as plain strings.
_items_fields = [
    ('item_id', T.IntegerType()),
    ('channel_id', T.IntegerType()),
    ('datetime_availability_start', T.StringType()),
    ('datetime_availability_stop', T.StringType()),
    ('datetime_show_start', T.StringType()),
    ('datetime_show_stop', T.StringType()),
    ('content_type', T.IntegerType()),
    ('title', T.StringType()),
    ('year', T.FloatType()),
    ('genres', T.StringType()),
    ('region_id', T.IntegerType()),
]

# Every field is nullable.
items_schema = T.StructType(
    [T.StructField(field_name, field_type, nullable=True) for field_name, field_type in _items_fields]
)

# The items file is tab-separated and has a header row.
df_items = spark.read.csv(
    "/labs/slaba03/laba03_items.csv",
    schema=items_schema,
    header=True,
    sep="\t",
)

# Viewing records: user, item, integer start/end timestamps and a string item type.
views_schema = (
    T.StructType()
    .add("user_id", T.IntegerType())
    .add("item_id", T.IntegerType())
    .add("ts_start", T.IntegerType())
    .add("ts_end", T.IntegerType())
    .add("item_type", T.StringType())
)

# Comma-separated file (reader default) with a header row.
df_views = spark.read.csv(
    "/labs/slaba03/laba03_views_programmes.csv",
    schema=views_schema,
    header=True,
)

# Test pairs to score: the same keys as the training file, without the `purchase` column.
user_test_schema = T.StructType(
    [T.StructField(key_name, T.IntegerType(), True) for key_name in ("user_id", "item_id")]
)

df_user_test = spark.read.csv(
    "/labs/slaba03/laba03_test.csv",
    schema=user_test_schema,
    header=True,
)

### Обучение модели

In [11]:
# Purchase aggregates computed from the training interactions, keyed by user and by item.
# `*_sum` adds up the `purchase` column; `*_cnt` counts its non-null values.
# Each aggregate is cached because it is joined onto both the train and the test frame.
user_purchase_sum = df_user_train.groupBy("user_id").agg(F.sum("purchase").alias("user_purchase_sum")).cache()
user_purchase_cnt = df_user_train.groupBy("user_id").agg(F.count("purchase").alias("user_purchase_cnt")).cache()

item_purchase_sum = df_user_train.groupBy("item_id").agg(F.sum("purchase").alias("item_purchase_sum")).cache()
item_purchase_cnt = df_user_train.groupBy("item_id").agg(F.count("purchase").alias("item_purchase_cnt")).cache()


def add_purchase_features(df):
    """Attach the per-user and per-item purchase aggregates to ``df``.

    Args:
        df: DataFrame containing ``user_id`` and ``item_id`` columns.

    Returns:
        ``df`` left-joined with the four aggregate frames, plus
        ``user_avg_check`` and ``item_avg_check`` (sum divided by count).
        Because the joins are left joins, users or items that do not occur
        in ``df_user_train`` get nulls in the corresponding feature columns.
    """
    return (
        df.join(user_purchase_sum, on="user_id", how="left")
        .join(item_purchase_sum, on="item_id", how="left")
        .join(user_purchase_cnt, on="user_id", how="left")
        # The item count must come from item_purchase_cnt: user_purchase_cnt has no
        # `item_id` column, and `item_purchase_cnt` is needed for item_avg_check below.
        .join(item_purchase_cnt, on="item_id", how="left")
        .withColumn("user_avg_check", F.col("user_purchase_sum") / F.col("user_purchase_cnt"))
        .withColumn("item_avg_check", F.col("item_purchase_sum") / F.col("item_purchase_cnt"))
    )


# NOTE(review): for `train`, the aggregates include each row's own `purchase`
# value, since they are computed from df_user_train itself — confirm this is intended.
train = add_purchase_features(df_user_train)
test = add_purchase_features(df_user_test)

In [12]:
%%time

# Feature columns, in the order they are packed into the `features` vector.
cols = ["user_purchase_sum", "item_purchase_sum", "user_avg_check", "item_avg_check"]
assembler = VectorAssembler(outputCol="features", inputCols=cols)

# Gradient-boosted trees classifier trained on the `purchase` label.
gbt_params = {
    "labelCol": "purchase",
    "maxDepth": 4,
    "minInstancesPerNode": 3,
}
gbtc = GBTClassifier(**gbt_params)

# Two stages: assemble the feature vector, then fit the classifier on it.
stages = [assembler, gbtc]
pipeline = Pipeline(stages=stages)

pipeline_model = pipeline.fit(train)

CPU times: user 25.4 ms, sys: 23.5 ms, total: 48.9 ms
Wall time: 3min 38s


In [7]:
# Persist the fitted pipeline, replacing any model already stored at this path.
# The model lives outside tmp/lab03 on purpose: the prediction CSV is later written
# to tmp/lab03 with mode("overwrite"), which replaces that directory and would
# delete a model saved underneath it.
pipeline_model.write().overwrite().save("tmp/lab03_model/pipeline_model")
# To reuse the stored model instead of refitting:
# pipeline_model = PipelineModel.load("tmp/lab03_model/pipeline_model")

### Предсказание на тестовых данных

In [13]:
# Score the test pairs with the fitted pipeline.
test_predict = pipeline_model.transform(test)

# Preview the first 10 rows, truncating each cell to 30 characters.
preview_cols = ["user_id", "item_id", "features", "rawPrediction", "probability", "prediction"]
test_predict.select(*preview_cols).show(n=10, truncate=30)

+-------+-------+------------------------------+------------------------------+------------------------------+----------+
|user_id|item_id|                      features|                 rawPrediction|                   probability|prediction|
+-------+-------+------------------------------+------------------------------+------------------------------+----------+
|   1654|  94814|[5.0,1.0,0.0019470404984423...|[1.5397214895662545,-1.5397...|[0.956036778708755,0.043963...|       0.0|
|   1654|  93629|[5.0,4.0,0.0019470404984423...|[1.5387234865562902,-1.5387...|[0.9559528092591364,0.04404...|       0.0|
|   1654|   9980|[5.0,1.0,0.0019470404984423...|[1.5397214895662545,-1.5397...|[0.956036778708755,0.043963...|       0.0|
|   1654|  95099|[5.0,1.0,0.0019470404984423...|[1.5397214895662545,-1.5397...|[0.956036778708755,0.043963...|       0.0|
|   1654|  11265|[5.0,6.0,0.0019470404984423...|[1.5365226766791593,-1.5365...|[0.9557670976884263,0.04423...|       0.0|
|   1654|  88896|[5.0,4.

In [14]:
# Second component of the probability vector, used as the `purchase` score.
purchase_score = to_array(F.col("probability")).getItem(1)

# Zero-based running index over all rows ordered by (user_id, item_id);
# it is stored in a column whose name is the empty string.
row_index = F.row_number().over(W.orderBy("user_id", "item_id")) - F.lit(1)

test_target = (
    test_predict
    .withColumn("purchase", purchase_score)
    .withColumn("", row_index)
    .select("", "user_id", "item_id", "purchase")
)

# Write the result as CSV with a header row, replacing whatever is already in tmp/lab03.
(
    test_target
    .coalesce(1)
    .orderBy("user_id", "item_id")
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("tmp/lab03")
)

In [15]:
# Remove any previous local copy, then fetch the CSV part file written to tmp/lab03 from HDFS.
! rm -f lab03.csv
! hdfs dfs -get tmp/lab03/part-* lab03.csv
# Strip every double-quote character from the downloaded file, in place.
! sed -i 's/\"//g' lab03.csv

In [16]:
# Stop the SparkContext created at the top of the notebook.
sc.stop()