In [43]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [44]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark.conf.set("spark.sql.crossJoin.enabled", True) # for cartesian product usage

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())

In [45]:
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.functions import udf

In [46]:
# В laba03_train.csv содержатся факты покупки (колонка purchase) пользователями (колонка user_id) телепередач (колонка item_id)

schema_train = StructType() \
    .add("user_id", IntegerType(), True) \
    .add("item_id", IntegerType(), True) \
    .add("purchase", IntegerType(), True) \

df_train = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema_train) \
    .load("/labs/slaba03/laba03_train.csv")

df_train.filter("purchase == 1").show()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|   9897|       1|
|   1654|   7394|       1|
|   1654|   9064|       1|
|   1654|  73216|       1|
|   1654|  88816|       1|
| 510087|  72067|       1|
| 510087|  82767|       1|
| 510087|  89260|       1|
| 510087|  92471|       1|
| 510087|  66473|       1|
| 510087|  74562|       1|
| 517612|  92818|       1|
| 520446|  73216|       1|
| 520446|  78271|       1|
| 520446|  72700|       1|
| 520446|  88773|       1|
| 520446|  88963|       1|
| 520446|   8633|       1|
| 520446|  93500|       1|
| 520446|  11513|       1|
+-------+-------+--------+
only showing top 20 rows



In [47]:
# laba03_test.csv — тестовый датасет без указанного целевого признака purchase, который вам и предстоит предсказать.

schema_test = StructType() \
    .add("user_id", IntegerType(), True) \
    .add("item_id", IntegerType(), True) \


df_test = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema_test) \
    .load("/labs/slaba03/laba03_test.csv")

df_test.show()

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
|   1654|  88896|
|   1654|  67740|
|   1654|  74271|
|   1654|  99871|
|   1654|  78570|
|   1654|  71942|
|   1654|  74367|
|   1654|  98628|
|   1654|  95887|
|   1654|  77795|
|   1654|  75152|
|   1654|  74905|
|   1654|   9068|
|   1654|  72954|
|   1654| 102431|
+-------+-------+
only showing top 20 rows



In [48]:
# laba03_items.csv — дополнительные данные по items.
# В данном файле много лишней или ненужной информации, так что задача её фильтрации и отбора ложится на вас.

schema_items = StructType(fields = [StructField('item_id', IntegerType()),
    StructField('channel_id', IntegerType()),
    StructField('datetime_availability_start', StringType()),                                    
    StructField('datetime_availability_stop', StringType()), 
    StructField('datetime_show_start', StringType()),
    StructField('datetime_show_stop', StringType()), 
    StructField('content_type', IntegerType()),                                   
    StructField('title', StringType(), nullable=True),  
    StructField('year', FloatType(), nullable=True), 
    StructField('genres', StringType()), 
    StructField('region_id', IntegerType()),                    
    ]) 

df_items = spark.read.format("csv") \
    .option("header", True) \
    .option("sep", "\t") \
    .schema(schema_items) \
    .load("/labs/slaba03/laba03_items.csv")

df_items.show()

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+------+--------------------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|  year|              genres|region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+------+--------------------+---------+
|  65667|      null|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|на пробах только ...|2013.0|             Эротика|     null|
|  65669|      null|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|скуби ду: эротиче...|2011.0|             Эротика|     null|
|  65668|      null|       1970-01-01T00:00:00Z|      2018-01-01T

In [49]:
# Дополнительный файл laba03_views_programmes.csv по просмотрам передач с полями:

schema_views = StructType(fields = [StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('ts_start', IntegerType()),
    StructField('ts_end', IntegerType()),
    StructField('item_type', StringType()),
    ]) 

df_views_programmes = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema_views) \
    .load("/labs/slaba03/laba03_views_programmes.csv")

df_views_programmes.show()

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
|   1654|7489015|1493434801|1493435401|     live|
|   1654|7489023|1493444101|1493445601|     live|
|   1654|6617053|1489186156|1489200834|     live|
|   1654|6438693|1487840070|1487840433|     live|
|   1654|6526859|1488705452|1488706154|     live|
|   1654|6526754|1488532396|1488532895|      pvr|
|   1654|6239098|1486732011|1486732410|     live|
|   1654|6438763|1488305761|1488307286|      pvr|
|   1654|7489013|1493433301|1493434201|     live|
|   1654|6317094|1486829784|1486830389|     live|
|   1654|6799393|1490172025|1490173391|      pvr|
|   1654|6616978|1488962050|1488962874|      pvr|


In [50]:
users = df_train.groupBy('user_id').mean('purchase').coalesce(10).cache()
items = df_train.groupBy('item_id').mean('purchase').coalesce(10)

test = df_test.join(users, on = 'user_id', how = 'outer').coalesce(10).withColumnRenamed('avg(purchase)', 'average_user').cache()
test = test.join(items, on = 'item_id', how = 'outer').coalesce(10).withColumnRenamed('avg(purchase)', 'average_item').cache()

test.show()

+-------+-------+--------------------+--------------------+
|item_id|user_id|        average_user|        average_item|
+-------+-------+--------------------+--------------------+
|   8389| 900847|3.861003861003861E-4|0.005979073243647235|
|   8389| 826792|7.766990291262136E-4|0.005979073243647235|
|   8389| 882475|0.001520912547528517|0.005979073243647235|
|   8389| 775974|3.835826620636747E-4|0.005979073243647235|
|   8389| 848670|7.930214115781126E-4|0.005979073243647235|
|   8389| 836300|0.005096040768326146|0.005979073243647235|
|   8389| 854154|0.001913509376195...|0.005979073243647235|
|   8389| 895156|3.888024883359253...|0.005979073243647235|
|   8389| 865812|3.866976024748647E-4|0.005979073243647235|
|   8389| 897151|3.846153846153846E-4|0.005979073243647235|
|   8389| 908190|0.009284332688588007|0.005979073243647235|
|   8389| 659698|3.872966692486445E-4|0.005979073243647235|
|   8389| 792601|0.003431185665268776|0.005979073243647235|
|   8389| 841174|3.850596842510589E-4|0.

In [51]:
w1 = Window.partitionBy('user_id')
w2 = Window.partitionBy('item_id')

df_user_mean = df_train.withColumn('average_user', F.avg('purchase').over(w1)) \
                        .withColumn('average_item', F.avg('purchase').over(w2)).coalesce(10).cache()

df_user_mean.show()

+-------+-------+--------+--------------------+--------------------+
|user_id|item_id|purchase|        average_user|        average_item|
+-------+-------+--------+--------------------+--------------------+
| 793876|   8389|       0|0.001940240589833...|0.005979073243647235|
| 795620|   8389|       0|0.004243827160493827|0.005979073243647235|
| 851848|   8389|       0|3.888024883359253...|0.005979073243647235|
| 880451|   8389|       0|0.009220130618517095|0.005979073243647235|
| 900203|   8389|       0|0.003436426116838488|0.005979073243647235|
| 746713|   8389|       0|                 0.0|0.005979073243647235|
| 883098|   8389|       0|0.001948558067030...|0.005979073243647235|
| 903491|   8389|       0|0.001161440185830...|0.005979073243647235|
| 903826|   8389|       0|0.001544401544401...|0.005979073243647235|
| 916566|   8389|       0|3.840245775729646...|0.005979073243647235|
| 899224|   8389|       0|  7.8003120124805E-4|0.005979073243647235|
| 932980|   8389|       0|3.858024

In [52]:
# обучение модели и получение предсказаний

assembler = VectorAssembler(inputCols = ['average_user', 'average_item'], outputCol = 'features')
gbt = GBTClassifier(featuresCol = 'features', labelCol = 'purchase')

pipeline = Pipeline(stages=[
    assembler,
    gbt
])
pipeline_model = pipeline.fit(df_user_mean)
predictions = pipeline_model.transform(test)

predictions.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|item_id|user_id|        average_user|        average_item|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   8389| 900847|3.861003861003861E-4|0.005979073243647235|[3.86100386100386...|[1.53716901899170...|[0.95582171549311...|       0.0|
|   8389| 826792|7.766990291262136E-4|0.005979073243647235|[7.76699029126213...|[1.52826641761444...|[0.95506373119086...|       0.0|
|   8389| 882475|0.001520912547528517|0.005979073243647235|[0.00152091254752...|[1.51860832608846...|[0.95422741324183...|       0.0|
|   8389| 775974|3.835826620636747E-4|0.005979073243647235|[3.83582662063674...|[1.53716901899170...|[0.95582171549311...|       0.0|
|   8389| 848670|7.930214115781126E-4|0.005979073243647235|[7.

In [55]:
firstelement = udf(lambda v : float(v[1]), FloatType())
result = predictions.select('user_id', 'item_id', firstelement('probability').alias('purchase')).coalesce(1).cache()
result.show()

+-------+-------+-----------+
|user_id|item_id|   purchase|
+-------+-------+-----------+
| 900847|   8389|0.044178285|
| 826792|   8389| 0.04493627|
| 882475|   8389|0.045772586|
| 775974|   8389|0.044178285|
| 848670|   8389|0.045682915|
| 836300|   8389| 0.05301952|
| 854154|   8389|0.046274286|
| 895156|   8389|0.044178285|
| 865812|   8389|0.044178285|
| 897151|   8389|0.044178285|
| 908190|   8389| 0.05867206|
| 659698|   8389|0.044178285|
| 792601|   8389|0.049174704|
| 841174|   8389|0.044178285|
| 863735|   8389|0.044178285|
| 871442|   8389| 0.05867206|
| 872918|   8389|0.044178285|
| 911803|   8389|0.049174704|
| 915724|   8389| 0.08146937|
| 939341|   8389|0.044178285|
+-------+-------+-----------+
only showing top 20 rows



In [56]:
result_ordered = result.orderBy('user_id', 'item_id')
result_ordered.show()

+-------+-------+-----------+
|user_id|item_id|   purchase|
+-------+-------+-----------+
|   1654|    336| 0.04395757|
|   1654|    678| 0.04395757|
|   1654|    691| 0.04395757|
|   1654|    696|0.043966576|
|   1654|    763| 0.04395757|
|   1654|    795|0.046492074|
|   1654|    861| 0.04395757|
|   1654|   1137| 0.04424883|
|   1654|   1159|0.043966576|
|   1654|   1428| 0.04395757|
|   1654|   1685|0.043966576|
|   1654|   1686| 0.04395757|
|   1654|   1704|0.043966576|
|   1654|   2093| 0.04395757|
|   1654|   2343| 0.04395757|
|   1654|   2451| 0.04395757|
|   1654|   2469| 0.04723936|
|   1654|   2603| 0.04395757|
|   1654|   2609| 0.04395757|
|   1654|   2621|0.043966576|
+-------+-------+-----------+
only showing top 20 rows



In [48]:
result_ordered.repartition(1).write.option('header', True).csv('user/mikhail.galperin/lab03.csv', mode = 'overwrite')
!hdfs dfs -get /user/mikhail.galperin/lab03

In [57]:
spark.stop()