In [1]:
import os
import sys
import json

In [2]:
import os
import sys
# Tell PySpark which interpreter and Spark install to use; PYSPARK_SUBMIT_ARGS
# carries the executor/driver resource flags.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's bundled Python sources to the import path: the py4j archive
# ends up first, immediately followed by the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Give the Spark application a name, then reuse an existing session or start one.
conf = SparkConf().set("spark.app.name", "Markova-E")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, FloatType
from pyspark.sql.functions import col, array_contains

In [5]:
# Training interactions: (user_id, item_id) pairs with an integer purchase label.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", IntegerType(), True),
])

# Read the training CSV with an explicit schema instead of inferring types.
df_user = (
    spark.read.format("csv")
    .options(header=True)
    .schema(schema)
    .load("/labs/slaba03/laba03_train.csv")
)


In [6]:
# Peek at a few positive (purchase == 1) training rows.
df_user.where(col("purchase") == 1).show(n=5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|   9897|       1|
|   1654|   7394|       1|
|   1654|   9064|       1|
|   1654|  73216|       1|
|   1654|  88816|       1|
+-------+-------+--------+
only showing top 5 rows



In [7]:
# Test pairs to score: same keys as the training file, without the purchase label.
schema_test = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
])

df_user_test = (
    spark.read.format("csv")
    .options(header=True)
    .schema(schema_test)
    .load("/labs/slaba03/laba03_test.csv")
)


In [8]:
# Preview the first test pairs.
df_user_test.show(n=5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



In [9]:
# Column name -> Spark type for the item catalogue, in file order.
# The datetime columns are kept as raw strings; every field is nullable.
item_columns = [
    ('item_id', IntegerType()),
    ('channel_id', IntegerType()),
    ('datetime_availability_start', StringType()),
    ('datetime_availability_stop', StringType()),
    ('datetime_show_start', StringType()),
    ('datetime_show_stop', StringType()),
    ('content_type', IntegerType()),
    ('title', StringType()),
    ('year', FloatType()),
    ('genres', StringType()),
    ('region_id', IntegerType()),
]
items_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in item_columns]
)

# The item catalogue is tab-separated, unlike the other input files.
df_items = (
    spark.read.format("csv")
    .options(header=True, sep="\t")
    .schema(items_schema)
    .load("/labs/slaba03/laba03_items.csv")
)


In [10]:
# Show two catalogue rows without truncating long cell values.
df_items.show(n=2, truncate=False, vertical=False)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------+------+-------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|title                                                   |year  |genres |region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------+------+-------+---------+
|65667  |null      |1970-01-01T00:00:00Z       |2018-01-01T00:00:00Z      |null               |null              |1           |на пробах только девушки (all girl auditions)           |2013.0|Эротика|null     |
|65669  |null      |1970-01-01T00:00:00Z       |2018-01-01T00:00:00Z      |null               |null              |1           |скуби ду: эротическая пародия (sc

In [11]:
# Column name -> Spark type for the programme-view log, in file order.
view_columns = [
    ('user_id', IntegerType()),
    ('item_id', IntegerType()),
    ('ts_start', IntegerType()),
    ('ts_end', IntegerType()),
    ('item_type', StringType()),
]
views_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in view_columns]
)

df_views_programmes = (
    spark.read.format("csv")
    .options(header=True)
    .schema(views_schema)
    .load("/labs/slaba03/laba03_views_programmes.csv")
)


In [12]:
# Count the rows of the programme-view log.
df_views_programmes.count()

20845607

In [14]:
# Per-user mean of the purchase label; the aggregate column is named
# "avg(purchase)". Cached because it is reused by the test-set join.
df_user_u = (
    df_user
    .groupBy("user_id")
    .avg("purchase")
    .coalesce(10)
    .cache()
)

In [15]:
# Per-item mean of the purchase label; the aggregate column is named "avg(purchase)".
df_user_i = (
    df_user
    .groupBy("item_id")
    .avg("purchase")
    .coalesce(10)
)

In [16]:
# Attach each user's mean purchase rate (avg_u) to the test pairs.
# A LEFT join keeps exactly the rows of the test file. The previous full outer
# join also emitted a row with a null item_id for every training user that is
# absent from the test file, and those rows would flow into the predictions.
# NOTE: test users with no training history get a null avg_u here.
test = (
    df_user_test
    .join(df_user_u, on="user_id", how="left")
    .coalesce(10)
    .withColumnRenamed("avg(purchase)", "avg_u")
    .cache()
)

In [17]:
# Attach each item's mean purchase rate (avg_i) to the test pairs.
# A LEFT join keeps exactly the rows already in `test`. The previous full outer
# join also emitted a row with a null user_id for every training item that is
# absent from the test file.
# NOTE: test items with no training history get a null avg_i here.
test = (
    test
    .join(df_user_i, on="item_id", how="left")
    .coalesce(10)
    .withColumnRenamed("avg(purchase)", "avg_i")
    .cache()
)

In [18]:
# Preview the test pairs with both averaged features attached.
test.show(n=5)

+-------+-------+--------------------+--------------------+
|item_id|user_id|               avg_u|               avg_i|
+-------+-------+--------------------+--------------------+
|   8389| 642397|3.822629969418960...|0.005979073243647235|
|   8389| 852680|3.858024691358024...|0.005979073243647235|
|   8389| 929653|                 0.0|0.005979073243647235|
|   8389| 932999|3.898635477582846E-4|0.005979073243647235|
|   8389| 871154|                 0.0|0.005979073243647235|
+-------+-------+--------------------+--------------------+
only showing top 5 rows



In [19]:
from pyspark.sql.window import Window
from pyspark.sql import functions as f

In [20]:
# Unordered window specs: `w` groups rows by user, `ww` groups rows by item.
w = Window.partitionBy(f.col('user_id'))
ww = Window.partitionBy(f.col('item_id'))

In [21]:
# Training frame with the same two features as the test frame:
# avg_u = mean purchase over the row's user, avg_i = mean purchase over the row's item.
# Note: each row's own purchase label is part of both averages.
purchase_rate = f.avg("purchase")
df_user_mean = (
    df_user
    .withColumn("avg_u", purchase_rate.over(w))
    .withColumn("avg_i", purchase_rate.over(ww))
    .coalesce(10)
    .cache()
)

In [22]:
# Display the test DataFrame's repr (column names and types only).
df_user_test

DataFrame[user_id: int, item_id: int]

In [23]:
# Preview the training frame with the windowed averages (default 20 rows, truncated cells).
df_user_mean.show(n=20, truncate=True)

+-------+-------+--------+--------------------+--------------------+
|user_id|item_id|purchase|               avg_u|               avg_i|
+-------+-------+--------+--------------------+--------------------+
| 797350|   8389|       0|                 0.0|0.005979073243647235|
| 848246|   8389|       0|0.005838847800700...|0.005979073243647235|
| 902359|   8389|       0|0.002337358784573432|0.005979073243647235|
| 868695|   8389|       0|3.894080996884735E-4|0.005979073243647235|
| 905722|   8389|       0|3.859513701273639...|0.005979073243647235|
| 915444|   8389|       0|3.849114703618167...|0.005979073243647235|
| 921046|   8389|       0|0.002736512900703675|0.005979073243647235|
| 926143|   8389|       0|3.843197540353574E-4|0.005979073243647235|
| 776138|   8389|       0|                0.01|0.005979073243647235|
| 833838|   8389|       0|0.005740528128587...|0.005979073243647235|
| 865948|   8389|       0|3.821169277799006...|0.005979073243647235|
| 868207|   8389|       0|3.927729

In [26]:
from pyspark.ml.classification import GBTClassifier
# Gradient-boosted tree classifier: reads the assembled "features" vector and
# learns the "purchase" label; every other hyperparameter is left at its default.
gbt = GBTClassifier(
    labelCol="purchase",
    featuresCol="features",
)

In [27]:
from pyspark.ml import Pipeline

In [28]:
from pyspark.ml.feature import VectorAssembler

In [29]:
# Pack the two averaged purchase rates into the single vector column the classifier reads.
feature_columns = ["avg_u", "avg_i"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

In [30]:
# Two-stage pipeline: assemble the feature vector, then fit/apply the GBT classifier.
pipeline_stages = [assembler, gbt]
pipeline = Pipeline(stages=pipeline_stages)

In [31]:
# Fit the assembler + GBT pipeline on the training frame with windowed averages.
pipeline_model = pipeline.fit(dataset=df_user_mean)

In [32]:
# Score the test pairs; adds the features, rawPrediction, probability and prediction columns.
predictions = pipeline_model.transform(dataset=test)

In [33]:
# Preview the scored test rows (default 20 rows, truncated cells).
predictions.show(n=20, truncate=True)

+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|item_id|user_id|               avg_u|               avg_i|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   8389| 642397|3.822629969418960...|0.005979073243647235|[3.82262996941896...|[1.53665480755936...|[0.95577826836632...|       0.0|
|   8389| 852680|3.858024691358024...|0.005979073243647235|[3.85802469135802...|[1.53665480755936...|[0.95577826836632...|       0.0|
|   8389| 929653|                 0.0|0.005979073243647235|[0.0,0.0059790732...|[1.53665480755936...|[0.95577826836632...|       0.0|
|   8389| 932999|3.898635477582846E-4|0.005979073243647235|[3.89863547758284...|[1.53665480755936...|[0.95577826836632...|       0.0|
|   8389| 871154|                 0.0|0.005979073243647235|[0.

In [34]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, udf

def ith_(v, i):
    """Return element ``i`` of ``v`` as a float, or None when it cannot be read.

    Parameters
    ----------
    v : indexable sequence or vector; may be None.
    i : int
        Position of the element to extract.

    Returns
    -------
    float or None
        ``float(v[i])``, or None if ``v`` is None, ``i`` is out of range, or
        the element cannot be converted to a float.
    """
    try:
        return float(v[i])
    except (IndexError, TypeError, ValueError):
        # IndexError: position out of range.
        # TypeError: v is None / not indexable, or the element is not numeric.
        # ValueError: the element is a string that does not parse as a number.
        # The original caught only ValueError, so the first two cases escaped
        # the None fallback and raised instead.
        return None

# Column-level wrapper around ith_: extracts one vector element as a double.
ith = udf(f=ith_, returnType=DoubleType())

In [35]:
# Despite the name, this reads index 1 of the vector (the second entry) and
# returns it as a 32-bit float.
firstelement = udf(f=lambda vec: float(vec[1]), returnType=FloatType())

In [36]:
 predictions.select(["item_id", "user_id", "probability"]).show(n=5)

+-------+-------+--------------------+
|item_id|user_id|         probability|
+-------+-------+--------------------+
|   8389| 642397|[0.95577826836632...|
|   8389| 852680|[0.95577826836632...|
|   8389| 929653|[0.95577826836632...|
|   8389| 932999|[0.95577826836632...|
|   8389| 871154|[0.95577826836632...|
+-------+-------+--------------------+
only showing top 5 rows



In [37]:
# Keep the keys plus the index-1 entry of the probability vector, exposed as
# "purchase"; collapse to a single partition and cache for the steps that follow.
purchase_probability = firstelement("probability").alias("purchase")
res = (
    predictions
    .select("user_id", "item_id", purchase_probability)
    .coalesce(1)
    .cache()
)

In [38]:
# Preview the extracted purchase probabilities.
res.show(n=5)

+-------+-------+-----------+
|user_id|item_id|   purchase|
+-------+-------+-----------+
| 642397|   8389|0.044221733|
| 852680|   8389|0.044221733|
| 929653|   8389|0.044221733|
| 932999|   8389|0.044221733|
| 871154|   8389|0.044221733|
+-------+-------+-----------+
only showing top 5 rows



In [39]:
# Sort the scored pairs by user and then by item, ascending.
res1 = res.sort("user_id", "item_id")
res1.show(n=4)

+-------+-------+-----------+
|user_id|item_id|   purchase|
+-------+-------+-----------+
|   1654|    336|0.043943133|
|   1654|    678|0.043943133|
|   1654|    691|0.043943133|
|   1654|    696| 0.04407389|
+-------+-------+-----------+
only showing top 4 rows



In [47]:
# Collect the ordered predictions to the driver as a pandas DataFrame.
t=res1.toPandas()

In [9]:
# Shell escape: fetch lab05.csv from HDFS into the local working directory.
# NOTE(review): no visible cell writes lab05.csv; confirm where it is produced.
!hdfs dfs -get lab05.csv