In [162]:
import os
import sys
import json

In [163]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [164]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "Groo-IA") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [165]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, FloatType
from pyspark.sql.functions import col, array_contains

In [166]:
schema = StructType() \
    .add("user_id", IntegerType(), True) \
    .add("item_id", IntegerType(), True) \
    .add("purchase", IntegerType(), True) \

df_user = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema) \
    .load("/labs/slaba03/laba03_train.csv")


In [167]:
df_user.filter("purchase == 1").show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|   9897|       1|
|   1654|   7394|       1|
|   1654|   9064|       1|
|   1654|  73216|       1|
|   1654|  88816|       1|
+-------+-------+--------+
only showing top 5 rows



In [11]:
schema_test = StructType() \
    .add("user_id", IntegerType(), True) \
    .add("item_id", IntegerType(), True) \


df_user_test = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema_test) \
    .load("/labs/slaba03/laba03_test.csv")


In [13]:
df_user_test.show(5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



In [17]:
items_schema = StructType(fields = [StructField('item_id', IntegerType()),
StructField('channel_id', IntegerType()),
StructField('datetime_availability_start', StringType()),                                    
StructField('datetime_availability_stop', StringType()), 
StructField('datetime_show_start', StringType()),
StructField('datetime_show_stop', StringType()), 
StructField('content_type', IntegerType()),                                   
StructField('title', StringType(), nullable=True),  
StructField('year', FloatType(), nullable=True), 
StructField('genres', StringType()), 
StructField('region_id', IntegerType()),                    
]) 



df_items = spark.read.format("csv") \
    .option("header", True) \
    .option("sep", "\t") \
    .schema(items_schema) \
    .load("/labs/slaba03/laba03_items.csv")


In [19]:
df_items.show(2,False,False)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------+------+-------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|title                                                   |year  |genres |region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------+------+-------+---------+
|65667  |null      |1970-01-01T00:00:00Z       |2018-01-01T00:00:00Z      |null               |null              |1           |на пробах только девушки (all girl auditions)           |2013.0|Эротика|null     |
|65669  |null      |1970-01-01T00:00:00Z       |2018-01-01T00:00:00Z      |null               |null              |1           |скуби ду: эротическая пародия (sc

In [23]:
views_schema = StructType(fields = [StructField('user_id', IntegerType()),
StructField('item_id', IntegerType()),
StructField('ts_start', IntegerType()),
StructField('ts_end', IntegerType()),
StructField('item_type', StringType()),
]) 



df_views_programmes = spark.read.format("csv") \
    .option("header", True) \
    .schema(views_schema) \
    .load("/labs/slaba03/laba03_views_programmes.csv")


In [27]:
df_views_programmes.count()

20845607

In [None]:
df_items.select("")

In [88]:
df_user_u = df_user.groupBy("user_id").mean("purchase").coalesce(10).cache()

In [92]:
df_user_i = df_user.groupBy("item_id").mean("purchase").coalesce(10)

In [90]:
test = df_user_test.join(df_user_u, on = "user_id", how = "outer").coalesce(10).withColumnRenamed("avg(purchase)", "avg_u").cache()

In [93]:
test = test.join(df_user_i, on = "item_id", how = "outer").coalesce(10).withColumnRenamed("avg(purchase)", "avg_i").cache()

In [95]:
test.show(5)

+-------+-------+--------------------+--------------------+
|item_id|user_id|               avg_u|               avg_i|
+-------+-------+--------------------+--------------------+
|   8389| 761341|3.875968992248062E-4|0.005979073243647235|
|   8389| 776188|0.001152516327314637|0.005979073243647235|
|   8389| 846231|0.001923816852635629|0.005979073243647235|
|   8389| 822709|3.789314134141720...|0.005979073243647235|
|   8389| 824008|3.821169277799006...|0.005979073243647235|
+-------+-------+--------------------+--------------------+
only showing top 5 rows



In [45]:
from pyspark.sql.window import Window
from pyspark.sql import functions as f

In [49]:
w = Window.partitionBy('user_id')
ww = Window.partitionBy('item_id')

In [60]:
df_user_mean = df_user.withColumn("avg_u", f.avg("purchase").over(w)).withColumn("avg_i", f.avg("purchase").over(ww)).coalesce(10).cache()

In [None]:
df_user_test

In [61]:
df_user_mean.show()

+-------+-------+--------+--------------------+--------------------+
|user_id|item_id|purchase|               avg_u|               avg_i|
+-------+-------+--------+--------------------+--------------------+
| 754230|   8389|       0|0.027575641516660282|0.005979073243647235|
| 780033|   8389|       0|7.757951900698216E-4|0.005979073243647235|
| 798454|   8389|       0|3.840245775729646...|0.005979073243647235|
| 825061|   8389|       0|0.001931247585940...|0.005979073243647235|
| 833685|   8389|       0|0.007500986971969996|0.005979073243647235|
| 851486|   8389|       0|                 0.0|0.005979073243647235|
| 867850|   8389|       0|3.829950210647261...|0.005979073243647235|
| 870928|   8389|       0|7.674597083653108E-4|0.005979073243647235|
| 879401|   8389|       0|0.004283489096573208|0.005979073243647235|
| 901457|   8389|       0|                 0.0|0.005979073243647235|
| 927211|   8389|       0|3.916960438699569E-4|0.005979073243647235|
| 928140|   8389|       0|3.869969

In [None]:
log_with_regions.rdd.getNumPartitions()

In [81]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(featuresCol="features", labelCol="purchase")

In [56]:
from pyspark.ml import Pipeline

In [59]:
from pyspark.ml.feature import VectorAssembler

In [69]:
assembler = VectorAssembler(inputCols=["avg_u", "avg_i"], outputCol="features")

In [82]:
pipeline = Pipeline(stages=[
    assembler,
    gbt
])

In [83]:
pipeline_model = pipeline.fit(df_user_mean)

In [96]:
predictions = pipeline_model.transform(test)

In [97]:
predictions.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|item_id|user_id|               avg_u|               avg_i|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   8389| 761341|3.875968992248062E-4|0.005979073243647235|[3.87596899224806...|[1.53725804902778...|[0.95582923374786...|       0.0|
|   8389| 776188|0.001152516327314637|0.005979073243647235|[0.00115251632731...|[1.51978974130108...|[0.95433050497383...|       0.0|
|   8389| 846231|0.001923816852635629|0.005979073243647235|[0.00192381685263...|[1.50967598955631...|[0.95344076733642...|       0.0|
|   8389| 822709|3.789314134141720...|0.005979073243647235|[3.78931413414172...|[1.53725804902778...|[0.95582923374786...|       0.0|
|   8389| 824008|3.821169277799006...|0.005979073243647235|[3.

In [105]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, udf

def ith_(v, i):
    try:
        return float(v[i])
    except ValueError:
        return None

ith = udf(ith_, DoubleType())

In [117]:
firstelement=udf(lambda v:float(v[1]),FloatType())

In [116]:
 predictions.select("item_id", "user_id", "probability").show(5)

+-------+-------+--------------------+
|item_id|user_id|         probability|
+-------+-------+--------------------+
|   8389| 761341|[0.95582923374786...|
|   8389| 776188|[0.95433050497383...|
|   8389| 846231|[0.95344076733642...|
|   8389| 822709|[0.95582923374786...|
|   8389| 824008|[0.95582923374786...|
+-------+-------+--------------------+
only showing top 5 rows



In [153]:
res = predictions.select("user_id", "item_id", firstelement("probability").alias("purchase")).coalesce(1).cache()

In [154]:
res.show(5)

+-------+-------+-----------+
|user_id|item_id|   purchase|
+-------+-------+-----------+
| 761341|   8389|0.044170767|
| 776188|   8389|0.045669496|
| 846231|   8389|0.046559233|
| 822709|   8389|0.044170767|
| 824008|   8389|0.044170767|
+-------+-------+-----------+
only showing top 5 rows



In [157]:
res1 = res.orderBy("user_id", "item_id")
res1.show(4)

+-------+-------+-----------+
|user_id|item_id|   purchase|
+-------+-------+-----------+
|   1654|    336| 0.04389559|
|   1654|    678| 0.04389559|
|   1654|    691| 0.04389559|
|   1654|    696|0.044106178|
+-------+-------+-----------+
only showing top 4 rows



In [158]:
res1.repartition(1).write.option("header",True).csv('/user/ivan.groo/lab03', mode="overwrite")

In [159]:
!hdfs dfs -get /user/ivan.groo/lab03

In [160]:
!mv lab03/part-00000-7edf5573-a848-4261-9180-0a18e379f115-c000.csv lab03.csv

In [104]:
predictions.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- avg_u: double (nullable = true)
 |-- avg_i: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [168]:
spark.stop()