In [1]:
# List the lab's input files available on HDFS.
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


In [2]:
import glob
import os
import sys


# Point PySpark at the cluster's Python interpreter and Spark installation,
# and request three executors for the shell that the driver launches.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]
spark_python_dir = os.path.join(spark_home, 'python')

# Discover the py4j archive shipped with this Spark build instead of pinning
# one version; fall back to the previously hard-coded name when nothing is
# found so the behaviour on the original cluster is unchanged.
py4j_candidates = sorted(glob.glob(os.path.join(spark_python_dir, 'lib', 'py4j-*-src.zip')))
if py4j_candidates:
    py4j_zip = py4j_candidates[-1]
else:
    py4j_zip = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')

# Prepend both locations (py4j ends up first, as before). The membership
# check keeps sys.path free of duplicates when the cell is re-run.
for spark_path in (spark_python_dir, py4j_zip):
    if spark_path not in sys.path:
        sys.path.insert(0, spark_path)

In [3]:
from pyspark import SparkConf, SparkContext

# Build a configuration whose application name is "lab2" and start the
# SparkContext from it. SparkConf.set returns the conf, so it can be chained.
conf = SparkConf().set("spark.app.name", "lab2")
sc = SparkContext(conf=conf)

In [4]:
from pyspark.sql import SparkSession

# Obtain a SparkSession configured from the same `conf` object.
# NOTE(review): a SparkContext was already started under the name "lab2", so
# the "belov_app" name requested here likely does not take effect - confirm
# in the Spark UI.
session_builder = SparkSession.builder.config(conf=conf).appName("belov_app")
spark = session_builder.getOrCreate()

In [6]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Every column of the training CSV is read as an integer.
int_columns = ("user_id", "item_id", "purchase")
schema = StructType(fields=[StructField(name, IntegerType()) for name in int_columns])

# Load the training pairs with the explicit schema (the first row is a header)
# and preview a few rows.
train = spark.read.csv('/labs/slaba03/laba03_train.csv', header=True, schema=schema)
train.show(n=5)


+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [7]:
# How many different users appear in the training set.
(
    train
    .agg(countDistinct("user_id"))
    .show()
)

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                   1941|
+-----------------------+



In [8]:
# How many different items appear in the training set.
(
    train
    .agg(countDistinct("item_id"))
    .show()
)

+-----------------------+
|count(DISTINCT item_id)|
+-----------------------+
|                   3704|
+-----------------------+



In [9]:
# Number of partitions backing the training DataFrame.
train_rdd = train.rdd
train_rdd.getNumPartitions()

3

In [10]:
# Three nullable integer columns, assembled field by field.
schema = (
    StructType()
    .add(StructField("user_id", IntegerType()))
    .add(StructField("item_id", IntegerType()))
    .add(StructField("purchase", IntegerType()))
)

# Load the test pairs (the first row is a header) and preview a few rows.
test = spark.read.csv('/labs/slaba03/laba03_test.csv', header=True, schema=schema)
test.show(n=5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
|   1654|  95099|    null|
|   1654|  11265|    null|
+-------+-------+--------+
only showing top 5 rows



In [11]:
# How many different users appear in the test set.
(
    test
    .agg(countDistinct("user_id"))
    .show()
)

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                   1941|
+-----------------------+



In [12]:
# How many different items appear in the test set.
(
    test
    .agg(countDistinct("item_id"))
    .show()
)

+-----------------------+
|count(DISTINCT item_id)|
+-----------------------+
|                   3704|
+-----------------------+



In [13]:
# Number of partitions backing the test DataFrame.
test_rdd = test.rdd
test_rdd.getNumPartitions()

3

In [14]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [15]:
# ROC-AUC evaluator that reads the score from `prediction` and the label from
# `purchase`.
evaluator = BinaryClassificationEvaluator(
    labelCol="purchase",
    rawPredictionCol="prediction",
    metricName="areaUnderROC",
)

In [16]:
als = ALS(maxIter=20, regParam=2.2, rank=6, coldStartStrategy="nan", \
          userCol='user_id', itemCol='item_id', ratingCol='purchase', \
          nonnegative=False, implicitPrefs=True, alpha=5.0, seed=87)
%time als_model = als.fit(train)

CPU times: user 30.6 ms, sys: 465 µs, total: 31.1 ms
Wall time: 28.3 s


In [18]:
# Score the training pairs with the fitted model; the result carries an extra
# `prediction` column next to the original ones.
predict_train = als_model.transform(dataset=train)

In [20]:
# Merge the scored rows into at most 4 partitions, then mark the result for
# caching so later actions can reuse it.
predict_train_compact = predict_train.coalesce(4)
predict_train = predict_train_compact.cache()

In [21]:
# Inspect the column types of the scored training set.
predict_train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- prediction: float (nullable = false)



In [22]:
# Replace the float `prediction` column with a double-typed version.
# NOTE(review): presumably needed by the evaluator - confirm.
prediction_as_double = col("prediction").cast("double")
predict_train = predict_train.withColumn("prediction", prediction_as_double)

In [23]:
# Score the test pairs with the fitted ALS model.
predict_test = als_model.transform(test)

In [24]:
# Merge the scored rows into at most 4 partitions, then mark the result for
# caching so later actions can reuse it.
predict_test_compact = predict_test.coalesce(4)
predict_test = predict_test_compact.cache()

In [25]:
# Build the submission frame: the model score is exposed under the `purchase`
# name and rows are sorted by user, then item.
submission_columns = ['user_id', 'item_id', col('prediction').alias('purchase')]
output = predict_test.select(*submission_columns).orderBy('user_id', 'item_id')
output.show(n=5)

+-------+-------+------------+
|user_id|item_id|    purchase|
+-------+-------+------------+
|   1654|    336|         0.0|
|   1654|    678|         0.0|
|   1654|    691|         0.0|
|   1654|    696|1.7609971E-4|
|   1654|    763|0.0017800244|
+-------+-------+------------+
only showing top 5 rows



In [26]:
# Collapse to a single partition so one CSV part file is produced, then write
# it (with header) to the HDFS output directory, replacing any previous run.
single_partition_output = output.coalesce(1)
single_partition_output.write.csv(
    path='/user/vladimir.belov/lab03',
    mode='overwrite',
    sep=',',
    header=True,
)

In [27]:
# List the user's HDFS home directory to confirm the lab03 output was written.
!hdfs dfs -ls /user/vladimir.belov/

Found 4 items
drwx------   - vladimir.belov vladimir.belov          0 2020-10-04 21:00 /user/vladimir.belov/.Trash
drwxr-xr-x   - vladimir.belov vladimir.belov          0 2021-03-06 19:14 /user/vladimir.belov/.sparkStaging
drwxr-xr-x   - vladimir.belov vladimir.belov          0 2021-03-06 19:24 /user/vladimir.belov/lab03
drwxr-xr-x   - vladimir.belov vladimir.belov          0 2020-10-04 13:08 /user/vladimir.belov/visits


In [29]:
# Copy the exported result directory from HDFS to the local home directory.
# NOTE(review): a re-run probably fails if the local target already exists -
# confirm, and remove the old copy first if so.
!hadoop fs -copyToLocal /user/vladimir.belov/lab03 /data/home/vladimir.belov/lab03

In [28]:
# Show the notebook's current working directory on the local filesystem.
!pwd

/data/home/vladimir.belov


In [31]:
# Shut the Spark application down once the results are exported.
# SparkContext has no close() method (the original call raised
# AttributeError); stop() is the method that terminates the context.
sc.stop()