In [1]:
import pandas as pd

In [2]:
import os
import sys
# Bootstrap PySpark inside the notebook kernel.
# PYSPARK_PYTHON / SPARK_HOME / PYSPARK_SUBMIT_ARGS must be set before the
# Spark shell script below is executed, since it reads them from the environment.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute Spark's interactive shell bootstrap in this namespace (the captured
# output reports that it makes a SparkSession available as `spark`).
# The script is read through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Build (or reuse, via getOrCreate) the SparkSession for this lab from a
# default SparkConf, registered under the application name "third_lab".
conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).appName("third_lab").getOrCreate()

In [4]:
# Display the SparkSession object as the cell result.
spark

In [5]:
# List the lab's input files in HDFS (shell escape).
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [6]:
# Item catalogue: CSV with a header row, tab-separated.
df_items = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", "\t")
    .load("/labs/slaba03/laba03_items.csv")
)
# Programme views: CSV with a header row, default separator.
df_program = (
    spark.read.format("csv")
    .option("header", "true")
    .load("/labs/slaba03/laba03_views_programmes.csv")
)

In [7]:
# Shared schema for the train/test CSVs: three integer columns.
schema = StructType([
    StructField(column_name, IntegerType())
    for column_name in ("user_id", "item_id", "purchase")
])

# Training interactions, read with the explicit schema above.
df_train = spark.read.csv(
    '/labs/slaba03/laba03_train.csv',
    header=True,
    schema=schema,
)

In [8]:
# Test pairs to score, read with the same schema as the training data.
df_test = spark.read.csv(
    '/labs/slaba03/laba03_test.csv',
    header=True,
    schema=schema,
)

In [9]:
# Preview the first 5 test rows.
df_test.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
|   1654|  95099|    null|
|   1654|  11265|    null|
+-------+-------+--------+
only showing top 5 rows



In [27]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

In [11]:
# ROC AUC evaluator: scores the "prediction" column against the "purchase" label.
evaluator = BinaryClassificationEvaluator(
    labelCol="purchase",
    rawPredictionCol="prediction",
    metricName="areaUnderROC",
)

In [13]:
# Implicit-feedback ALS on (user_id, item_id, purchase); seeded for repeatability.
# coldStartStrategy="nan" keeps rows for unseen users/items with a NaN prediction.
als = ALS(
    userCol='user_id',
    itemCol='item_id',
    ratingCol='purchase',
    implicitPrefs=True,
    alpha=5.0,
    rank=5,
    maxIter=15,
    regParam=2,
    nonnegative=False,
    coldStartStrategy="nan",
    seed=55,
)
als_model = als.fit(df_train)

In [14]:
# Score the training pairs with the fitted model (adds a `prediction` column).
train_pred = als_model.transform(df_train)

In [15]:
# Coalesce to 4 partitions and mark for caching; note this rebinds the same
# name, so re-running the cell wraps the already-cached frame again.
train_pred = train_pred.coalesce(4).cache()

In [18]:
# Preview the first 5 scored training rows.
train_pred.show(5)

+-------+-------+--------+------------+
|user_id|item_id|purchase|  prediction|
+-------+-------+--------+------------+
| 754230|   8389|       0|  0.06577225|
| 780033|   8389|       0| 6.845501E-4|
| 798454|   8389|       0| 3.175918E-4|
| 825061|   8389|       0|-0.027836053|
| 833685|   8389|       0|  0.14098734|
+-------+-------+--------+------------+
only showing top 5 rows



In [19]:
from pyspark.sql.types import DoubleType
# Cast the prediction column to double before evaluation
# (presumably because the evaluator does not accept a float column — confirm).
changedTypedf = train_pred.withColumn(
    "prediction",
    F.col("prediction").cast(DoubleType()),
)

In [21]:
# ROC AUC on the training predictions. These are the same rows the model was
# fit on, so this is not a held-out estimate.
train_rocauc = evaluator.evaluate(changedTypedf)
print(train_rocauc)

0.9649886285314325


In [23]:
# Score the test pairs with the fitted model and preview 5 rows.
test_pred = als_model.transform(df_test)
test_pred.show(5)

+-------+-------+--------+-------------+
|user_id|item_id|purchase|   prediction|
+-------+-------+--------+-------------+
| 761341|   8389|    null| 0.0013493458|
| 776188|   8389|    null|  0.003965745|
| 846231|   8389|    null| -0.028875731|
| 822709|   8389|    null|2.5825293E-14|
| 824008|   8389|    null|  -5.79391E-4|
+-------+-------+--------+-------------+
only showing top 5 rows



In [24]:
# Coalesce to 4 partitions and mark for caching; rebinds the same name.
test_pred = test_pred.coalesce(4).cache()

In [25]:
# Preview the first 3 scored test rows.
test_pred.show(3)

+-------+-------+--------+------------+
|user_id|item_id|purchase|  prediction|
+-------+-------+--------+------------+
| 761341|   8389|    null|0.0013493458|
| 776188|   8389|    null| 0.003965745|
| 846231|   8389|    null|-0.028875731|
+-------+-------+--------+------------+
only showing top 3 rows



In [28]:
# Build the submission frame: the ALS score is exposed under the `purchase`
# column name and rows are sorted by (user_id, item_id).
final_df = (
    test_pred
    .select('user_id', 'item_id', col('prediction').alias('purchase'))
    # The model was fit with coldStartStrategy="nan", so pairs with a user or
    # item unseen in training get a NaN score. Replace those (and any nulls)
    # with 0.0 so the exported CSV has no empty score cells.
    .fillna(0.0, subset=['purchase'])
    .orderBy(['user_id', 'item_id'])
)
final_df.show(5)

+-------+-------+------------+
|user_id|item_id|    purchase|
+-------+-------+------------+
|   1654|    336|         0.0|
|   1654|    678|         0.0|
|   1654|    691|         0.0|
|   1654|    696|-7.420811E-4|
|   1654|    763|0.0013899496|
+-------+-------+------------+
only showing top 5 rows



In [29]:
# Show the current working directory (shell escape).
!pwd

/data/home/yana.akulich


In [30]:
# Collect the predictions to the driver as a pandas DataFrame and write them
# to lab03.csv (relative to the working directory), without the index column.
final_df.toPandas().to_csv(
    'lab03.csv',
    sep=',',
    header=True,
    index=False,
)

In [31]:
# Stop the SparkContext to release cluster resources.
# NOTE(review): `sc` is not assigned in any visible cell — presumably it is
# defined by the shell.py script executed during bootstrap; confirm.
sc.stop()