In [1]:
import glob
import os
import sys

# Bootstrap PySpark inside the notebook kernel: point it at the cluster's
# Python interpreter and Spark installation, then run Spark's own shell
# start-up script so that `spark` / `sc` become available in this namespace.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))

# Locate the bundled py4j archive by pattern instead of pinning one version,
# so the cell keeps working when the Spark client ships a different py4j.
py4j_pattern = os.path.join(spark_home, 'python/lib/py4j-*-src.zip')
py4j_archives = sorted(glob.glob(py4j_pattern))
if not py4j_archives:
    raise FileNotFoundError('No py4j source archive matches ' + py4j_pattern)
# If several archives are present, take the lexicographically last one.
sys.path.insert(0, py4j_archives[-1])

# Read the start-up script through a context manager so the file handle is
# closed, and compile it with its real path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Build (or fetch) the SparkSession used by the rest of the notebook.
# NOTE(review): the bootstrap cell already ran pyspark/shell.py, which reports
# "SparkSession available as 'spark'", so getOrCreate() presumably returns that
# existing session -- confirm whether the app name below actually takes effect.
conf = SparkConf()

session_builder = SparkSession.builder.config(conf=conf).appName("Tatiana Gavrikova")
spark = session_builder.getOrCreate()

In [3]:
# Display the session object as the cell output.
spark

In [4]:
# Keep a handle to the SparkContext behind the session and display it.
sc = spark.sparkContext
sc

In [None]:
# List the contents of the lab's HDFS directory.
!hdfs dfs -ls /labs/slaba03/

In [None]:
# Peek at the items file: print only lines 1-3 of the `hdfs dfs -head` output.
!hdfs dfs -head /labs/slaba03/laba03_items.csv | sed -n '1,3p'

In [None]:
# Peek at the train file: print only lines 1-3 of the `hdfs dfs -head` output.
!hdfs dfs -head /labs/slaba03/laba03_train.csv | sed -n '1,3p'

In [None]:
# Peek at the test file: print only lines 1-3 of the `hdfs dfs -head` output.
!hdfs dfs -head /labs/slaba03/laba03_test.csv | sed -n '1,3p'

In [None]:
# Peek at the programme-views file: print only lines 1-3 of the `hdfs dfs -head` output.
!hdfs dfs -head /labs/slaba03/laba03_views_programmes.csv | sed -n '1,3p'

# Данные train

In [5]:
# Explicit schema for the train CSV: three nullable integer columns.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", IntegerType(), True),
])

# Read the train set; the first line of the file is a header row.
df_user = spark.read.csv(
    "/labs/slaba03/laba03_train.csv",
    schema=schema,
    header=True,
)

df_user.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [19]:
# Derive the ALS target column: rating = 9 * purchase + 1.
rating_expr = F.col('purchase') * 9 + 1
df_user_rating = df_user.withColumn('rating', rating_expr)

# Данные test

In [7]:
# Explicit schema for the test CSV: the (user_id, item_id) pairs to score.
# Note that this rebinds the `schema` name used when loading the train set.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
])

# Read the test set; the first line of the file is a header row.
df_user_test = spark.read.csv(
    "/labs/slaba03/laba03_test.csv",
    schema=schema,
    header=True,
)

df_user_test.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)



# Данные Items

In [9]:
# Column name / type pairs of the items file, in file order.
# All fields are nullable (the StructField default).
item_columns = [
    ('item_id', IntegerType()),
    ('channel_id', IntegerType()),
    ('datetime_availability_start', StringType()),
    ('datetime_availability_stop', StringType()),
    ('datetime_show_start', StringType()),
    ('datetime_show_stop', StringType()),
    ('content_type', IntegerType()),
    ('title', StringType()),
    ('year', FloatType()),
    ('genres', StringType()),
    ('region_id', IntegerType()),
]
read_items_schema = StructType(
    [StructField(col_name, col_type) for col_name, col_type in item_columns]
)

# The items file is tab-separated and starts with a header row.
df_items = spark.read.csv(
    "/labs/slaba03/laba03_items.csv",
    schema=read_items_schema,
    sep="\t",
    header=True,
)

df_items.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- channel_id: integer (nullable = true)
 |-- datetime_availability_start: string (nullable = true)
 |-- datetime_availability_stop: string (nullable = true)
 |-- datetime_show_start: string (nullable = true)
 |-- datetime_show_stop: string (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: float (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: integer (nullable = true)



In [20]:
from pyspark.ml.recommendation import ALS

# Explicit-feedback ALS recommender over (user_id, item_id, rating) with
# non-negative factors.
als = ALS(
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    # NOTE(review): "drop" removes rows whose prediction is NaN (users/items
    # not seen during fit) from transform() output -- verify that the final
    # result is allowed to contain fewer rows than the test set.
    coldStartStrategy="drop",
    # Fix the seed so the factor initialisation, and therefore the scores,
    # are reproducible after a kernel restart.
    seed=42,
)

In [21]:
# Fit the ALS model on the train set with the derived `rating` column.
als_model = als.fit(df_user_rating)

In [22]:
# Score the (user_id, item_id) pairs of the test set; adds a `prediction` column.
# NOTE(review): the estimator was created with coldStartStrategy="drop", so this
# frame may have fewer rows than df_user_test -- verify.
predictions = als_model.transform(df_user_test)

In [23]:
# Check the columns produced by the model.
predictions.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- prediction: float (nullable = false)



In [24]:
# Print count, mean, stddev, min, quartiles and max for every column,
# mainly to see the range of the raw `prediction` values.
predictions.summary().show()

+-------+-----------------+-----------------+------------------+
|summary|          user_id|          item_id|        prediction|
+-------+-----------------+-----------------+------------------+
|  count|          2156840|          2156840|           2156840|
|   mean|869652.3733920922|66896.00283609354|0.9215853092817836|
| stddev|60706.51616333823| 35227.8313070464|0.0628215254980348|
|    min|             1654|              326|         0.8798825|
|    25%|           846231|            65668|         0.8930278|
|    50%|           885247|            79856|        0.90242106|
|    75%|           908588|            93606|         0.9270463|
|    max|           941450|           104165|         3.5359435|
+-------+-----------------+-----------------+------------------+



In [29]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler

In [31]:
# Wrap the scalar `prediction` column into a one-element vector column,
# which is the input form the scaler stage consumes.
assembler = VectorAssembler(
    inputCols=['prediction'],
    outputCol='prediction_arr',
)

In [32]:
# Min-max rescaling of the assembled prediction vector. No min/max is passed,
# so the scaler's default output range applies.
scaler = MinMaxScaler(
    inputCol='prediction_arr',
    outputCol='scaled_prediction',
)

In [33]:
from pyspark.ml import Pipeline

In [34]:
# Chain the two stages: assemble the vector first, then rescale it.
pipeline = Pipeline(stages=[assembler, scaler])

In [35]:
# Fit the scaling pipeline on the predictions and apply it to the same frame.
scaling_model = pipeline.fit(predictions)
preds = scaling_model.transform(predictions)

In [36]:
# Check the columns added by the scaling pipeline.
preds.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- prediction: float (nullable = false)
 |-- prediction_arr: vector (nullable = true)
 |-- scaled_prediction: vector (nullable = true)



In [40]:
# UDF that unpacks the single component of the scaled vector into a float.
first_component = F.udf(lambda vec: float(vec[0]), FloatType())

# Final frame: one scaled score per (user_id, item_id), sorted by both keys.
result = (
    preds
    .withColumn('purchase', first_component(F.col('scaled_prediction')))
    .select('user_id', 'item_id', 'purchase')
    .orderBy('user_id', 'item_id')
)

In [41]:
# Collect the whole result to the driver as a pandas DataFrame and write it
# to lab03.csv in the working directory.
# NOTE(review): to_csv() is called with its defaults, so the pandas index is
# written as an extra leading column -- confirm the consumer expects that.
result.toPandas().to_csv('lab03.csv')

In [42]:
# Shut the Spark session down once the result file has been written.
spark.stop()