In [3]:
import os
import sys

In [4]:
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 4 --driver-memory 8g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [5]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
import json

In [6]:
from pyspark.ml.feature import Imputer, OneHotEncoderEstimator, VectorAssembler, StringIndexer
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.classification import LogisticRegression, GBTClassifier, DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, OneHotEncoderEstimator, VectorAssembler, StringIndexer
from pyspark.ml.linalg import VectorUDT

In [7]:
conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("KMP_lab03_2v")
         .getOrCreate())

In [8]:
spark

In [2]:
spark.stop()

NameError: name 'spark' is not defined

# 1. Читаем и обрабатываем данные.

In [8]:
! hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


## 1.1. Телепередачи.

**`laba03_items.csv`** — дополнительные данные по items. В данном файле много лишней или ненужной информации, так что задача её фильтрации и отбора ложится на вас. Поля в файле, на которых хотелось бы остановиться:

- `item_id` — primary key. Соответствует item_id в предыдущем файле.
- `content_type` — тип телепередачи (1 — платная, 0 — бесплатная). Вас интересуют платные передачи.
- `title` — название передачи, текстовое поле.
- `year` — год выпуска передачи, число.
- `genres` — поле с жанрами передачи, разделёнными через запятую.

In [9]:
items = spark.read.csv('/labs/slaba03/laba03_items.csv', sep="\t", header=True)\
             .filter(F.col("content_type") == F.lit(1))
items = items.withColumn("year", F.col("year").cast(T.IntegerType()))\
    .withColumn("datetime_availability_start", F.to_timestamp(F.col("datetime_availability_start"), 
                                                        "yyyy-MM-dd'T'HH:mm:ss'Z'"))\
    .withColumn("datetime_availability_stop", F.to_timestamp(F.col("datetime_availability_stop"), 
                                                        "yyyy-MM-dd'T'HH:mm:ss'Z'"))\
    .withColumn("datetime_show_start", F.to_timestamp(F.col("datetime_show_start"), 
                                                        "yyyy-MM-dd'T'HH:mm:ss'Z'"))\
    .withColumn("datetime_show_stop", F.to_timestamp(F.col("datetime_show_stop"), 
                                                        "yyyy-MM-dd'T'HH:mm:ss'Z'"))\
    .withColumn("title", F.lower(F.col("title")))\
    .withColumn("genres", F.lower(F.col("genres")))

In [10]:
items = items.filter(F.col("year").isNotNull())
items = items.filter(F.col("genres").isNotNull())

Разметим жанры.

In [11]:
items.select(F.col("item_id")).count(), items.select(F.col("item_id")).distinct().count()

(3668, 3668)

In [12]:
items.groupBy("genres").count().orderBy(F.col("count").desc()).show(10)

+--------------------+-----+
|              genres|count|
+--------------------+-----+
|ужасы,триллеры,за...|   79|
|мультфильмы,детск...|   72|
|  комедии,зарубежные|   66|
|  эротика,зарубежные|   58|
|        комедии,наши|   53|
|             эротика|   51|
|комедии,драмы,зар...|   50|
|    драмы,зарубежные|   48|
|триллеры,драмы,за...|   46|
|    ужасы,зарубежные|   45|
+--------------------+-----+
only showing top 10 rows



Т.к. жанр может встретиться в 1 фильме несколько раз, отметим ТОП 10 жанров. Остальные в Прочее.

In [11]:
from collections import Counter

In [12]:
genres_top = Counter(",".join([gen[0].replace(" ", "") for gen in items.select(F.col("genres")).collect()]).split(','))

In [13]:
sorted(genres_top.items(), key=lambda x: x[1], reverse=True)[:10]

[('зарубежные', 1739),
 ('драмы', 957),
 ('комедии', 857),
 ('триллеры', 655),
 ('русские', 582),
 ('боевики', 543),
 ('наши', 517),
 ('мелодрамы', 473),
 ('приключения', 437),
 ('длядетей', 427)]

In [14]:
items = items.withColumn("gen_top1", 
                         F.when(F.col("genres").like("%зарубежные%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top2", 
                         F.when(F.col("genres").like("%драмы%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top3", 
                         F.when(F.col("genres").like("%комедии%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top4", 
                         F.when(F.col("genres").like("%триллеры%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top5", 
                         F.when(F.col("genres").like("%русские%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top6", 
                         F.when(F.col("genres").like("%боевики%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top7", 
                         F.when(F.col("genres").like("%наши%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top8", 
                         F.when(F.col("genres").like("%мелодрамы%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top9", 
                         F.when(F.col("genres").like("%приключения%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_top10", 
                         F.when(F.col("genres").like("%для детей%"), F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("gen_others", 
                         F.when((F.col("gen_top1")==F.lit(1))|(F.col("gen_top2")==F.lit(1))
                                |(F.col("gen_top3")==F.lit(1))|(F.col("gen_top4")==F.lit(1))
                                |(F.col("gen_top5")==F.lit(1))|(F.col("gen_top6")==F.lit(1))
                                |(F.col("gen_top7")==F.lit(1))|(F.col("gen_top8")==F.lit(1))
                                |(F.col("gen_top9")==F.lit(1))|(F.col("gen_top10")==F.lit(1)), 
                                F.lit(0))\
                         .otherwise(F.lit(1)))

In [17]:
items.select(F.min(F.col("year")), F.max(F.col("year"))).show()

+---------+---------+
|min(year)|max(year)|
+---------+---------+
|     1916|     2017|
+---------+---------+



In [15]:
items = items.withColumn("old_years", 
                         F.when(F.col("year") <= F.lit(1950), 
                                F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("1951-1980", 
                         F.when(F.col("year").between(F.lit(1951), F.lit(1980)), 
                                F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("1981-2000", 
                         F.when(F.col("year").between(F.lit(1981), F.lit(2000)), 
                                F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("2001-2010", 
                         F.when(F.col("year").between(F.lit(2001), F.lit(2010)), 
                                F.lit(1))\
                         .otherwise(F.lit(0)))\
             .withColumn("new_years", 
                         F.when(F.col("year") >= F.lit(2011), 
                                F.lit(1))\
                         .otherwise(F.lit(0)))

In [19]:
items.take(1)

[Row(item_id='65667', channel_id=None, datetime_availability_start=datetime.datetime(1970, 1, 1, 0, 0), datetime_availability_stop=datetime.datetime(2018, 1, 1, 0, 0), datetime_show_start=None, datetime_show_stop=None, content_type='1', title='на пробах только девушки (all girl auditions)', year=2013, genres='эротика', region_id=None, gen_top1=0, gen_top2=0, gen_top3=0, gen_top4=0, gen_top5=0, gen_top6=0, gen_top7=0, gen_top8=0, gen_top9=0, gen_top10=0, gen_others=1, old_years=0, 1951-1980=0, 1981-2000=0, 2001-2010=0, new_years=1)]

In [20]:
item_id_p = [item[0] for item in items.select(F.col("item_id")).collect()]

### 1.2. Просмотры телепередач.

Дополнительный файл **`laba03_views_programmes.csv`** по просмотрам передач с полями:

- `ts_start` — время начала просмотра.
- `ts_end` — время окончания просмотра.
- `item_type` — тип просматриваемого контента:
    - `live` — просмотр "вживую", в момент показа контента в эфире.
    - `pvr` — просмотр в записи, после показа контента в эфире.

In [16]:
views_prog = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header=True)\
                    .withColumn("ts_start", F.col("ts_start").cast(T.LongType()))\
                    .withColumn("ts_end", F.col("ts_end").cast(T.LongType()))
views_prog.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- ts_start: long (nullable = true)
 |-- ts_end: long (nullable = true)
 |-- item_type: string (nullable = true)



In [17]:
views_prog = views_prog.withColumn("ts_diff", F.col("ts_end") - F.col("ts_start"))

In [18]:
user_live_avg = views_prog\
                        .filter(F.col("item_type") == F.lit("live"))\
                        .groupBy(F.col("user_id"))\
                        .agg(F.mean(F.col("ts_diff")).alias("user_avg_live"),
                             F.count(F.col("item_id")).alias("user_cnt_live"))
user_pvr_avg = views_prog\
                        .filter(F.col("item_type") == F.lit("pvr"))\
                        .groupBy(F.col("user_id"))\
                        .agg(F.mean(F.col("ts_diff")).alias("user_avg_pvr"),
                             F.count(F.col("item_id")).alias("user_cnt_pvr"))

item_live_avg = views_prog\
                        .filter(F.col("item_type") == F.lit("live"))\
                        .groupBy(F.col("item_id"))\
                        .agg(F.mean(F.col("ts_diff")).alias("item_avg_live"))
item_pvr_avg = views_prog\
                        .filter(F.col("item_type") == F.lit("pvr"))\
                        .groupBy(F.col("item_id"))\
                        .agg(F.mean(F.col("ts_diff")).alias("item_avg_pvr"))

### 1.3. Факты покупки.

В **`laba03_train.csv`** содержатся факты покупки (колонка `purchase`) пользователями (колонка `user_id`) телепередач (колонка `item_id`). Такой формат файла вам уже знаком.

In [19]:
train = spark.read.csv('/labs/slaba03/laba03_train.csv', header=True)\
             .withColumn("user_id", F.col("user_id").cast(T.IntegerType()))\
             .withColumn("item_id", F.col("item_id").cast(T.IntegerType()))\
             .withColumn("purchase", F.col("purchase").cast(T.IntegerType()))
train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [25]:
train.count()

5032624

In [26]:
train.select(F.countDistinct(F.col("user_id")).alias("cnt_user"), 
             F.countDistinct(F.col("item_id")).alias("cnt_item")).show()

+--------+--------+
|cnt_user|cnt_item|
+--------+--------+
|    1941|    3704|
+--------+--------+



In [27]:
train.groupBy(F.col("purchase")).agg(F.count(F.col("user_id")).alias("rows"), 
                                     F.countDistinct(F.col("user_id")).alias("cnt_d_user"), 
                                     F.countDistinct(F.col("item_id")).alias("cnt_d_item")).show()

+--------+-------+----------+----------+
|purchase|   rows|cnt_d_user|cnt_d_item|
+--------+-------+----------+----------+
|       1|  10904|      1675|      3089|
|       0|5021720|      1941|      3704|
+--------+-------+----------+----------+



In [28]:
10904/5021720*100

0.21713675792357995

В тренировочных данных 0,2% фактов покупок

In [20]:
train_user = train.groupBy(F.col("user_id")).agg(F.mean(F.col("purchase")).alias('user_purchase_avg'),
                                                 F.count(F.col("item_id")).alias("user_item_id"),
                                                 F.sum(F.col("purchase")).alias("user_purchase_sum"))
train_item = train.groupBy(F.col("item_id")).agg(F.mean(F.col("purchase")).alias('item_purchase_avg'),
                                                 F.count(F.col("user_id")).alias("item_user_id"),
                                                 F.sum(F.col("purchase")).alias("item_purchase_sum"))

In [21]:
def f_merge_features(data):
    data = data.alias("t")\
                    .join(train_user.alias("tu"), F.col("t.user_id") == F.col("tu.user_id") , "left")\
                    .join(train_item.alias("ti"), F.col("t.item_id") == F.col("ti.item_id") , "left")\
                    .join(user_live_avg.alias("ul"), F.col("t.user_id") == F.col("ul.user_id") , "left")\
                    .join(user_pvr_avg.alias("up"), F.col("t.user_id") == F.col("up.user_id") , "left")\
                    .join(item_live_avg.alias("il"), F.col("t.item_id") == F.col("il.item_id") , "left")\
                    .join(item_pvr_avg.alias("ip"), F.col("t.item_id") == F.col("ip.item_id") , "left")\
                    .join(items.alias("i"), F.col("t.item_id") == F.col("i.item_id"), "left")\
                    .select("t.user_id", 
                            "t.item_id", 
                            "t.purchase", 
                            "tu.user_purchase_avg",
                            "tu.user_item_id",
                            "tu.user_purchase_sum",
                            "ti.item_purchase_avg",
                            "ti.item_user_id",
                            "ti.item_purchase_sum",
                            "ul.user_avg_live",
                            "ul.user_cnt_live",
                            "up.user_avg_pvr",
                            "up.user_cnt_pvr",
                            "il.item_avg_live",
                            "ip.item_avg_pvr",
                            "i.gen_top1",
                            "i.gen_top2",
                            "i.gen_top3",
                            "i.gen_top4",
                            "i.gen_top5",
                            "i.gen_top6",
                            "i.gen_top7",
                            "i.gen_top8",
                            "i.gen_top9",
                            "i.gen_top10",
                            "i.gen_others",
                            "i.old_years",
                            "i.1951-1980",
                            "i.1981-2000",
                            "i.2001-2010",
                            "i.new_years"
                           )
    data = data.na.fill(0)
    return data

In [22]:
df_train = f_merge_features(train)

In [23]:
def f_assembler(data):
    list_col_features = list(set(data.columns) - set(['user_id', 'item_id', 'purchase']))
    assembler = VectorAssembler(
        inputCols = list_col_features,
        outputCol="features")
    output = assembler.transform(data)
    return output

In [24]:
df_train_features_ = f_assembler(df_train)

In [34]:
df_train_features_.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- user_purchase_avg: double (nullable = false)
 |-- user_item_id: long (nullable = true)
 |-- user_purchase_sum: long (nullable = true)
 |-- item_purchase_avg: double (nullable = false)
 |-- item_user_id: long (nullable = true)
 |-- item_purchase_sum: long (nullable = true)
 |-- user_avg_live: double (nullable = false)
 |-- user_cnt_live: long (nullable = true)
 |-- user_avg_pvr: double (nullable = false)
 |-- user_cnt_pvr: long (nullable = true)
 |-- item_avg_live: double (nullable = false)
 |-- item_avg_pvr: double (nullable = false)
 |-- gen_top1: integer (nullable = true)
 |-- gen_top2: integer (nullable = true)
 |-- gen_top3: integer (nullable = true)
 |-- gen_top4: integer (nullable = true)
 |-- gen_top5: integer (nullable = true)
 |-- gen_top6: integer (nullable = true)
 |-- gen_top7: integer (nullable = true)
 |-- gen_top8: integer (nullable = true)
 

In [25]:
df_train_features = df_train_features_.sampleBy("purchase", fractions={0: 0.009, 1: 1}, seed=4242).cache()

In [36]:
df_train_features.groupby('purchase').count().show()

+--------+-----+
|purchase|count|
+--------+-----+
|       1|10904|
|       0|44918|
+--------+-----+



In [37]:
df_train_features.take(1)

[Row(user_id=844427, item_id=8389, purchase=1, user_purchase_avg=0.0034775888717156105, user_item_id=2588, user_purchase_sum=9, item_purchase_avg=0.005979073243647235, item_user_id=1338, item_purchase_sum=8, user_avg_live=6323.3835616438355, user_cnt_live=73, user_avg_pvr=2585.2105263157896, user_cnt_pvr=76, item_avg_live=0.0, item_avg_pvr=0.0, gen_top1=0, gen_top2=0, gen_top3=0, gen_top4=0, gen_top5=0, gen_top6=0, gen_top7=1, gen_top8=0, gen_top9=0, gen_top10=0, gen_others=0, old_years=0, 1951-1980=0, 1981-2000=1, 2001-2010=0, new_years=0, features=SparseVector(28, {2: 6323.3836, 8: 0.0035, 9: 73.0, 12: 1.0, 13: 76.0, 14: 9.0, 16: 2588.0, 18: 8.0, 21: 2585.2105, 22: 0.006, 23: 1338.0, 26: 1.0}))]

### 1.4. Тестовая выборка.

**`laba03_test.csv`** — тестовый датасет без указанного целевого признака purchase, который вам и предстоит предсказать.

In [26]:
test = spark.read.csv('/labs/slaba03/laba03_test.csv', header=True)\
             .withColumn("user_id", F.col("user_id").cast(T.IntegerType()))\
             .withColumn("item_id", F.col("item_id").cast(T.IntegerType()))\
             .withColumn("purchase", F.col("purchase").cast(T.IntegerType()))
test.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [39]:
test.show(1)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
+-------+-------+--------+
only showing top 1 row



In [40]:
test.count()

2156840

In [41]:
test.select(F.countDistinct(F.col("user_id")).alias("cnt_user"), 
             F.countDistinct(F.col("item_id")).alias("cnt_item")).show()

+--------+--------+
|cnt_user|cnt_item|
+--------+--------+
|    1941|    3704|
+--------+--------+



In [27]:
df_test = f_merge_features(test)

In [28]:
df_test_features = f_assembler(df_test)

In [29]:
df_test_features.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- user_purchase_avg: double (nullable = false)
 |-- user_item_id: long (nullable = true)
 |-- user_purchase_sum: long (nullable = true)
 |-- item_purchase_avg: double (nullable = false)
 |-- item_user_id: long (nullable = true)
 |-- item_purchase_sum: long (nullable = true)
 |-- user_avg_live: double (nullable = false)
 |-- user_cnt_live: long (nullable = true)
 |-- user_avg_pvr: double (nullable = false)
 |-- user_cnt_pvr: long (nullable = true)
 |-- item_avg_live: double (nullable = false)
 |-- item_avg_pvr: double (nullable = false)
 |-- gen_top1: integer (nullable = true)
 |-- gen_top2: integer (nullable = true)
 |-- gen_top3: integer (nullable = true)
 |-- gen_top4: integer (nullable = true)
 |-- gen_top5: integer (nullable = true)
 |-- gen_top6: integer (nullable = true)
 |-- gen_top7: integer (nullable = true)
 |-- gen_top8: integer (nullable = true)
 

In [45]:
df_test_features.take(1)

[Row(user_id=761341, item_id=8389, purchase=0, user_purchase_avg=0.0003875968992248062, user_item_id=2580, user_purchase_sum=1, item_purchase_avg=0.005979073243647235, item_user_id=1338, item_purchase_sum=8, user_avg_live=1358.0625, user_cnt_live=16, user_avg_pvr=3240.4285714285716, user_cnt_pvr=28, item_avg_live=0.0, item_avg_pvr=0.0, gen_top1=0, gen_top2=0, gen_top3=0, gen_top4=0, gen_top5=0, gen_top6=0, gen_top7=1, gen_top8=0, gen_top9=0, gen_top10=0, gen_others=0, old_years=0, 1951-1980=0, 1981-2000=1, 2001-2010=0, new_years=0, features=SparseVector(28, {2: 1358.0625, 8: 0.0004, 9: 16.0, 12: 1.0, 13: 28.0, 14: 1.0, 16: 2580.0, 18: 8.0, 21: 3240.4286, 22: 0.006, 23: 1338.0, 26: 1.0}))]

## 2. Моделька.

In [46]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

model = RandomForestClassifier(
    featuresCol='features', 
    labelCol='purchase',
    numTrees=500,
    maxDepth=20,
    maxBins=40,
#     featureSubsetStrategy = 'all' 
)

# from pyspark.ml.regression import GBTRegressor
# from pyspark.ml.evaluation import BinaryClassificationEvaluator
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# model = GBTRegressor(maxIter=50,
#                         subsamplingRate=1.0,
#                         maxDepth=9,
#                         featuresCol="features",
#                         labelCol="purchase")

model.fit(df_train_features).save('RF_lab3_1.sav')

KeyboardInterrupt: 

In [None]:
!hdfs dfs -ls

In [None]:
model = RandomForestClassificationModel.load('RF_lab3_1.sav')

In [None]:
probability = model.transform(df_test_features)\
.select(
    F.col('user_id'),
    F.col('item_id'),
    F.col('rawPrediction'),
    F.col('prediction'),
    F.col('probability')
).cache()

## 3. Запись результата.

In [None]:
probability.take(1)

In [None]:
result = probability.withColumn("probability", F.col("probability").cast(T.StringType()))\
                    .select(F.col("user_id"),
                            F.col("item_id"),
                            F.col("probability"))
result = result.sort(F.col("user_id").asc(), F.col("item_id").asc())

In [None]:
result.printSchema()

In [None]:
df = result.toPandas()

In [None]:
df['user_id'] = df['user_id'].astype(int)
df['item_id'] = df['item_id'].astype(int)
df['purchase'] = [x[1:-1].split(',')[1] for x in df['probability']]
df['purchase'] = df['purchase'].astype("float64")

In [None]:
df.info()

In [None]:
df[['user_id','item_id','purchase']].to_csv('lab03.csv')

Потолок 0,785 и при повышении параметров обучение падает, либо слишком долго отрабатывает - такой вариант не подходит. 

## ALS

In [30]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [48]:
train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [49]:
test.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [32]:
als = ALS(maxIter=24, regParam=2.3, rank=6, coldStartStrategy="nan",
          userCol='user_id', itemCol='item_id', ratingCol='purchase',
          nonnegative=False, implicitPrefs=True, alpha=4.0, seed=89)
%time als_model = als.fit(train)

CPU times: user 12.5 ms, sys: 1.24 ms, total: 13.7 ms
Wall time: 40.4 s


In [39]:
pred_train = als_model.transform(train)
#%time pred_train.show(5)

In [40]:
pred_train = pred_train.withColumn("prediction", F.col("prediction").cast(T.DoubleType()))
pred_train = pred_train.coalesce(5).cache()
pred_train.rdd.getNumPartitions()

5

In [41]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="purchase", 
                                          metricName="areaUnderROC")

In [46]:
evaluator.evaluate(pred_train)

Py4JJavaError: An error occurred while calling o3894.evaluate.
: java.lang.IllegalStateException: SparkContext has been shutdown
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2053)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:309)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:171)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:151)
	at org.apache.spark.rdd.OrderedRDDFunctions$$anonfun$sortByKey$1.apply(OrderedRDDFunctions.scala:62)
	at org.apache.spark.rdd.OrderedRDDFunctions$$anonfun$sortByKey$1.apply(OrderedRDDFunctions.scala:61)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.OrderedRDDFunctions.sortByKey(OrderedRDDFunctions.scala:61)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.x$4$lzycompute(BinaryClassificationMetrics.scala:155)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.x$4(BinaryClassificationMetrics.scala:146)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.confusions$lzycompute(BinaryClassificationMetrics.scala:148)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.confusions(BinaryClassificationMetrics.scala:148)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.createCurve(BinaryClassificationMetrics.scala:226)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.roc(BinaryClassificationMetrics.scala:86)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.areaUnderROC(BinaryClassificationMetrics.scala:97)
	at org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate(BinaryClassificationEvaluator.scala:87)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
pred_test = als_model.transform(test)
#%time pred_test.show(5)

In [None]:
pred_test = pred_test.coalesce(5).cache()
pred_test = pred_test.withColumn("prediction", F.col("prediction").cast(T.DoubleType()))
pred_test.rdd.getNumPartitions()

In [None]:
result = pred_test.select(F.col("user_id"),
                             F.col("item_id"),
                             F.col("prediction").alias("purchase"))\
                     .sort(F.col("user_id").asc(),
                           F.col("item_id").asc())

In [None]:
result.show()

In [60]:
result.toPandas().to_csv("lab03.csv")

In [None]:
a=6