In [1]:
! hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


### Spark context init

In [2]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "KuznetsovA VideoRecsys app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
spark

In [5]:
sc = spark.sparkContext

### Data loading

In [6]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, ArrayType, FloatType
import pyspark.sql.functions as f

### Items

In [7]:
items_schema = StructType(fields=[
    StructField("item_id", IntegerType()),
    StructField("channel_id", IntegerType()),
    StructField("datetime_availability_start", TimestampType()),
    StructField("datetime_availability_stop", TimestampType()),
    StructField("datetime_show_start", TimestampType()),
    StructField("datetime_show_stop", TimestampType()),
    StructField("content_type", StringType()),
    StructField("title", StringType()),
    StructField("year", IntegerType()),
    StructField("genres", StringType()),
    StructField("region_id", IntegerType()),
])

In [8]:
items = (spark
         .read
#          .schema(items_schema)
         .format('csv')
         .option("sep", "\t")
         .option("header", True)
         .option('inferSchema', True)
         .load('/labs/slaba03/laba03_items.csv'))

In [9]:
items.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- channel_id: double (nullable = true)
 |-- datetime_availability_start: timestamp (nullable = true)
 |-- datetime_availability_stop: timestamp (nullable = true)
 |-- datetime_show_start: timestamp (nullable = true)
 |-- datetime_show_stop: timestamp (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: double (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: double (nullable = true)



In [10]:
items.show(5, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------
 item_id                     | 65667                                                                                  
 channel_id                  | null                                                                                   
 datetime_availability_start | 1970-01-01 03:00:00                                                                    
 datetime_availability_stop  | 2018-01-01 03:00:00                                                                    
 datetime_show_start         | null                                                                                   
 datetime_show_stop          | null                                                                                   
 content_type                | 1                                                                                      
 title                       | на пробах только 

In [11]:
items.rdd.getNumPartitions()

3

In [12]:
items.groupby("year").count().show()

+------+------+
|  year| count|
+------+------+
|1988.0|    15|
|1976.0|    20|
|1951.0|     1|
|1940.0|     2|
|1979.0|    22|
|1953.0|     5|
|1987.0|     8|
|1959.0|     3|
|1978.0|    18|
|1968.0|    18|
|2010.0|   259|
|  null|631868|
|1967.0|    19|
|1964.0|    16|
|1916.0|     1|
|1993.0|     6|
|2001.0|    20|
|1965.0|    16|
|1954.0|     8|
|1984.0|    25|
+------+------+
only showing top 20 rows



In [13]:
items_genres = (items
 .filter(f.col("genres").isNotNull())
 .withColumn("genres_array", f.split(f.col("genres"), ","))
 .withColumn("n_genres", f.size(f.col("genres_array")))
 .drop("channel_id", "year", "title", "region_id", "genres",
       "datetime_show_start", "datetime_show_stop", 
       "datetime_availability_start", "datetime_availability_stop")
)
items_genres.show(10, vertical=True,truncate=False)

-RECORD 0--------------------------------------------------------
 item_id      | 65667                                            
 content_type | 1                                                
 genres_array | [Эротика]                                        
 n_genres     | 1                                                
-RECORD 1--------------------------------------------------------
 item_id      | 65669                                            
 content_type | 1                                                
 genres_array | [Эротика]                                        
 n_genres     | 1                                                
-RECORD 2--------------------------------------------------------
 item_id      | 65668                                            
 content_type | 1                                                
 genres_array | [Эротика]                                        
 n_genres     | 1                                                
-RECORD 3-

In [14]:
items_genres.groupby("n_genres").count().show()

+--------+------+
|n_genres| count|
+--------+------+
|       1|632108|
|       6|   106|
|       3|  1361|
|       5|   309|
|       4|   775|
|       8|    11|
|       7|    46|
|       2|   819|
+--------+------+



In [15]:
genres_df = items_genres.filter(f.col("n_genres") == 1).select("genres_array", ).distinct().orderBy("genres_array").toPandas()

In [16]:
genres_df

Unnamed: 0,genres_array
0,[General]
1,[Боевики]
2,[Военные]
3,[Детективы]
4,[Детские]
5,[Для детей]
6,[Для самых маленьких]
7,[Документальные]
8,[Документальный]
9,[Драма]


In [17]:
items_genres.groupby("content_type").count().show()

+------------+------+
|content_type| count|
+------------+------+
|           1|  3671|
|           0|631864|
+------------+------+



In [18]:
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT

In [19]:
def sparse_to_array(v):
    return Vectors.dense(v).tolist()

sparse_to_array_udf = f.udf(sparse_to_array, ArrayType(FloatType()))

In [20]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="genres_array", outputCol="genres_features")
model_cv_genres = cv.fit(items_genres)
items_cv_genres = model_cv_genres.transform(items_genres)
items_cv_dense_genres = (
    items_cv_genres
    .withColumn("genres_dense_features", sparse_to_array_udf(f.col("genres_features")))
    .drop("genres_array", "n_genres")
)
items_cv_dense_genres.show(10, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 item_id               | 65667                                                                                                                                                                                                                                                                                                                                                                                                                                
 content_type          | 1                                                                                

In [21]:
items_features = items_cv_dense_genres.select("item_id", f.col("genres_dense_features").alias("item_features"))
items_features.cache()

DataFrame[item_id: int, item_features: array<float>]

In [22]:
items_features.show()

+-------+--------------------+
|item_id|       item_features|
+-------+--------------------+
|  65667|[0.0, 0.0, 0.0, 0...|
|  65669|[0.0, 0.0, 0.0, 0...|
|  65668|[0.0, 0.0, 0.0, 0...|
|  65671|[0.0, 0.0, 0.0, 0...|
|  65670|[0.0, 0.0, 0.0, 0...|
|  65809|[0.0, 0.0, 0.0, 1...|
|  65810|[0.0, 0.0, 0.0, 1...|
|    326|[0.0, 1.0, 1.0, 0...|
|    336|[0.0, 1.0, 0.0, 1...|
|    357|[0.0, 0.0, 0.0, 1...|
|    396|[0.0, 1.0, 1.0, 0...|
|    400|[0.0, 1.0, 0.0, 0...|
|    423|[0.0, 1.0, 1.0, 0...|
|    430|[0.0, 1.0, 1.0, 0...|
|    449|[0.0, 1.0, 0.0, 0...|
|    453|[0.0, 1.0, 1.0, 1...|
|    478|[0.0, 0.0, 0.0, 0...|
|    495|[0.0, 1.0, 0.0, 0...|
|    505|[0.0, 1.0, 1.0, 0...|
|    540|[0.0, 1.0, 1.0, 1...|
+-------+--------------------+
only showing top 20 rows



In [24]:
items_features.count()

635535

### Test

In [68]:
test = (spark
        .read
#          .schema(test_schema)
        .format('csv')
        .option("sep", ",")
        .option("header", True)
        .option('inferSchema', True)
        .load('/labs/slaba03/laba03_test.csv')
       )

In [69]:
test.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: string (nullable = true)



In [70]:
test.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
|   1654|  95099|    null|
|   1654|  11265|    null|
+-------+-------+--------+
only showing top 5 rows



In [71]:
test.rdd.getNumPartitions()

5

In [72]:
test.select("user_id").distinct().count(), test.select("item_id").distinct().count()

(1941, 3704)

In [73]:
test.count()

2156840

### Train

In [74]:
train = (spark
         .read
#          .schema(train_schema)
         .format('csv')
         .option("sep", ",")
         .option("header", True)
         .option('inferSchema', True)
         .load('/labs/slaba03/laba03_train.csv')
        )

In [75]:
train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [76]:
train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [77]:
train.rdd.getNumPartitions()

5

In [78]:
train.groupBy('purchase').count().show()

+--------+-------+
|purchase|  count|
+--------+-------+
|       1|  10904|
|       0|5021720|
+--------+-------+



In [79]:
train.select("user_id").distinct().count(), train.select("item_id").distinct().count()

(1941, 3704)

In [80]:
train.count()

5032624

### Natalya's approach -- start

In [81]:
train_train = train.sampleBy("purchase", fractions={0.0: 0.8, 1.0: 0.8}, seed=5757)

In [82]:
train_valid = train.join(train_train, on=["user_id", "item_id"], how="leftanti")

In [83]:
train_purchases = (
    train
    .groupBy('user_id')
    .sum()
    .select(f.col("sum(purchase)").alias("user_purchases"), f.col("user_id"))
    .cache()
)

train_purchases.show(2)

+--------------+-------+
|user_purchases|user_id|
+--------------+-------+
|            72| 754230|
|             1| 761341|
+--------------+-------+
only showing top 2 rows



In [84]:
item_purchases = (
    train
    .groupBy('item_id')
    .sum()
    .select(f.col("sum(purchase)").alias("item_purchases"), f.col("item_id"))
    .cache()
)

item_purchases.show(2)

+--------------+-------+
|item_purchases|item_id|
+--------------+-------+
|             2|   8638|
|             1|  95940|
+--------------+-------+
only showing top 2 rows



In [85]:
train_train = train_train.join(train_purchases, on='user_id', how='left')
train_valid = train_valid.join(train_purchases, on='user_id', how='left')
test = test.join(train_purchases, on='user_id', how='left')

train_train = train_train.join(item_purchases, on='item_id', how='left')
train_valid = train_valid.join(item_purchases, on='item_id', how='left')
test = test.join(item_purchases, on='item_id', how='left')

In [86]:
train_user_attempts = (
    train
    .groupBy('user_id')
    .count()
    .select(f.col("count").alias("user_attempts"), f.col("user_id"))
    .cache()
)

train_user_attempts.show(2)

+-------------+-------+
|user_attempts|user_id|
+-------------+-------+
|         2611| 754230|
|         2580| 761341|
+-------------+-------+
only showing top 2 rows



In [87]:
train_item_attempts = (
    train
    .groupBy('item_id')
    .count()
    .select(f.col("count").alias("item_attempts"), f.col("item_id"))
    .cache()
)

train_item_attempts.show(2)

+-------------+-------+
|item_attempts|item_id|
+-------------+-------+
|         1379|   8638|
|         1409|  95940|
+-------------+-------+
only showing top 2 rows



In [88]:
train_train = train_train.join(train_user_attempts, on='user_id', how='left')
train_valid = train_valid.join(train_user_attempts, on='user_id', how='left')
test = test.join(train_user_attempts, on='user_id', how='left')

train_train = train_train.join(train_item_attempts, on='item_id', how='left')
train_valid = train_valid.join(train_item_attempts, on='item_id', how='left')
test = test.join(train_item_attempts, on='item_id', how='left')

In [89]:
train_train.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- user_purchases: long (nullable = true)
 |-- item_purchases: long (nullable = true)
 |-- user_attempts: long (nullable = true)
 |-- item_attempts: long (nullable = true)



In [90]:
train_valid.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- user_purchases: long (nullable = true)
 |-- item_purchases: long (nullable = true)
 |-- user_attempts: long (nullable = true)
 |-- item_attempts: long (nullable = true)



In [91]:
test.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- purchase: string (nullable = true)
 |-- user_purchases: long (nullable = true)
 |-- item_purchases: long (nullable = true)
 |-- user_attempts: long (nullable = true)
 |-- item_attempts: long (nullable = true)



In [92]:
train_train = train_train.withColumn('user_addict', f.col('user_purchases') / f.col('user_attempts'))
train_valid = train_valid.withColumn('user_addict', f.col('user_purchases') / f.col('user_attempts'))
test = test.withColumn('user_addict', f.col('user_purchases') / f.col('user_attempts'))

In [96]:
train_train = train_train.withColumn('item_addict', f.col('item_purchases') / f.col('item_attempts'))
train_valid = train_valid.withColumn('item_addict', f.col('item_purchases') / f.col('item_attempts'))
test = test.withColumn('item_addict', f.col('item_purchases') / f.col('item_attempts'))

In [97]:
test = test.na.fill(0)
train_train = train_train.na.fill(0)
train_valid = train_valid.na.fill(0)

In [98]:
train_purchases.unpersist()
item_purchases.unpersist()
train_user_attempts.unpersist()
train_item_attempts.unpersist()

DataFrame[item_attempts: bigint, item_id: int]

In [99]:
from pyspark.ml.feature import VectorAssembler
cols = ['item_purchases', 'user_purchases', 'user_addict', 'item_addict']
assembler = VectorAssembler(inputCols=cols, outputCol="features")

train_data = assembler.transform(train_train).cache()
valid_data = assembler.transform(train_valid)
test_data = assembler.transform(test)

In [100]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="purchase")

pipeline = Pipeline(stages=[
    lr
])

In [101]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')

In [102]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [104]:
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.0, 0.001, 0.01, 0.1]).build()

In [105]:
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=3, parallelism=3)

In [106]:
cv_model = crossval.fit(train_data)

In [107]:
cv_model.avgMetrics

[0.9112065837379674,
 0.9168167794576112,
 0.9181673912830515,
 0.9252267030406724]

In [109]:
import numpy as np

In [110]:
cv_model.getEstimatorParamMaps()[np.argmax(cv_model.avgMetrics)]

{Param(parent='LogisticRegression_553603cb3e9e', name='regParam', doc='regularization parameter (>= 0).'): 0.1}

In [111]:
predictions_valid = cv_model.transform(valid_data)

In [112]:
evaluator.evaluate(predictions_valid)

0.9192379017100841

In [113]:
train_data.unpersist()

DataFrame[item_id: int, user_id: int, purchase: int, user_purchases: bigint, item_purchases: bigint, user_attempts: bigint, item_attempts: bigint, user_addict: double, item_addict: double, features: vector]

In [114]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="purchase", regParam=0.1)

lr_model = lr.fit(train_data)
predictions_valid = lr_model.transform(valid_data)

In [115]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')
score = evaluator.evaluate(predictions_valid)
score

0.9192379017100839

In [116]:
train_w_features = train_data.union(valid_data)
train_w_features.count()

5032624

In [117]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="purchase", regParam=0.1)

lr_final_model = lr.fit(train_w_features)

In [131]:
lr_final_model.summary.labels

[0.0, 1.0]

In [126]:
test_predicted = lr_final_model.transform(test_data)

In [119]:
test_predicted.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- purchase: string (nullable = true)
 |-- user_purchases: long (nullable = true)
 |-- item_purchases: long (nullable = true)
 |-- user_attempts: long (nullable = true)
 |-- item_attempts: long (nullable = true)
 |-- user_addict: double (nullable = false)
 |-- item_addict: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [121]:
test_predicted.show(5, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------
 item_id        | 8389                                                 
 user_id        | 566758                                               
 purchase       | null                                                 
 user_purchases | 1                                                    
 item_purchases | 8                                                    
 user_attempts  | 2655                                                 
 item_attempts  | 1338                                                 
 user_addict    | 3.766478342749529E-4                                 
 item_addict    | 0.005979073243647235                                 
 features       | [8.0,1.0,3.766478342749529E-4,0.005979073243647235]  
 rawPrediction  | [6.107867514628841,-6.107867514628841]               
 probability    | [0.9977796497682532,0.002220350231746737]            
 prediction     | 0.0                                           

In [148]:
test_predicted.filter("user_id == 1654 and item_id == 336").show(vertical=True, truncate=False)

-RECORD 0----------------------------------------------------
 item_id        | 336                                        
 user_id        | 1654                                       
 purchase       | null                                       
 user_purchases | 5                                          
 item_purchases | 0                                          
 user_attempts  | 2568                                       
 item_attempts  | 1377                                       
 user_addict    | 0.0019470404984423676                      
 item_addict    | 0.0                                        
 features       | [0.0,5.0,0.0019470404984423676,0.0]        
 rawPrediction  | [6.193518934200147,-6.193518934200147]     
 probability    | [0.9979615375924372,0.0020384624075628103] 
 prediction     | 0.0                                        



In [154]:
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT

In [155]:
def sparse_to_array(v):
    return Vectors.dense(v).tolist()

sparse_to_array_udf = f.udf(sparse_to_array, ArrayType(FloatType()))

In [160]:
result = (
    test_predicted
    .select(
        f.col("user_id"), 
        f.col("item_id"), 
        sparse_to_array_udf(f.col("probability")).getItem(1).alias("purchase"))
    .orderBy("user_id", "item_id")
    .select(f.monotonically_increasing_id().alias(""), 
            f.col("user_id"), 
            f.col("item_id"), 
            f.col("purchase")
           )
    .cache()
)

In [161]:
result.show()

+---+-------+-------+------------+
|   |user_id|item_id|    purchase|
+---+-------+-------+------------+
|  0|   1654|    336|0.0020384623|
|  1|   1654|    678|0.0020384623|
|  2|   1654|    691|0.0020384623|
|  3|   1654|    696|0.0020964418|
|  4|   1654|    763| 0.002066894|
|  5|   1654|    795|0.0023081175|
|  6|   1654|    861| 0.002066852|
|  7|   1654|   1137|0.0021564376|
|  8|   1654|   1159|0.0020957638|
|  9|   1654|   1428| 0.002067064|
| 10|   1654|   1685|0.0020953426|
| 11|   1654|   1686| 0.002066915|
| 12|   1654|   1704| 0.002124982|
| 13|   1654|   2093|0.0020384623|
| 14|   1654|   2343| 0.002066894|
| 15|   1654|   2451|0.0020384623|
| 16|   1654|   2469|0.0024767197|
| 17|   1654|   2603| 0.002067118|
| 18|   1654|   2609|0.0020384623|
| 19|   1654|   2621| 0.002095384|
+---+-------+-------+------------+
only showing top 20 rows



In [170]:
!pwd

/data/home/alexander.kuznetsov


In [175]:
result.toPandas().to_csv("lab03.csv", header=True, sep=",", index=False)

In [176]:
!ls

custom_regex_transformer_skeleton.ipynb  lab01.json   lab02.json  lab03.ipynb
lab01.ipynb				 lab02.ipynb  lab03.csv


### Natalya's approach -- end

### Views

In [25]:
views_schema = StructType(fields=[
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("ts_start", TimestampType(), True),
    StructField("ts_stop", TimestampType(), True),
    StructField("item_type", StringType(), True),
])

In [26]:
views = (spark
         .read
#          .schema(views_schema)
         .format('csv')
         .option("sep", ",")
         .option("header", True)
         .option('inferSchema', True)
         .load('/labs/slaba03/laba03_views_programmes.csv')
        )

In [27]:
views.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- ts_start: integer (nullable = true)
 |-- ts_end: integer (nullable = true)
 |-- item_type: string (nullable = true)



In [28]:
views.show()

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
|   1654|7489015|1493434801|1493435401|     live|
|   1654|7489023|1493444101|1493445601|     live|
|   1654|6617053|1489186156|1489200834|     live|
|   1654|6438693|1487840070|1487840433|     live|
|   1654|6526859|1488705452|1488706154|     live|
|   1654|6526754|1488532396|1488532895|      pvr|
|   1654|6239098|1486732011|1486732410|     live|
|   1654|6438763|1488305761|1488307286|      pvr|
|   1654|7489013|1493433301|1493434201|     live|
|   1654|6317094|1486829784|1486830389|     live|
|   1654|6799393|1490172025|1490173391|      pvr|
|   1654|6616978|1488962050|1488962874|      pvr|


In [29]:
views.rdd.getNumPartitions()

7

In [40]:
views.count()

20845607

In [41]:
views.select("user_id").distinct().count(), views.select("item_id").distinct().count()

(79385, 633840)

In [42]:
(views
 .withColumn("ts_start_dt", f.col("ts_start").cast(TimestampType()))
 .withColumn("ts_end_dt", f.col("ts_end").cast(TimestampType()))
 .agg(f.min("ts_start_dt"), f.max("ts_start_dt")).show())

+-------------------+-------------------+
|   min(ts_start_dt)|   max(ts_start_dt)|
+-------------------+-------------------+
|2017-02-01 03:00:19|2017-05-04 03:54:32|
+-------------------+-------------------+



In [30]:
views_riched = (views
 .select("user_id", "item_id", "ts_start", "ts_end", "item_type")
 .withColumn("ts_start_dt", f.col("ts_start").cast(TimestampType()))
 .withColumn("ts_end_dt", f.col("ts_end").cast(TimestampType()))
 .withColumn("duration_min", (f.col("ts_end") - f.col("ts_start")) / 60)
 .withColumn("hour_start", f.hour(f.col("ts_start_dt")))
 .withColumn("dayofweek_start_dt", f.dayofweek(f.col("ts_start_dt")))
 .withColumn("is_weekend_start_dt", f.dayofweek(f.col("ts_start_dt")).isin([1, 7]).cast("int"))
 .drop("ts_start", "ts_end", "ts_start_dt", "ts_end_dt")
#  .join(items.select("item_id", "content_type"), on="item_id", how="left")
)

In [28]:
# views_riched.groupby("content_type").count().show()

In [41]:
views_genres = (
    views_riched
    .join(f.broadcast(items_features.select("item_id", "item_features")), on="item_id", how="left")
    .drop("item_id", "dayofweek_start_dt")
#     .drop("content_type")
    .withColumn("item_features_notna", 
                f.when(f.col("item_features").isNull(), f.array([f.lit(0.0)] * 84))
                .otherwise(f.col("item_features")))
    .drop("item_features")
    .withColumn("duration_adj_item_features", 
                f.array(*[f.col("duration_min") * f.col("item_features_notna")[i] for i in range(85)]))
)
views_genres.cache()

DataFrame[user_id: int, item_type: string, duration_min: double, hour_start: int, is_weekend_start_dt: int, item_features_notna: array<double>, duration_adj_item_features: array<double>]

In [42]:
views_genres.show()

Py4JJavaError: An error occurred while calling o2279.showString.
: org.apache.spark.SparkException: Could not execute broadcast in 300 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:150)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:387)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:144)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:117)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenOuter(BroadcastHashJoinExec.scala:259)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:102)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:37)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:67)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.FileSourceScanExec.consume(DataSourceScanExec.scala:159)
	at org.apache.spark.sql.execution.ColumnarBatchScan$class.produceRows(ColumnarBatchScan.scala:172)
	at org.apache.spark.sql.execution.ColumnarBatchScan$class.doProduce(ColumnarBatchScan.scala:85)
	at org.apache.spark.sql.execution.FileSourceScanExec.doProduce(DataSourceScanExec.scala:159)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.FileSourceScanExec.produce(DataSourceScanExec.scala:159)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:47)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:37)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:96)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:47)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:37)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:544)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:598)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.buildBuffers(InMemoryRelation.scala:83)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.cachedColumnBuffers(InMemoryRelation.scala:59)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.filteredCachedBatches(InMemoryTableScanExec.scala:276)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD$lzycompute(InMemoryTableScanExec.scala:105)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD(InMemoryTableScanExec.scala:104)
	at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.doExecute(InMemoryTableScanExec.scala:310)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.util.concurrent.TimeoutException: Futures timed out after [300 seconds]
	at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:223)
	at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:227)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:220)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:146)
	... 104 more


In [33]:
views_genres.filter(f.col("duration_adj_item_features").isNull()).show()

+-------+---------+------------+----------+-------------------+-------------------+--------------------------+
|user_id|item_type|duration_min|hour_start|is_weekend_start_dt|item_features_notna|duration_adj_item_features|
+-------+---------+------------+----------+-------------------+-------------------+--------------------------+
+-------+---------+------------+----------+-------------------+-------------------+--------------------------+



In [37]:
views_genres.count()

20845607

In [51]:
item_type = "pvr"
user_pvr = (
    views_genres
    .filter(f.col("item_type") == item_type)
    .select("user_id", "duration_min", "hour_start", "is_weekend_start_dt", 
            "genres_dense_features_notna", "duration_adj_genres_dense_features")
    .groupby("user_id")
    .agg(f.sum("duration_min").alias(f"sum_duration_{item_type}"),
         f.mean("duration_min").alias(f"mean_duration_{item_type}"),
         f.stddev("duration_min").alias(f"std_duration_{item_type}"),
         f.mean("is_weekend_start_dt").alias(f"mean_is_weekend_start_dt_{item_type}"),
         f.stddev("is_weekend_start_dt").alias(f"std_is_weekend_start_dt_{item_type}"),
         f.mean("hour_start").alias(f"mean_hour_start_{item_type}"),
         f.stddev("hour_start").alias(f"std_hour_start_{item_type}"),
         f.array(*[f.sum(f.col("duration_adj_genres_dense_features")[i]) for i in range(84)])
          .alias(f"sum_duration_adj_genres_dense_features_{item_type}")
        )
    .withColumn(f"weighted_genres_dense_features_{item_type}", 
                f.array(*[(f.col(f"sum_duration_adj_genres_dense_features_{item_type}")[i] 
                           / f.col(f"sum_duration_{item_type}")) for i in range(84)]))
    .drop(f"sum_duration_adj_genres_dense_features_{item_type}")
)
user_pvr.cache()

DataFrame[user_id: int, sum_duration_pvr: double, mean_duration_pvr: double, std_duration_pvr: double, mean_is_weekend_start_dt_pvr: double, std_is_weekend_start_dt_pvr: double, mean_hour_start_pvr: double, std_hour_start_pvr: double, weighted_genres_dense_features_pvr: array<double>]

In [52]:
item_type = "live"
user_live = (
    views_genres
    .filter(f.col("item_type") == item_type)
    .select("user_id", "duration_min", "hour_start", "is_weekend_start_dt", 
            "genres_dense_features_notna", "duration_adj_genres_dense_features")
    .groupby("user_id")
    .agg(f.sum("duration_min").alias(f"sum_duration_{item_type}"),
         f.mean("duration_min").alias(f"mean_duration_{item_type}"),
         f.stddev("duration_min").alias(f"std_duration_{item_type}"),
         f.mean("is_weekend_start_dt").alias(f"mean_is_weekend_start_dt_{item_type}"),
         f.stddev("is_weekend_start_dt").alias(f"std_is_weekend_start_dt_{item_type}"),
         f.mean("hour_start").alias(f"mean_hour_start_{item_type}"),
         f.stddev("hour_start").alias(f"std_hour_start_{item_type}"),
         f.array(*[f.sum(f.col("duration_adj_genres_dense_features")[i]) for i in range(84)])
          .alias(f"sum_duration_adj_genres_dense_features_{item_type}")
        )
    .withColumn(f"weighted_genres_dense_features_{item_type}", 
                f.array(*[(f.col(f"sum_duration_adj_genres_dense_features_{item_type}")[i] 
                           / f.col(f"sum_duration_{item_type}")) for i in range(84)]))
    .drop(f"sum_duration_adj_genres_dense_features_{item_type}")
)
user_live.cache()

DataFrame[user_id: int, sum_duration_live: double, mean_duration_live: double, std_duration_live: double, mean_is_weekend_start_dt_live: double, std_is_weekend_start_dt_live: double, mean_hour_start_live: double, std_hour_start_live: double, weighted_genres_dense_features_live: array<double>]

In [39]:
item_type = "all"
user_all = (
    views_genres
    .select("user_id", "duration_min", "hour_start", "is_weekend_start_dt", 
            "item_features_notna", "duration_adj_item_features")
    .groupby("user_id")
    .agg(f.sum("duration_min").alias(f"sum_duration_{item_type}"),
         f.mean("duration_min").alias(f"mean_duration_{item_type}"),
         f.stddev("duration_min").alias(f"std_duration_{item_type}"),
         f.mean("is_weekend_start_dt").alias(f"mean_is_weekend_start_dt_{item_type}"),
         f.stddev("is_weekend_start_dt").alias(f"std_is_weekend_start_dt_{item_type}"),
         f.mean("hour_start").alias(f"mean_hour_start_{item_type}"),
         f.stddev("hour_start").alias(f"std_hour_start_{item_type}"),
         f.array(*[f.sum(f.col("duration_adj_item_features")[i]) for i in range(84)])
          .alias(f"sum_duration_adj_item_features_{item_type}")
        )
    .withColumn(f"weighted_item_features_{item_type}", 
                f.array(*[(f.col(f"sum_duration_adj_item_features_{item_type}")[i] 
                           / f.col(f"sum_duration_{item_type}")) for i in range(84)]))
    .drop(f"sum_duration_adj_item_features_{item_type}")
)
# user_all.cache()

In [None]:
user_pvr.show(10, vertical=True)

In [None]:
user_live.show(10, vertical=True)

In [40]:
user_all.show(10, vertical=True)

KeyboardInterrupt: 

In [61]:
users_features = (
    user_all
    .join(user_pvr, on="user_id", how="left")
    .join(user_live, on="user_id", how="left")
    .fillna({"sum_duration_pvr": 0.0, 
             "mean_duration_pvr": 0.0, 
             "std_duration_pvr": 0.0, 
             "mean_is_weekend_start_dt_pvr": 0.0, 
             "std_is_weekend_start_dt_pvr": 0.0, 
             "mean_hour_start_pvr": 0.0, 
             "std_hour_start_pvr": 0.0, 
             "sum_duration_live": 0.0, 
             "mean_duration_live": 0.0, 
             "std_duration_live": 0.0, 
             "mean_is_weekend_start_dt_live": 0.0, 
             "std_is_weekend_start_dt_live": 0.0, 
             "mean_hour_start_live": 0.0, 
             "std_hour_start_live": 0.0, 
             "sum_duration_all": 0.0, 
             "mean_duration_all": 0.0, 
             "std_duration_all": 0.0, 
             "mean_is_weekend_start_dt_all": 0.0, 
             "std_is_weekend_start_dt_all": 0.0, 
             "mean_hour_start_all": 0.0, 
             "std_hour_start_all": 0.0, 
            })
    .withColumn("weighted_genres_dense_features_pvr_notna", 
                f.when(f.col("weighted_genres_dense_features_pvr").isNull(), f.array([f.lit(0.0)] * 85))
                .otherwise(f.col("weighted_genres_dense_features_pvr")))
    .drop("weighted_genres_dense_features_pvr")
    .withColumn("weighted_genres_dense_features_live_notna", 
                f.when(f.col("weighted_genres_dense_features_live").isNull(), f.array([f.lit(0.0)] * 85))
                .otherwise(f.col("weighted_genres_dense_features_live")))
    .drop("weighted_genres_dense_features_live")
    .withColumn("weighted_genres_dense_features_all_notna", 
                f.when(f.col("weighted_genres_dense_features_all").isNull(), f.array([f.lit(0.0)] * 85))
                .otherwise(f.col("weighted_genres_dense_features_all")))
    .drop("weighted_genres_dense_features_all")
)

In [34]:
users_features = (
    user_all
    .fillna({"sum_duration_all": 0.0, 
             "mean_duration_all": 0.0, 
             "std_duration_all": 0.0, 
             "mean_is_weekend_start_dt_all": 0.0, 
             "std_is_weekend_start_dt_all": 0.0, 
             "mean_hour_start_all": 0.0, 
             "std_hour_start_all": 0.0, 
            })
    .withColumn("weighted_item_features_all_notna", 
                f.when(f.col("weighted_item_features_all").isNull(), f.array([f.lit(0.0)] * 84))
                .otherwise(f.col("weighted_item_features_all")))
    .drop("weighted_item_features_all")
)

In [35]:
users_features.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- sum_duration_all: double (nullable = false)
 |-- mean_duration_all: double (nullable = false)
 |-- std_duration_all: double (nullable = false)
 |-- mean_is_weekend_start_dt_all: double (nullable = false)
 |-- std_is_weekend_start_dt_all: double (nullable = false)
 |-- mean_hour_start_all: double (nullable = false)
 |-- std_hour_start_all: double (nullable = false)
 |-- weighted_item_features_all_notna: array (nullable = false)
 |    |-- element: double (containsNull = true)



In [64]:
users_features_concat = (
    users_features.select("user_id",
    f.concat(
    f.array(f.col("sum_duration_all")), # 1
    f.array(f.col("mean_duration_all")), # 2
    f.array(f.col("std_duration_all")), # 3
    f.array(f.col("mean_is_weekend_start_dt_all")), # 4
    f.array(f.col("std_is_weekend_start_dt_all")), # 5
    f.array(f.col("mean_hour_start_all")), # 6
    f.array(f.col("std_hour_start_all")), # 7
    f.array(f.col("sum_duration_pvr")), # 8
    f.array(f.col("mean_duration_pvr")), # 9
    f.array(f.col("std_duration_pvr")), # 10
    f.array(f.col("mean_is_weekend_start_dt_pvr")), # 11
    f.array(f.col("std_is_weekend_start_dt_pvr")), # 12
    f.array(f.col("mean_hour_start_pvr")), # 13
    f.array(f.col("std_hour_start_pvr")), # 14
    f.array(f.col("sum_duration_live")), # 15
    f.array(f.col("mean_duration_live")), # 16
    f.array(f.col("std_duration_live")), # 17
    f.array(f.col("mean_is_weekend_start_dt_live")), # 18
    f.array(f.col("std_is_weekend_start_dt_live")), # 19
    f.array(f.col("mean_hour_start_live")), # 20
    f.array(f.col("std_hour_start_live")), # 21
    f.col("weighted_genres_dense_features_all_notna"), # 22 : 22 + 85
    f.col("weighted_genres_dense_features_pvr_notna"), # 108 : 108 + 85
    f.col("weighted_genres_dense_features_live_notna"), # 194 : 194 + 85
           ).alias("user_feature")) # (0 : 280)
)
users_features_concat.cache()

DataFrame[user_id: int, user_feature: array<double>]

In [36]:
users_features_concat = (
    users_features.select("user_id",
    f.concat(
    f.array(f.col("sum_duration_all")), # 1
    f.array(f.col("mean_duration_all")), # 2
    f.array(f.col("std_duration_all")), # 3
    f.array(f.col("mean_is_weekend_start_dt_all")), # 4
    f.array(f.col("std_is_weekend_start_dt_all")), # 5
    f.array(f.col("mean_hour_start_all")), # 6
    f.array(f.col("std_hour_start_all")), # 7
    f.col("weighted_item_features_all_notna"), # 
           ).alias("user_feature")) # 
)
users_features_concat.cache()

DataFrame[user_id: int, user_feature: array<double>]

In [38]:
users_features.explain()

== Physical Plan ==
HashAggregate(keys=[user_id#258], functions=[sum(duration_min#314), avg(duration_min#314), stddev_samp(duration_min#314), avg(cast(is_weekend_start_dt#344 as bigint)), stddev_samp(cast(is_weekend_start_dt#344 as double)), avg(cast(hour_start#323 as bigint)), stddev_samp(cast(hour_start#323 as double)), sum(duration_adj_item_features#393[0]), sum(duration_adj_item_features#393[1]), sum(duration_adj_item_features#393[2]), sum(duration_adj_item_features#393[3]), sum(duration_adj_item_features#393[4]), sum(duration_adj_item_features#393[5]), sum(duration_adj_item_features#393[6]), sum(duration_adj_item_features#393[7]), sum(duration_adj_item_features#393[8]), sum(duration_adj_item_features#393[9]), sum(duration_adj_item_features#393[10]), sum(duration_adj_item_features#393[11]), sum(duration_adj_item_features#393[12]), sum(duration_adj_item_features#393[13]), sum(duration_adj_item_features#393[14]), sum(duration_adj_item_features#393[15]), sum(duration_adj_item_features

In [37]:
users_features_concat.show(5)

KeyboardInterrupt: 

In [88]:
users_features_concat.filter(f.col("user_feature").isNull()).show(5)

+-------+------------+
|user_id|user_feature|
+-------+------------+
+-------+------------+



In [89]:
items_features.filter(f.col("item_feature").isNull()).show(5)

+-------+------------+
|item_id|item_feature|
+-------+------------+
+-------+------------+



### Model

In [122]:
train_features = (
    train.select("user_id", "item_id", f.col("purchase").cast(FloatType()).alias("purchase"))
    .join(items_features, on="item_id", how="left")
    .join(users_features_concat, on="user_id", how="left")
    .select("user_id", "item_id", "purchase", 
        f.concat(f.col("item_feature"), f.col("user_feature")).alias("item_user_feature")
           )
    .withColumn("item_user_feature_notna", 
                f.when(f.col("item_user_feature").isNull(), f.array([f.lit(0.0)] * 361))
                .otherwise(f.col("item_user_feature")))
    .drop("item_user_feature")
)
train_features.cache()

DataFrame[user_id: int, item_id: int, purchase: float, item_user_feature_notna: array<double>]

In [123]:
train_features.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: float (nullable = true)
 |-- item_user_feature_notna: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [124]:
train_features.show(10)

+-------+-------+--------+-----------------------+
|user_id|item_id|purchase|item_user_feature_notna|
+-------+-------+--------+-----------------------+
| 754230|    948|     0.0|   [0.0, 1.0, 1.0, 0...|
| 754230|   7038|     0.0|   [0.0, 0.0, 0.0, 0...|
| 754230|  66724|     0.0|   [0.0, 0.0, 0.0, 0...|
| 754230|  73019|     0.0|   [0.0, 0.0, 0.0, 0...|
| 754230|  73215|     0.0|   [0.0, 0.0, 0.0, 1...|
| 754230|  75384|     0.0|   [0.0, 1.0, 1.0, 0...|
| 754230|  87533|     0.0|   [0.0, 1.0, 0.0, 0...|
| 754230|  88770|     0.0|   [0.0, 0.0, 0.0, 1...|
| 754230|  90017|     0.0|   [0.0, 1.0, 1.0, 0...|
| 754230| 100029|     0.0|   [0.0, 0.0, 0.0, 0...|
+-------+-------+--------+-----------------------+
only showing top 10 rows



In [None]:
train_features.show(5, vertical=True, truncate=False)

In [97]:
train_features.filter(f.col("item_user_feature_notna").isNull()).show(10)

+-------+-------+--------+-----------------------+
|user_id|item_id|purchase|item_user_feature_notna|
+-------+-------+--------+-----------------------+
+-------+-------+--------+-----------------------+



In [138]:
train_features.groupby(f.size(f.col("item_user_feature_notna")).alias("feature_size")).count().show()

Py4JJavaError: An error occurred while calling o3675.showString.
: org.apache.spark.SparkException: Job 119 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:954)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:952)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:952)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2164)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2077)
	at org.apache.spark.SparkContext$$anonfun$stop$6.apply$mcV$sp(SparkContext.scala:1949)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1340)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1948)
	at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:121)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor75.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [98]:
test_features = (
    test
    .join(items_features, on="item_id", how="left")
    .join(users_features_concat, on="user_id", how="left")
    .select("user_id", "item_id", "purchase", 
        f.concat(f.col("item_feature"), f.col("user_feature")).alias("item_user_feature")
           )
    .withColumn("item_user_feature_notna", 
                f.when(f.col("item_user_feature").isNull(), f.array([f.lit(0.0)] * 361))
                .otherwise(f.col("item_user_feature")))
    .drop("item_user_feature")
)
test_features.cache()

DataFrame[user_id: int, item_id: int, purchase: string, item_user_feature_notna: array<double>]

In [99]:
test_features.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: string (nullable = true)
 |-- item_user_feature_notna: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [71]:
test_features.show(10)

+-------+-------+--------+--------------------+
|user_id|item_id|purchase|   item_user_feature|
+-------+-------+--------+--------------------+
| 754230|  73041|    null|[0.0, 1.0, 1.0, 0...|
| 754230|  74440|    null|[0.0, 1.0, 0.0, 0...|
| 754230|  74452|    null|[0.0, 1.0, 0.0, 0...|
| 754230|  93131|    null|[0.0, 0.0, 0.0, 0...|
| 754230|  93633|    null|[0.0, 0.0, 0.0, 1...|
| 754230|  95151|    null|[0.0, 0.0, 0.0, 0...|
| 754230|   9071|    null|[0.0, 0.0, 0.0, 1...|
| 754230|  72387|    null|[0.0, 0.0, 1.0, 0...|
| 754230|  78269|    null|[0.0, 1.0, 0.0, 1...|
| 754230|  93477|    null|[0.0, 0.0, 1.0, 0...|
+-------+-------+--------+--------------------+
only showing top 10 rows



In [100]:
test_features.filter(f.col("item_user_feature_notna").isNull()).show(10)

+-------+-------+--------+-----------------------+
|user_id|item_id|purchase|item_user_feature_notna|
+-------+-------+--------+-----------------------+
+-------+-------+--------+-----------------------+



In [125]:
Vectors.sparse(4, range(4), [1.0, 2.0, 0.0, 0.0])

SparseVector(4, {0: 1.0, 1: 2.0, 2: 0.0, 3: 0.0})

In [126]:
Vectors.dense([1.0, 2.0, 0.0, 0.0])

DenseVector([1.0, 2.0, 0.0, 0.0])

In [127]:
def array_to_vector(v):
#     return Vectors.sparse(len(v), range(len(v)), v)
    return Vectors.dense(v)

array_to_vector_udf = f.udf(array_to_vector, VectorUDT())

In [128]:
train_vector_features = (
    train_features
    .withColumn("item_user_feature_sparse", array_to_vector_udf(f.col("item_user_feature_notna")))
    .drop("item_user_feature_notna")
)
train_vector_features.show()

+-------+-------+--------+------------------------+
|user_id|item_id|purchase|item_user_feature_sparse|
+-------+-------+--------+------------------------+
| 754230|    948|     0.0|    [0.0,1.0,1.0,0.0,...|
| 754230|   7038|     0.0|    [0.0,0.0,0.0,0.0,...|
| 754230|  66724|     0.0|    [0.0,0.0,0.0,0.0,...|
| 754230|  73019|     0.0|    [0.0,0.0,0.0,0.0,...|
| 754230|  73215|     0.0|    [0.0,0.0,0.0,1.0,...|
| 754230|  75384|     0.0|    [0.0,1.0,1.0,0.0,...|
| 754230|  87533|     0.0|    [0.0,1.0,0.0,0.0,...|
| 754230|  88770|     0.0|    [0.0,0.0,0.0,1.0,...|
| 754230|  90017|     0.0|    [0.0,1.0,1.0,0.0,...|
| 754230| 100029|     0.0|    [0.0,0.0,0.0,0.0,...|
| 754230| 100218|     0.0|    [0.0,1.0,0.0,1.0,...|
| 754230| 100462|     0.0|    [0.0,1.0,0.0,0.0,...|
| 754230|   9294|     0.0|    [0.0,0.0,0.0,0.0,...|
| 754230|   9938|     0.0|    [0.0,1.0,0.0,0.0,...|
| 754230|   9967|     0.0|    [0.0,0.0,1.0,0.0,...|
| 754230|  10930|     0.0|    [0.0,1.0,0.0,0.0,...|
| 754230|  7

In [129]:
test_vector_features = (
    test_features
    .withColumn("item_user_feature_sparse", array_to_vector_udf(f.col("item_user_feature_notna")))
    .drop("item_user_feature_notna")
)
test_vector_features.show()

+-------+-------+--------+------------------------+
|user_id|item_id|purchase|item_user_feature_sparse|
+-------+-------+--------+------------------------+
| 754230|  11025|    null|    [0.0,1.0,0.0,1.0,...|
| 754230|  72912|    null|    [0.0,0.0,0.0,1.0,...|
| 754230|  86406|    null|    [0.0,0.0,0.0,0.0,...|
| 754230|  88999|    null|    [0.0,1.0,0.0,1.0,...|
| 754230|  93487|    null|    [0.0,0.0,1.0,0.0,...|
| 754230|   9071|    null|    [0.0,0.0,0.0,1.0,...|
| 754230|  72387|    null|    [0.0,0.0,1.0,0.0,...|
| 754230|  78269|    null|    [0.0,1.0,0.0,1.0,...|
| 754230|  93477|    null|    [0.0,0.0,1.0,0.0,...|
| 754230|  94726|    null|    [0.0,1.0,1.0,0.0,...|
| 754230|  10128|    null|    [0.0,1.0,0.0,0.0,...|
| 754230|  73421|    null|    [0.0,1.0,1.0,0.0,...|
| 754230|  74556|    null|    [0.0,1.0,0.0,0.0,...|
| 754230|  79417|    null|    [0.0,1.0,0.0,1.0,...|
| 754230|  92740|    null|    [0.0,0.0,0.0,0.0,...|
| 754230|  93020|    null|    [0.0,0.0,0.0,0.0,...|
| 754230|  9

In [130]:
train_train = train_vector_features.sampleBy("purchase", fractions={0.0: 0.8, 1.0: 0.8}, seed=5757)

In [131]:
train_train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: float (nullable = true)
 |-- item_user_feature_sparse: vector (nullable = true)



In [132]:
train_valid = train_vector_features.join(train_train, on=["user_id", "item_id"], how="leftanti")

In [133]:
train_valid.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: float (nullable = true)
 |-- item_user_feature_sparse: vector (nullable = true)



In [134]:
from pyspark.ml.classification import GBTClassifier, LogisticRegression

In [135]:
lr = LogisticRegression(featuresCol="item_user_feature_sparse", labelCol="purchase", maxIter=20)

In [136]:
lr_model = lr.fit(train_train)

Py4JJavaError: An error occurred while calling o3581.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 269 (treeAggregate at LogisticRegression.scala:520) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException: Error in reading FileSegmentManagedBuffer{file=/hadoop/yarn/local/usercache/alexander.kuznetsov/appcache/application_1615561586883_0711/blockmgr-0275250d-14e2-4b09-9f51-057e33e7f089/0a/shuffle_35_103_0.data, offset=16164, length=1947} 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:554) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:449) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:64) 	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435) 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) 	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31) 	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage6.sort_addToSorter_0$(Unknown Source) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage6.processNext(Unknown Source) 	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636) 	at org.apache.spark.sql.execution.RowIteratorFromScala.advanceNext(RowIterator.scala:83) 	at org.apache.spark.sql.execution.joins.SortMergeJoinScanner.advancedStreamed(SortMergeJoinExec.scala:811) 	at org.apache.spark.sql.execution.joins.SortMergeJoinScanner.findNextOuterJoinRows(SortMergeJoinExec.scala:770) 	at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceStream(SortMergeJoinExec.scala:934) 	at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceNext(SortMergeJoinExec.scala:970) 	at org.apache.spark.sql.execution.RowIteratorToScala.hasNext(RowIterator.scala:68) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.processNext(Unknown Source) 	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636) 	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anonfun$1$$anon$1.hasNext(InMemoryRelation.scala:125) 	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221) 	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091) 	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882) 	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.sql.execution.SQLExecutionRDD$$anonfun$compute$1.apply(SQLExecutionRDD.scala:52) 	at org.apache.spark.sql.execution.SQLExecutionRDD$$anonfun$compute$1.apply(SQLExecutionRDD.scala:52) 	at org.apache.spark.sql.internal.SQLConf$.withExistingConf(SQLConf.scala:92) 	at org.apache.spark.sql.execution.SQLExecutionRDD.compute(SQLExecutionRDD.scala:51) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:359) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:357) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091) 	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882) 	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) 	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55) 	at org.apache.spark.scheduler.Task.run(Task.scala:123) 	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 	at java.lang.Thread.run(Thread.java:745) Caused by: java.io.IOException: Error in reading FileSegmentManagedBuffer{file=/hadoop/yarn/local/usercache/alexander.kuznetsov/appcache/application_1615561586883_0711/blockmgr-0275250d-14e2-4b09-9f51-057e33e7f089/0a/shuffle_35_103_0.data, offset=16164, length=1947} 	at org.apache.spark.network.buffer.FileSegmentManagedBuffer.createInputStream(FileSegmentManagedBuffer.java:111) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:442) 	... 86 more Caused by: java.io.FileNotFoundException: /hadoop/yarn/local/usercache/alexander.kuznetsov/appcache/application_1615561586883_0711/blockmgr-0275250d-14e2-4b09-9f51-057e33e7f089/0a/shuffle_35_103_0.data (No such file or directory) 	at java.io.FileInputStream.open0(Native Method) 	at java.io.FileInputStream.open(FileInputStream.java:195) 	at java.io.FileInputStream.<init>(FileInputStream.java:138) 	at org.apache.spark.network.buffer.FileSegmentManagedBuffer.createInputStream(FileSegmentManagedBuffer.java:100) 	... 87 more 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2143)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1143)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1137)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1206)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1182)
	at org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:520)
	at org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:494)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:494)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:489)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:279)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
predictions_train = lr_model.transform(train_valid)

In [116]:
gbt = GBTClassifier(featuresCol="item_user_feature_sparse", labelCol="purchase", maxIter=20)

In [117]:
gbt_model = gbt.fit(train_train)

Py4JJavaError: An error occurred while calling o3233.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: ResultStage 241 (count at DecisionTreeMetadata.scala:118) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 25 	at org.apache.spark.MapOutputTracker$$anonfun$convertMapStatuses$2.apply(MapOutputTracker.scala:882) 	at org.apache.spark.MapOutputTracker$$anonfun$convertMapStatuses$2.apply(MapOutputTracker.scala:878) 	at scala.collection.Iterator$class.foreach(Iterator.scala:891) 	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334) 	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:878) 	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorId(MapOutputTracker.scala:691) 	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:49) 	at org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:165) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:359) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:357) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091) 	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882) 	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:359) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:357) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091) 	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882) 	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:359) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:357) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091) 	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882) 	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.sql.execution.SQLExecutionRDD$$anonfun$compute$1.apply(SQLExecutionRDD.scala:52) 	at org.apache.spark.sql.execution.SQLExecutionRDD$$anonfun$compute$1.apply(SQLExecutionRDD.scala:52) 	at org.apache.spark.sql.internal.SQLConf$.withExistingConf(SQLConf.scala:92) 	at org.apache.spark.sql.execution.SQLExecutionRDD.compute(SQLExecutionRDD.scala:51) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:359) 	at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:357) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165) 	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091) 	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156) 	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882) 	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310) 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 	at org.apache.spark.scheduler.Task.run(Task.scala:123) 	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 	at java.lang.Thread.run(Thread.java:745) 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2143)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1213)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:118)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:106)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:129)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:297)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:55)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:206)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:156)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:156)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:58)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
predictions_train = gbt_model.transform(train_valid)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="purchase", metricName='areaUnderROC')

In [None]:
evaluator.evaluate(predictions_train)

In [None]:
predictions_test = gbt_model.transform(test_vector_features)

In [55]:
spark.stop()