### Евгений Шенк

## Лабораторная работа №3. Рекомендательная система видеоконтента с implicit feedback – Spark ML

In [1]:
import json
import os
import re
import sys
from collections import Counter
# Point PySpark at the cluster's Python interpreter and Spark installation and
# request two executors; this runs before the pyspark imports further down.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j packages shipped with this Spark install importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute Spark's interactive-shell bootstrap script in the notebook namespace.
# The script is read inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


### Spark Session

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Spark configuration object with no explicit overrides.
conf = SparkConf()

# Obtain the SparkSession handle used by the rest of the notebook.
# NOTE(review): the shell.py bootstrap executed earlier already reports
# "SparkSession available as 'spark'", so getOrCreate() presumably returns that
# existing session and the appName given here may not take effect — confirm.
spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("ESShenk_spark_session")
         .getOrCreate())

### Data

In [3]:
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [4]:
# Load the four lab datasets from HDFS. All files have a header row; the items
# file is tab-separated, the others use the default separator. No schema is
# passed or inferred here, so numeric columns are cast explicitly afterwards.
df_items = spark.read.options(header=True, sep="\t").csv("/labs/slaba03/laba03_items.csv")
df_test = spark.read.options(header=True).csv("/labs/slaba03/laba03_test.csv")
df_train = spark.read.options(header=True).csv("/labs/slaba03/laba03_train.csv")
df_views_programmes = spark.read.options(header=True).csv("/labs/slaba03/laba03_views_programmes.csv")

In [5]:
# Convert the test-set columns to integers.
df_test = (
    df_test
    .withColumn("user_id", F.col("user_id").cast("int"))
    .withColumn("item_id", F.col("item_id").cast("int"))
    .withColumn("purchase", F.col("purchase").cast("int"))
)

In [6]:
# Convert the training-set columns to integers.
df_train = (
    df_train
    .withColumn("user_id", F.col("user_id").cast("int"))
    .withColumn("item_id", F.col("item_id").cast("int"))
    .withColumn("purchase", F.col("purchase").cast("int"))
)

### Инфо по данным

In [7]:
df_items.show(n=1, truncate=True, vertical=True)  # channel_id, content_type, year, genres, region_id

-RECORD 0-------------------------------------------
 item_id                     | 65667                
 channel_id                  | null                 
 datetime_availability_start | 1970-01-01T00:00:00Z 
 datetime_availability_stop  | 2018-01-01T00:00:00Z 
 datetime_show_start         | null                 
 datetime_show_stop          | null                 
 content_type                | 1                    
 title                       | на пробах только ... 
 year                        | 2013.0               
 genres                      | Эротика              
 region_id                   | null                 
only showing top 1 row



In [8]:
df_items.printSchema()

root
 |-- item_id: string (nullable = true)
 |-- channel_id: string (nullable = true)
 |-- datetime_availability_start: string (nullable = true)
 |-- datetime_availability_stop: string (nullable = true)
 |-- datetime_show_start: string (nullable = true)
 |-- datetime_show_stop: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: string (nullable = true)



In [9]:
df_test.show(n=1, truncate=False, vertical=True)

-RECORD 0---------
 user_id  | 1654  
 item_id  | 94814 
 purchase | null  
only showing top 1 row



In [10]:
# df_test.count()  # 2156840

2156840

In [11]:
df_train.show(n=1, truncate=False, vertical=True)

-RECORD 0---------
 user_id  | 1654  
 item_id  | 74107 
 purchase | 0     
only showing top 1 row



In [12]:
# df_train.count()  # 5032624

5032624

In [13]:
df_views_programmes.show(n=1, truncate=False, vertical=True)  # item_type ts_end-ts_start

-RECORD 0---------------
 user_id   | 0          
 item_id   | 7101053    
 ts_start  | 1491409931 
 ts_end    | 1491411600 
 item_type | live       
only showing top 1 row



In [14]:
# Convert the ids and both timestamps of the viewing log to integers;
# item_type is left as a string.
df_views_programmes = (
    df_views_programmes
    .withColumn("user_id", F.col("user_id").cast("int"))
    .withColumn("item_id", F.col("item_id").cast("int"))
    .withColumn("ts_start", F.col("ts_start").cast("int"))
    .withColumn("ts_end", F.col("ts_end").cast("int"))
)

In [15]:
df_views_programmes.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- ts_start: integer (nullable = true)
 |-- ts_end: integer (nullable = true)
 |-- item_type: string (nullable = true)



In [16]:
def col_diff(col_1, col_2):
    """Return an integer Column holding ``col_2 - col_1``.

    Built from native Spark column arithmetic rather than a pandas UDF, so no
    rows are shipped to Python workers and null inputs simply yield null.

    Parameters
    ----------
    col_1, col_2 : str or Column
        Column names (as the notebook passes them) or Column objects.

    Returns
    -------
    Column
        ``col_2 - col_1`` cast to ``int``, matching the integer return type
        of the UDF this replaces.
    """
    # Strings are treated as column names, mirroring how a UDF call treats them.
    start = F.col(col_1) if isinstance(col_1, str) else col_1
    end = F.col(col_2) if isinstance(col_2, str) else col_2
    return (end - start).cast("int")

In [17]:
# Sanity check of col_diff on a single item: show one row with the
# ts_end - ts_start difference added as ts_diff.
(
    df_views_programmes
    .where(F.col("item_id") == 7101053)
    .withColumn("ts_diff", col_diff("ts_start", "ts_end"))
    .show(n=1, truncate=False, vertical=True)
)

-RECORD 0---------------
 user_id   | 0          
 item_id   | 7101053    
 ts_start  | 1491409931 
 ts_end    | 1491411600 
 item_type | live       
 ts_diff   | 1669       
only showing top 1 row



In [18]:
# df_views_programmes.count()  # 20845607

20845607

## Обработка Признаков

In [19]:
# Collect the raw comma-separated genre strings of content_type == 1 items on
# the driver. The filter runs before the projection so it does not rely on
# Spark re-adding the dropped content_type column behind the scenes.
genres_list = [
    row["genres"]
    for row in df_items.filter(F.col("content_type") == 1).select("genres").collect()
]

# Split every non-empty string into genre tokens. Each token is stripped so
# "a, b" and "a,b" produce the same genre names, and empty tokens are dropped.
genres_result = [
    token.strip()
    for raw in genres_list
    if raw
    for token in raw.split(',')
    if token.strip()
]

# Number of occurrences per genre. Counter is a dict subclass, so the
# genre_stats.items() lookup used afterwards keeps working.
genre_stats = Counter(genres_result)

In [21]:
# Keep only genres that occur more than 10 times; preview the first three.
genre_list = [genre for genre, occurrences in genre_stats.items() if occurrences > 10]
genre_list[:3]

['Эротика', 'Комедии', 'Мелодрамы']

In [22]:
# Restrict the catalogue to content_type == 1 items and keep only the columns
# used for feature building.
df_items_prep = (
    df_items
    .where(F.col("content_type") == 1)
    .select("item_id", "year", "genres")
)

In [23]:
@F.udf(IntegerType())
def fill_col(col, x):
    """Return 1 if genre ``x`` is one of the comma-separated genres in ``col``.

    The comparison is an exact match against the individual (stripped) tokens
    of ``col``. A plain ``x in col`` would be a substring test and would also
    flag genres whose name merely occurs inside a longer genre name.

    Parameters
    ----------
    col : str or None
        Comma-separated genre string of one item.
    x : str or None
        Genre name to look for.

    Returns
    -------
    int
        1 when the genre is present, 0 otherwise (including null inputs).
    """
    if col is None or x is None:
        return 0
    wanted = x.strip()
    return int(any(token.strip() == wanted for token in col.split(',')))

In [24]:
# Append one 0/1 indicator column per retained genre, named after the genre,
# in a single projection.
df_items_prep = df_items_prep.select(
    "*",
    *[fill_col(F.col("genres"), F.lit(genre)).alias(genre) for genre in genre_list]
)

In [25]:
# Finalise the item feature table: integer ids and years, the raw genres
# string removed, numeric nulls replaced by the -9999 placeholder, and all
# genre indicators serialised into one JSON string column (item_stats).
df_items_prep = (
    df_items_prep
    .withColumn("item_id", F.col("item_id").cast("int"))
    .withColumn("year", F.col("year").cast("int"))
    .drop("genres")
    .fillna(-9999)
    .withColumn("item_stats", F.to_json(F.struct(genre_list)))
)

In [26]:
# df_items_prep.show(n=1, truncate=100, vertical=True)

In [27]:
# Purchased (purchase == 1) training rows enriched with the item features.
df_train_joined = (
    df_train
    .where(F.col("purchase") == 1)
    .join(df_items_prep, on='item_id', how='left')
)

In [28]:
# df_train_joined.show(n=1, truncate=100, vertical=True)

In [29]:
grouping_column = 'user_id'
# Only the genre indicator columns feed user_stats, so only those are summed.
# Summing every column would also aggregate ids, year, purchase and the
# item_stats JSON string, all of which were discarded right afterwards.
cols = [F.sum(F.col(genre)).alias(genre) for genre in genre_list]

# Per user: number of purchased items in each genre, serialised as one JSON
# string column (user_stats).
df_train_prep_grouped = (
    df_train_joined
    .groupBy(grouping_column)
    .agg(*cols)
    .withColumn("user_stats", F.to_json(F.struct(genre_list)))
    .select("user_id", "user_stats")
)

In [30]:
# df_train_prep_grouped.show(n=1, truncate=100, vertical=True)

In [31]:
# Number of purchase == 1 rows per item.
item_count = (
    df_train
    .where(F.col("purchase") == 1)
    .groupBy("item_id")
    .agg(F.count("item_id").alias("item_prc"))
)

In [32]:
# Number of purchase == 1 rows per user.
user_count = (
    df_train
    .where(F.col("purchase") == 1)
    .groupBy("user_id")
    .agg(F.count("user_id").alias("user_prc"))
)

In [33]:
# Average release year of the items each user purchased.
# df_train_joined already holds exactly the purchase == 1 rows together with
# the item year, so it is aggregated directly. Joining the purchase rows back
# onto it by user_id would create n * n rows per user (n = purchases of that
# user) and still produce the same average.
# NOTE(review): items with an unknown year carry the -9999 placeholder set when
# df_items_prep is built, and that value is included in this average — consider
# excluding it.
user_year = (
    df_train_joined
    .groupBy("user_id")
    .agg(F.avg(F.col("year")).alias("avg_year"))
)

In [37]:
@F.udf(IntegerType())
def get_best_stat(user_col, item_col):
    """Return the user's highest genre count among the genres of the item.

    Parameters
    ----------
    user_col : str or None
        JSON object mapping genre name to the user's count for that genre.
    item_col : str or None
        JSON object mapping genre name to the item's indicator for that genre.

    Returns
    -------
    int
        The largest user count over the genres whose item indicator is
        positive; 0 when either input is missing/empty or nothing matches.
    """
    if not user_col or not item_col:
        return 0

    user_stats = json.loads(user_col)
    item_stats = json.loads(item_col)

    best = 0
    for genre, flag in item_stats.items():
        if flag is None or flag <= 0:
            continue
        # to_json leaves out fields whose value is null, so a genre may be
        # absent from the user's object; a missing or null count is taken as 0
        # instead of raising KeyError/TypeError inside the executor.
        count = user_stats.get(genre) or 0
        if count > best:
            best = count
    return best

In [38]:
@F.udf(FloatType())
def year_diff(user_col, item_col):
    """Return ``item_col - user_col`` as a float, or 0.0 if either is null.

    Parameters
    ----------
    user_col : float or None
        The user-side year value (the notebook passes avg_year).
    item_col : int or None
        The item-side year value (the notebook passes year).

    Returns
    -------
    float
        The difference; always a Python float so it matches the declared
        FloatType (a Python int returned from a FloatType UDF becomes null).
    """
    # Explicit None checks: a legitimate value of 0 must not count as missing.
    if user_col is None or item_col is None:
        return 0.0
    return float(item_col - user_col)

In [53]:
# Training feature table: attach the per-user and per-item purchase counts,
# the user's genre statistics, the item's year and genre indicators and the
# user's average year; derive stat_1 / stat_2 with the UDFs; replace the
# remaining numeric nulls with 0 and keep only ids, label and features.
df_train_prep = (
    df_train
    .join(user_count, on="user_id", how="left")
    .join(item_count, on="item_id", how="left")
    .join(df_train_prep_grouped, on="user_id", how="left")
    .join(df_items_prep.select("item_id", "year", "item_stats"), on="item_id", how="left")
    .join(user_year, on="user_id", how="left")
    .withColumn("stat_1", get_best_stat(F.col("user_stats"), F.col("item_stats")))
    .withColumn("stat_2", year_diff(F.col("avg_year"), F.col("year")))
    .fillna(0)
    .select(
        "user_id", "item_id", "purchase",
        "user_prc", "item_prc", "year", "avg_year", "stat_1", "stat_2",
    )
)

In [54]:
# Stratified downsampling by label with a fixed seed: keep about 2% of the
# purchase == 0 rows and all purchase == 1 rows, then cache the sample.
# NOTE(review): the cell overwrites df_train_prep with a sample of itself, so
# re-running only this cell samples the already-sampled frame again.
df_train_prep = df_train_prep.sampleBy("purchase", fractions={0: 0.02, 1: 1.0}, seed=27).cache()

In [55]:
df_train_prep.count()#.show(n=1, truncate=False, vertical=True)

110899

In [42]:
# Test feature table, built with the same joins and derived columns as the
# training table. The statistics joined in (user_count, item_count,
# df_train_prep_grouped, user_year) all come from the training data.
df_test_prep = (
    df_test
    .join(user_count, on="user_id", how="left")
    .join(item_count, on="item_id", how="left")
    .join(df_train_prep_grouped, on="user_id", how="left")
    .join(df_items_prep.select("item_id", "year", "item_stats"), on="item_id", how="left")
    .join(user_year, on="user_id", how="left")
    .withColumn("stat_1", get_best_stat(F.col("user_stats"), F.col("item_stats")))
    .withColumn("stat_2", year_diff(F.col("avg_year"), F.col("year")))
    .fillna(0)
    .select(
        "user_id", "item_id", "purchase",
        "user_prc", "item_prc", "year", "avg_year", "stat_1", "stat_2",
    )
)

In [43]:
df_test_prep = df_test_prep.cache()

### Model

In [56]:
# Every column except the ids and the label becomes a model feature.
ignore_cols = ["item_id", "user_id", "purchase"]
feature_cols = [name for name in df_train_prep.columns if name not in ignore_cols]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [57]:
# Assemble the feature vector, keep only label + features, spread the rows
# over 16 partitions and cache; count() triggers the computation and displays
# the number of rows.
train_data = (
    assembler.transform(df_train_prep)
    .select("purchase", "features")
    .repartition(16)
    .cache()
)
train_data.count()

110899

In [58]:
train_data.show(n=3, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------
 purchase | 0                                                          
 features | (6,[1,2],[4.0,2015.0])                                     
-RECORD 1--------------------------------------------------------------
 purchase | 0                                                          
 features | [11.0,2.0,2014.0,2001.090909090909,2.0,12.909090995788574] 
-RECORD 2--------------------------------------------------------------
 purchase | 0                                                          
 features | [2.0,2.0,2011.0,2014.0,2.0,-3.0]                           
only showing top 3 rows



In [61]:
# No hyper-parameters are added to the grid, so it contains a single (empty)
# parameter map and cross-validation only scores the settings given below.
paramGrid = ParamGridBuilder().build()

# Gradient-boosted trees on the `purchase` label with a fixed seed.
gbt = GBTClassifier(labelCol="purchase", maxIter=10, maxDepth=4, seed=27)

# ROC AUC computed from the `probability` column of the predictions.
evaluator = BinaryClassificationEvaluator(
    labelCol="purchase",
    rawPredictionCol="probability",
    metricName='areaUnderROC'
)

In [63]:
# 3-fold cross-validation of the GBT estimator over the parameter grid.
crossval = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=6
)

In [64]:
%%time
model = crossval.fit(train_data)

CPU times: user 110 ms, sys: 37.5 ms, total: 148 ms
Wall time: 42.2 s


In [65]:
# avgMetrics holds one cross-validated average of the evaluator metric per
# parameter map; the grid used here has a single entry, hence index 0.
auc_roc = model.avgMetrics[0]
print("AUC ROC = %g" % auc_roc)

AUC ROC = 0.953232


In [67]:
test_data = assembler.transform(df_test_prep)

In [68]:
test_data.show(n=3, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------
 user_id  | 754230                                                       
 item_id  | 94619                                                        
 purchase | 0                                                            
 user_prc | 72                                                           
 item_prc | 1                                                            
 year     | 2013                                                         
 avg_year | 2009.9166666666667                                           
 stat_1   | 56                                                           
 stat_2   | 3.0833333                                                    
 features | [72.0,1.0,2013.0,2009.9166666666667,56.0,3.0833332538604736] 
-RECORD 1----------------------------------------------------------------
 user_id  | 754230                                                       
 item_id  | 10210                     

In [74]:
preds = model.transform(test_data.select("user_id", "item_id", "features"))

In [75]:
preds.show(n=3, truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------------
 user_id       | 754230                                                       
 item_id       | 94619                                                        
 features      | [72.0,1.0,2013.0,2009.9166666666667,56.0,3.0833332538604736] 
 rawPrediction | [0.10027600187598734,-0.10027600187598734]                   
 probability   | [0.5499706236274292,0.4500293763725708]                      
 prediction    | 0.0                                                          
-RECORD 1---------------------------------------------------------------------
 user_id       | 754230                                                       
 item_id       | 10210                                                        
 features      | [72.0,16.0,2014.0,2009.9166666666667,0.0,4.083333492279053]  
 rawPrediction | [0.5693210822338086,-0.5693210822338086]                     
 probability   | [0.757430251546605,0.24256974845339

In [97]:
@F.udf(FloatType())
def get_prob(col_1):
    """Return the element at index 1 of the given vector as a Python float.

    The notebook applies it to the model's `probability` column
    (presumably index 1 is the probability of purchase == 1 — confirm).
    """
    second_component = col_1[1]
    return float(second_component)

In [108]:
# Final submission frame: ids plus the extracted probability under the name
# `purchase`, sorted ascending by user and then by item.
result = (
    preds
    .select("user_id", "item_id", get_prob(F.col("probability")).alias("purchase"))
    .orderBy("user_id", "item_id")
)

In [109]:
result.show(n=3, truncate=False, vertical=True)

-RECORD 0---------------
 user_id  | 1654        
 item_id  | 336         
 purchase | 0.07027356  
-RECORD 1---------------
 user_id  | 1654        
 item_id  | 678         
 purchase | 0.07027356  
-RECORD 2---------------
 user_id  | 1654        
 item_id  | 691         
 purchase | 0.067733474 
only showing top 3 rows



### Сохранить и выйти

In [110]:
# Bring the full result to the driver as a pandas DataFrame and write it as
# CSV (without the index column) one directory above the notebook.
result_pdf = result.toPandas()
result_pdf.to_csv("../lab03.csv", index=False)

In [111]:
spark.stop()