In [1]:
import os
import sys
import glob

# Python interpreter PySpark should use.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
# Spark installation whose bundled PySpark sources are put on sys.path below.
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
# Resources requested when the pyspark shell is launched.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 5 --executor-memory 4g --executor-cores 2 --driver-memory 4g pyspark-shell'

spark_home = os.environ["SPARK_HOME"]

# Locate the bundled py4j archive instead of hard-coding its version, so this
# cell keeps working after a Spark upgrade. If nothing matches (e.g. the path
# is not mounted), fall back to the previously hard-coded file name.
py4j_archives = sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')))
if py4j_archives:
    py4j_archive = py4j_archives[-1]
else:
    py4j_archive = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')

# Both entries go to the front of sys.path so they win over any other install.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, py4j_archive)

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the configuration in a single expression; only the app name is set.
conf = SparkConf().set("spark.app.name", "Laba_3")

# getOrCreate() returns an existing session if one is already running.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [3]:
# Display the session object as the cell result.
spark

In [4]:
from pyspark import keyword_only
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, ArrayType, StringType, IntegerType
from pyspark.sql.window import Window
from pyspark.sql.functions import udf, col, when, isnan, isnull, broadcast, lower, pandas_udf, row_number, explode, split
from pyspark.sql.functions import array, collect_set, collect_list, lit, asc, desc, sum, count, PandasUDFType
from pyspark.mllib.linalg import Vectors, SparseVector, DenseVector, VectorUDT
import json
import re

In [5]:
# List the lab's input files on HDFS.
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [6]:
# Print the beginning of the items file to inspect its header and delimiter.
!hdfs dfs -head /labs/slaba03/laba03_items.csv

item_id	channel_id	datetime_availability_start	datetime_availability_stop	datetime_show_start	datetime_show_stop	content_type	title	year	genres	region_id
65667		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	на пробах только девушки (all girl auditions)	2013.0	Эротика	
65669		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	скуби ду: эротическая пародия (scooby doo: a xxx parody)	2011.0	Эротика	
65668		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	горячие девочки для горячих девочек (hot babes 4 hot babes)	2011.0	Эротика	
65671		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	соблазнительницы женатых мужчин (top heavy homewreckers)	2011.0	Эротика	
65670		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	секретные секс-материалы ii: темная секс пародия (the sex files ii: a dark xxx parody)	2010.0	Эротика	
65809		1970-01-01T00:00:00Z	2099-12-31

In [7]:
# Explicit schema for the training interactions: three nullable integer columns.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", IntegerType(), True),
])

# Read the CSV (first line is a header), spread it over 16 partitions and
# mark it for caching.
train_reader = spark.read.format("csv").option("header", True).schema(schema)
df_user = (
    train_reader
    .load("/labs/slaba03/laba03_train.csv")
    .repartition(16)
    .cache()
)

In [8]:
# Preview the first 10 training rows.
df_user.show(10)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
| 613775|  74106|       0|
| 753356|   5359|       0|
| 782217|  94658|       0|
| 753619|  72394|       0|
| 793430|  11277|       0|
| 770209|   9840|       0|
| 769240|  79851|       0|
| 529632|  93621|       0|
| 789967|  10689|       0|
| 748500|  74517|       0|
+-------+-------+--------+
only showing top 10 rows



In [10]:
# Schema for the test pairs: same keys as the training file, no label column.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
])

# Read the CSV (first line is a header), spread it over 16 partitions and
# mark it for caching.
test_reader = spark.read.format("csv").option("header", True).schema(schema)
df_user_test = (
    test_reader
    .load("/labs/slaba03/laba03_test.csv")
    .repartition(16)
    .cache()
)

In [12]:
# Schema of the viewing log; both timestamps are read as plain integers.
read_users_schema = (
    StructType()
    .add('user_id', IntegerType())
    .add('item_id', IntegerType())
    .add('ts_start', IntegerType())
    .add('ts_end', IntegerType())
    .add('item_type', StringType())
)

views_raw = (
    spark.read.format("csv")
    .option("header", True)
    .schema(read_users_schema)
    .load("/labs/slaba03/laba03_views_programmes.csv")
)

# Per-record duration is ts_end - ts_start; it is then summed per
# (user_id, item_id, item_type). `sum` here is the Spark column function
# imported from pyspark.sql.functions, not the Python builtin.
df_views_programmes = (
    views_raw
    .withColumn('duration', col('ts_end') - col('ts_start'))
    .drop('ts_start', 'ts_end')
    .groupby('user_id', 'item_id', 'item_type')
    .agg(sum("duration").alias("duration"))
    .repartition(16)
    .cache()
)

df_views_programmes.limit(5).toPandas()

Unnamed: 0,user_id,item_id,item_type,duration
0,805112,6688671,live,13196
1,800010,6699896,live,958
2,857986,7379100,live,9402
3,785896,6316490,live,1838
4,888511,6379455,live,13073


In [13]:
# Schema of the tab-separated items catalogue.
read_items_schema = (
    StructType()
    .add('item_id', IntegerType())
    .add('channel_id', FloatType(), True)
    .add('datetime_availability_start', StringType(), True)
    .add('datetime_availability_stop', StringType(), True)
    .add('datetime_show_start', StringType(), True)
    .add('datetime_show_stop', StringType(), True)
    .add('content_type', IntegerType())
    .add('title', StringType(), True)
    .add('year', FloatType(), True)
    .add('genres', StringType(), True)
    .add('region_id', FloatType(), True)
)

items_raw = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", "\t")
    .schema(read_items_schema)
    .load("/labs/slaba03/laba03_items.csv")
)

# Hand-written `year` values for specific item ids; every other row keeps
# the value that was read from the file.
year_overrides = [(103377, 2008.0), (95141, 2014.0), (72544, 2009.0), (8544, 1994.0)]
first_fix_id, first_fix_year = year_overrides[0]
year_fixed = when(col("item_id") == first_fix_id, first_fix_year)
for fix_id, fix_year in year_overrides[1:]:
    year_fixed = year_fixed.when(col("item_id") == fix_id, fix_year)
year_fixed = year_fixed.otherwise(col("year"))

# Same idea for `genres`: one item gets an explicit genre string.
genres_fixed = (
    when(col("item_id") == 103377, 'Анимация,Короткометражные')
    .otherwise(col("genres"))
)

df_items = (
    items_raw
    .withColumn("year", year_fixed)
    .withColumn("genres", genres_fixed)
    .repartition(16)
    .cache()
)

# Sanity check: number of rows whose item_id is null.
print(df_items.filter(df_items.item_id.isNull()).count())

0


In [14]:
# Decade bucket: (year - 1910) / 10 + 1, cast to integer and then to string.
# The expression is built once and used for both new columns.
decade_label = (((col("year") - 1910.0) / 10) + 1).cast(IntegerType()).cast(StringType())

df_items = (
    df_items
    .withColumn("year_cat", array(decade_label))   # single-element array form
    .withColumn("year_cat_str", decade_label)      # plain string form
)

df_items.filter(df_items.year.isNotNull()).limit(2).toPandas()

Unnamed: 0,item_id,channel_id,datetime_availability_start,datetime_availability_stop,datetime_show_start,datetime_show_stop,content_type,title,year,genres,region_id,year_cat,year_cat_str
0,74429,,1970-01-01T00:00:00Z,2099-12-31T21:00:00Z,,,1,тарбозавр 3d,2011.0,"Полнометражные,Западные мультфильмы,Для детей,...",,[11],11
1,72413,,1970-01-01T00:00:00Z,2099-12-31T21:00:00Z,,,1,марин и его друзья. подводные истории,2015.0,"Западные мультфильмы,Сериалы,Для детей,Зарубежные",,[11],11


In [15]:
# Remove a vector column left over from a previous run, if any
# (presumably so the cell can be re-executed).
df_items = df_items.drop('year_cat_vector')

count_vectorizer_year = CountVectorizer(inputCol='year_cat', outputCol="year_cat_vector", binary=False)
count_vectorizer_year_model = count_vectorizer_year.fit(df_items)
normalizer_year = Normalizer(inputCol='year_cat_vector', outputCol="year_cat_norm")

# Vectorise the decade bucket, then normalise it into the per-item feature.
items_year_vectorized = count_vectorizer_year_model.transform(df_items)
df_items_year = (
    normalizer_year
    .transform(items_year_vectorized)
    .select('item_id', 'year_cat_norm')
)

# df_items itself ends the cell without the intermediate vector column.
df_items = items_year_vectorized.drop("year_cat_vector")

df_items_year.show(5)

+-------+--------------+
|item_id| year_cat_norm|
+-------+--------------+
|6310504|(12,[0],[1.0])|
|6235208|(12,[0],[1.0])|
|6200617|(12,[0],[1.0])|
|6234759|(12,[0],[1.0])|
|6326768|(12,[0],[1.0])|
+-------+--------------+
only showing top 5 rows



In [16]:
# Decade labels of the items each user purchased (rows with purchase == 1).
purchased_years = (
    df_user
    .filter(df_user.purchase == 1)
    .join(df_items, df_user.item_id == df_items.item_id, 'left')
    .select(df_user.user_id, df_items.year_cat_str.alias("year_cat_str"))
)

# Left-join from the full list of users so that users without any purchase
# still get a row.
df_user_uniq = df_user.select('user_id').distinct()
years_per_user = (
    df_user_uniq
    .join(purchased_years, df_user_uniq.user_id == purchased_years.user_id, 'left')
    .select(df_user_uniq.user_id, purchased_years.year_cat_str)
    .groupBy('user_id')
    .agg(collect_set('year_cat_str').alias('year_cat'))
)

# Reuse the vectorizer and normalizer fitted on the items so user and item
# vectors share one vocabulary; only user_id and year_cat_norm are kept.
df_user_year = (
    normalizer_year
    .transform(count_vectorizer_year_model.transform(years_per_user))
    .drop("year_cat", "year_cat_vector")
)

In [17]:
# Per-item rate: sum of `purchase` divided by the number of non-null
# `purchase` values. `sum` and `count` are the Spark column functions
# imported from pyspark.sql.functions.
# NOTE(review): the rate is computed from the same rows that later provide the
# training label — check whether this leaks the target.
item_rate = (sum("purchase") / count("purchase")).alias("item_purchase_rate")
df_user_item_stat = df_user.groupby("item_id").agg(item_rate)

df_user_item_stat.show(5)

+-------+--------------------+
|item_id|  item_purchase_rate|
+-------+--------------------+
|  94851|7.288629737609329E-4|
|  90019|0.002281368821292...|
|  78113|0.001468428781204...|
|  95080|                 0.0|
|   8638|0.001450326323422...|
+-------+--------------------+
only showing top 5 rows



In [18]:
# Per-user rate: sum of `purchase` divided by the number of non-null
# `purchase` values. `sum` and `count` are the Spark column functions
# imported from pyspark.sql.functions.
# NOTE(review): the rate is computed from the same rows that later provide the
# training label — check whether this leaks the target.
user_rate = (sum("purchase") / count("purchase")).alias("user_purchase_rate")
df_user_user_stat = df_user.groupby("user_id").agg(user_rate)

df_user_user_stat.show(5)

+-------+--------------------+
|user_id|  user_purchase_rate|
+-------+--------------------+
| 780033|7.757951900698216E-4|
| 761341|3.875968992248062E-4|
| 776188|0.001152516327314637|
| 754230|0.027575641516660282|
| 833685|0.007500986971969996|
+-------+--------------------+
only showing top 5 rows



In [19]:
# Ordered (substring, replacement) pairs used to canonicalise genre names.
#
# Order matters: the pairs are applied one after another to the whole genre
# string, so later pairs see the output of earlier ones (e.g. 'Детские' ->
# 'Детский' is what makes 'Детский песни' match afterwards).
# A tuple of pairs is used instead of a dict so that the order is explicit and
# a duplicated key cannot silently shadow an earlier entry.
_GENRE_REPLACEMENTS = (
    ('Арт-хаус', 'Артхаус'),
    ('Боевики', 'Боевик'),
    ('Военные', 'Военный'),
    ('Детские', 'Детский'),
    ('Для детей', 'Детский'),
    ('Для самых маленьких', 'Детский'),
    ('Для всей семьи', 'Семейные'),
    ('Для взрослых', 'Эротика'),
    ('Документальные', 'Документальный'),
    ('Драмы', 'Драма'),
    ('Западные мультфильмы', 'Зарубежные,Анимация'),
    ('Исторические', 'Исторический'),
    ('Короткометражки', 'Короткометражные'),
    ('Детский песни', 'Детский,Музыкальные'),
    ('Мультфильмы в 3D', 'Анимация'),
    ('Мультфильмы', 'Анимация'),
    ('Мультсериалы', 'Анимация,Сериалы'),
    ('Мюзиклы', 'Музыкальные'),
    ('Русские мультфильмы', 'Анимация,Русские'),
    ('Аниме', 'Анимация'),
    ('Спорт', 'Спортивные'),
    # 'Спорт' is a prefix of 'Спортивные', so the pair above also mangles an
    # already-canonical value; this pair repairs it.
    ('Спортивныеивные', 'Спортивные'),
    ('Наши', 'Русские'),
    ('Фильмы в 3D', 'Фильмы'),
    # The original table listed this key twice; the later value ('Комедии')
    # was the one that took effect, so it is the one kept here.
    ('Юмористические', 'Комедии'),
    ('Кулинария', 'Передачи'),
    ('Игры', 'Передачи'),
    ('О здоровье', 'Передачи'),
    ('Охота и рыбалка', 'Передачи'),
    ('Реалити-шоу', 'Передачи'),
    ('Видеоигры', 'Видеоигры,Передачи'),
    ('Фильмы-спектакли', 'Музыкальные,Фильмы'),
    ('Познавательные', 'Развивающие,Передачи'),
    ('Хочу всё знать', 'Развивающие,Передачи'),
    ('Фантастические', 'Фантастика'),
    ('Фэнтези', 'Фантастика'),
    ('Союзмультфильм', 'Союзмультфильм,Анимация'),
    ('Развлекательные', 'Комедии'),
    ('Комедия', 'Комедии'),
    ('Вестерн', 'Фильмы,Зарубежные,Боевик'),
    ('Советское кино', 'Советские,Фильмы'),
    ('Прочие', 'General'),
    ('Мультфильм', 'Анимация'),
    ('Музыкальный', 'Музыкальные'),
    ('Семейный', 'Семейные'),
    ('Приключение', 'Приключения'),
    ('Научная фантастика', 'Фантастика'),
    ('сказка', 'Сказки'),
    ('Триллер', 'Триллеры'),
    # 'Триллер' is a prefix of 'Триллеры', so the pair above turns an
    # already-plural value into 'Триллерыы'; this pair repairs it.
    ('Триллерыы', 'Триллеры'),
)


def replace_genres(s):
    """Normalise a comma-separated genre string into a list of genre names.

    Args:
        s: Raw genre string (comma-separated) or None.

    Returns:
        A non-empty list of genre names. ``['General']`` is returned when
        ``s`` is None or contains no non-blank genre.

    Each pair from ``_GENRE_REPLACEMENTS`` is applied, in order, as a plain
    substring replacement on the whole string. The result is then split on
    commas; surrounding whitespace is stripped and blank entries are dropped
    so that no empty-string genre is produced.
    """
    if s is None:
        return ['General']

    text = str(s)
    for old, new in _GENRE_REPLACEMENTS:
        text = text.replace(old, new)

    genres = [part.strip() for part in text.split(',')]
    genres = [genre for genre in genres if genre]
    return genres or ['General']

# Spark UDF wrapper around replace_genres; declared to return an array of strings.
replace_genres_udf = udf(f=replace_genres, returnType=ArrayType(StringType()))

In [20]:
# Estimator and transformer for the genre feature: token counts, then normalisation.
count_vectorizer = CountVectorizer(inputCol='genres_arr', outputCol="genres_vector", binary=False)
normalizer = Normalizer(inputCol='genres_vector', outputCol="genres_norm")

# Normalised genre list per item; the vocabulary is fitted on the items.
items_genres_arr = df_items.withColumn("genres_arr", replace_genres_udf("genres"))
count_vectorizer_model = count_vectorizer.fit(items_genres_arr)

df_items_genres = (
    normalizer
    .transform(count_vectorizer_model.transform(items_genres_arr))
    .select('item_id', 'genres_norm')
)

df_items_genres.show(5)

+-------+--------------+
|item_id|   genres_norm|
+-------+--------------+
|6310504|(41,[0],[1.0])|
|6235208|(41,[0],[1.0])|
|6200617|(41,[0],[1.0])|
|6234759|(41,[0],[1.0])|
|6326768|(41,[0],[1.0])|
+-------+--------------+
only showing top 5 rows



In [21]:
# Build two genre-count vectors per user with the vocabulary fitted on the
# items: one over every training interaction and one over purchases only.
#
# Columns are referenced by name (or via col()) on the frame they belong to.
# The previous version used `df_user.purchase` on frames derived from df_user
# and joined frames of the same lineage through `a.user_id == b.user_id`,
# which only works as long as Spark can disambiguate the shared attribute.

# One row per training interaction with the item's normalised genre list.
user_item_genres = (
    df_user
    .join(df_items, on='item_id', how='inner')
    .select(
        'user_id',
        replace_genres_udf('genres').alias('genres_arr'),
        'purchase',
    )
)

# One row per (interaction, genre).
df_user_genres = user_item_genres.select(
    'user_id',
    explode('genres_arr').alias('genres'),
    'purchase',
)

# Genre counts over all interactions of the user.
df_user_genres_all = count_vectorizer_model.transform(
    df_user_genres
    .groupBy('user_id')
    .agg(collect_list('genres').alias('genres_arr'))
)

# Genre counts over purchased items only. Left-joining from the full user
# list keeps users without purchases: collect_list skips the null genre, so
# they get an empty list and therefore an all-zero vector.
df_user_uniq = df_user.select('user_id').distinct()
purchased_genres = (
    df_user_genres
    .filter(col('purchase') == 1)
    .select('user_id', 'genres')
)
df_user_genres_purchase = count_vectorizer_model.transform(
    df_user_uniq
    .join(purchased_genres, on='user_id', how='left')
    .groupBy('user_id')
    .agg(collect_list('genres').alias('genres_arr'))
)

# Rename the two vectors before joining so the result has unambiguous columns:
# user_id, genres_vector_all, genres_vector_purchase.
df_user_genres = (
    df_user_genres_all
    .select('user_id', col('genres_vector').alias('genres_vector_all'))
    .join(
        df_user_genres_purchase
        .select('user_id', col('genres_vector').alias('genres_vector_purchase')),
        on='user_id',
        how='inner',
    )
)

df_user_genres.show(5)

+-------+--------------------+----------------------+
|user_id|   genres_vector_all|genres_vector_purchase|
+-------+--------------------+----------------------+
| 754230|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|
| 761341|(41,[0,1,2,3,4,5,...|        (41,[0],[1.0])|
| 776188|(41,[0,1,2,3,4,5,...|  (41,[1,3,4,6,8,9,...|
| 780033|(41,[0,1,2,3,4,5,...|  (41,[1,4,6,8,15],...|
| 798454|(41,[0,1,2,3,4,5,...|  (41,[6,11,12],[1....|
+-------+--------------------+----------------------+
only showing top 5 rows



In [22]:
# Sanity check: show rows where the purchase-genre vector is null, if any.
df_user_genres.filter(df_user_genres.genres_vector_purchase.isNull()).show(5)

+-------+-----------------+----------------------+
|user_id|genres_vector_all|genres_vector_purchase|
+-------+-----------------+----------------------+
+-------+-----------------+----------------------+



In [23]:
# Training base table: the interaction keys plus the label, with `purchase` exposed as `target`.
df_train = df_user.select("user_id", "item_id", col("purchase").alias('target'))

In [24]:
# Step 1: attach the user's two genre profiles.
train_with_user_genres = df_train.join(
    df_user_genres, df_train.user_id == df_user_genres.user_id, 'inner')

# Step 2: attach the item's normalised genre vector.
train_with_genres = train_with_user_genres.join(
    df_items_genres, df_train.item_id == df_items_genres.item_id, 'inner')

# Keys, label and the three genre features. The column objects are taken from
# the pre-join frames, exactly as they were used in the join conditions.
genre_stage_columns = [
    df_train.user_id,
    df_train.item_id,
    df_train.target,
    df_user_genres.genres_vector_all,
    df_user_genres.genres_vector_purchase,
    df_items_genres.genres_norm,
]

df_train = train_with_genres.select(*genre_stage_columns).coalesce(10).cache()

df_train.show(2)

+-------+-------+------+--------------------+----------------------+--------------------+
|user_id|item_id|target|   genres_vector_all|genres_vector_purchase|         genres_norm|
+-------+-------+------+--------------------+----------------------+--------------------+
| 754230|  78087|     0|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[2,4,24,25,26...|
| 754230|  77839|     0|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[3,16,17,18],...|
+-------+-------+------+--------------------+----------------------+--------------------+
only showing top 2 rows



In [25]:
# Step 1: attach the per-user purchase rate.
train_with_user_rate = df_train.join(
    df_user_user_stat, df_user_user_stat.user_id == df_train.user_id, 'inner')

# Step 2: attach the per-item purchase rate.
train_with_rates = train_with_user_rate.join(
    df_user_item_stat, df_user_item_stat.item_id == df_train.item_id, 'inner')

# Everything carried over from the previous stage plus the two rates.
rate_stage_columns = [
    df_train.user_id,
    df_train.item_id,
    df_train.target,
    df_train.genres_vector_all,
    df_train.genres_vector_purchase,
    df_train.genres_norm,
    df_user_user_stat.user_purchase_rate,
    df_user_item_stat.item_purchase_rate,
]

df_train = train_with_rates.select(*rate_stage_columns).coalesce(10).cache()

# Spot check of one specific (user, item) pair, then a general preview.
spot_check = (df_train.user_id == 816426) & (df_train.item_id == 91200)
df_train.filter(spot_check).show()
df_train.show(2)

+-------+-------+------+--------------------+----------------------+--------------------+------------------+--------------------+
|user_id|item_id|target|   genres_vector_all|genres_vector_purchase|         genres_norm|user_purchase_rate|  item_purchase_rate|
+-------+-------+------+--------------------+----------------------+--------------------+------------------+--------------------+
| 816426|  91200|     0|(41,[0,1,2,3,4,5,...|            (41,[],[])|(41,[1,3,4,9],[0....|               0.0|7.358351729212656E-4|
+-------+-------+------+--------------------+----------------------+--------------------+------------------+--------------------+

+-------+-------+------+--------------------+----------------------+--------------------+--------------------+--------------------+
|user_id|item_id|target|   genres_vector_all|genres_vector_purchase|         genres_norm|  user_purchase_rate|  item_purchase_rate|
+-------+-------+------+--------------------+----------------------+-----------------

In [26]:
# Step 1: attach the user's normalised decade profile.
train_with_user_year = df_train.join(
    df_user_year, df_user_year.user_id == df_train.user_id, 'inner')

# Step 2: attach the item's normalised decade vector.
train_with_years = train_with_user_year.join(
    df_items_year, df_items_year.item_id == df_train.item_id, 'inner')

# Both sides call their vector `year_cat_norm`, so each one is aliased.
# NOTE(review): user_year_norm / item_year_norm are not among the
# VectorAssembler inputs used later in this notebook — confirm whether these
# joins are still needed.
year_stage_columns = [
    df_train.user_id,
    df_train.item_id,
    df_train.target,
    df_train.genres_vector_all,
    df_train.genres_vector_purchase,
    df_train.genres_norm,
    df_train.user_purchase_rate,
    df_train.item_purchase_rate,
    df_user_year.year_cat_norm.alias('user_year_norm'),
    df_items_year.year_cat_norm.alias('item_year_norm'),
]

df_train = train_with_years.select(*year_stage_columns).coalesce(10).cache()

df_train.show(2)

+-------+-------+------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|user_id|item_id|target|   genres_vector_all|genres_vector_purchase|         genres_norm|  user_purchase_rate|  item_purchase_rate|      user_year_norm|item_year_norm|
+-------+-------+------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------+
| 754230|   8389|     0|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[2,5,7,23],[0...|0.027575641516660282|0.005979073243647235|(12,[1,2,4,7],[0....|(12,[3],[1.0])|
| 754230|   8638|     1|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[1,4,6,12,20]...|0.027575641516660282|0.001450326323422...|(12,[1,2,4,7],[0....|(12,[1],[1.0])|
+-------+-------+------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------

In [27]:
# Columns concatenated, in this order, into the single `features` vector.
train_feature_columns = [
    "genres_vector_all",
    "genres_vector_purchase",
    "genres_norm",
    "user_purchase_rate",
    "item_purchase_rate",
]
assembler = VectorAssembler(inputCols=train_feature_columns, outputCol="features")

# Only the assembled vector and the label are kept for fitting.
df_train = assembler.transform(df_train).select("features", "target")
df_train.show(5)

+--------------------+------+
|            features|target|
+--------------------+------+
|(125,[0,1,2,3,4,5...|     0|
|(125,[0,1,2,3,4,5...|     1|
|(125,[0,1,2,3,4,5...|     0|
|(125,[0,1,2,3,4,5...|     0|
|(125,[0,1,2,3,4,5...|     0|
+--------------------+------+
only showing top 5 rows



In [28]:
# Class sizes before balancing. Each count() runs a Spark job, so both are
# computed once and reused for the sampling fraction below.
negatives = df_train.filter(df_train.target == 0)
positives = df_train.filter(df_train.target == 1)
negative_count = negatives.count()
positive_count = positives.count()
print(negative_count)
print(positive_count)

# Fraction of negatives to keep so that both classes end up roughly the same
# size. It is capped at 1.0 because sampling without replacement cannot take
# more than everything, and it is 0.0 when there are no negatives at all
# (which also avoids a division by zero).
sample_fraction = min(1.0, positive_count / negative_count) if negative_count else 0.0
samle_count = sample_fraction  # original (misspelled) name, kept for compatibility

# A fixed seed makes the down-sampling reproducible across runs.
df_train = (
    positives
    .union(negatives.sample(withReplacement=False, fraction=sample_fraction, seed=42))
    .coalesce(10)
)

# Class sizes after balancing.
print(df_train.filter(df_train.target == 0).count())
print(df_train.filter(df_train.target == 1).count())

5021720
10904
11073
10904


In [30]:
# Preview the balanced training set.
df_train.show(5)

+--------------------+------+
|            features|target|
+--------------------+------+
|(125,[0,1,2,3,4,5...|     1|
|(125,[0,1,2,3,4,5...|     1|
|(125,[0,1,2,3,4,5...|     1|
|(125,[0,1,2,3,4,5...|     1|
|(125,[0,1,2,3,4,5...|     1|
+--------------------+------+
only showing top 5 rows



In [31]:
# Logistic regression on the assembled features; maxIter is set to 15.
lr = LogisticRegression(maxIter=15, labelCol="target", featuresCol='features')
lr_model = lr.fit(dataset=df_train)

In [32]:
# Start the test feature table from the raw test pairs; df_user_test itself is
# kept unchanged and is used again when predictions are re-attached.
df_test = df_user_test

In [33]:
# Step 1: attach the user's two genre profiles.
test_with_user_genres = df_test.join(
    df_user_genres, df_test.user_id == df_user_genres.user_id, 'inner')

# Step 2: attach the item's normalised genre vector.
test_with_genres = test_with_user_genres.join(
    df_items_genres, df_test.item_id == df_items_genres.item_id, 'inner')

# Keys plus the three genre features (there is no label on the test side).
# Both joins are inner, so pairs without a matching user or item drop out.
test_genre_stage_columns = [
    df_test.user_id,
    df_test.item_id,
    df_user_genres.genres_vector_all,
    df_user_genres.genres_vector_purchase,
    df_items_genres.genres_norm,
]

df_test = test_with_genres.select(*test_genre_stage_columns).coalesce(10).cache()

df_test.show(5)

+-------+-------+--------------------+----------------------+--------------------+
|user_id|item_id|   genres_vector_all|genres_vector_purchase|         genres_norm|
+-------+-------+--------------------+----------------------+--------------------+
| 754230| 102033|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[2,9],[0.7071...|
| 754230| 100303|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|      (41,[3],[1.0])|
| 754230|  79874|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[4,10],[0.707...|
| 754230| 101646|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[2,5,7,14,19,...|
| 754230|  93488|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,5,6,...|(41,[6,12,15],[0....|
+-------+-------+--------------------+----------------------+--------------------+
only showing top 5 rows



In [34]:
# Step 1: attach the per-user purchase rate.
test_with_user_rate = df_test.join(
    df_user_user_stat, df_user_user_stat.user_id == df_test.user_id, 'inner')

# Step 2: attach the per-item purchase rate.
test_with_rates = test_with_user_rate.join(
    df_user_item_stat, df_user_item_stat.item_id == df_test.item_id, 'inner')

# Everything carried over from the previous stage plus the two rates.
# Both joins are inner, so pairs without a matching user or item drop out.
test_rate_stage_columns = [
    df_test.user_id,
    df_test.item_id,
    df_test.genres_vector_all,
    df_test.genres_vector_purchase,
    df_test.genres_norm,
    df_user_user_stat.user_purchase_rate,
    df_user_item_stat.item_purchase_rate,
]

df_test = test_with_rates.select(*test_rate_stage_columns).coalesce(10).cache()

df_test.show(5)

+-------+-------+--------------------+----------------------+--------------------+--------------------+--------------------+
|user_id|item_id|   genres_vector_all|genres_vector_purchase|         genres_norm|  user_purchase_rate|  item_purchase_rate|
+-------+-------+--------------------+----------------------+--------------------+--------------------+--------------------+
| 761341|   8389|(41,[0,1,2,3,4,5,...|        (41,[0],[1.0])|(41,[2,5,7,23],[0...|3.875968992248062E-4|0.005979073243647235|
| 776188|   8389|(41,[0,1,2,3,4,5,...|  (41,[1,3,4,6,8,9,...|(41,[2,5,7,23],[0...|0.001152516327314637|0.005979073243647235|
| 846231|   8389|(41,[0,1,2,3,4,5,...|  (41,[1,2,3,4,6,8,...|(41,[2,5,7,23],[0...|0.001923816852635629|0.005979073243647235|
| 822709|   8389|(41,[0,1,2,3,4,5,...|  (41,[4,9],[1.0,1.0])|(41,[2,5,7,23],[0...|3.789314134141720...|0.005979073243647235|
| 824008|   8389|(41,[0,1,2,3,4,5,...|  (41,[1,4],[1.0,1.0])|(41,[2,5,7,23],[0...|3.821169277799006...|0.005979073243647235|


In [None]:
# Step 1: attach the user's normalised decade profile.
test_with_user_year = df_test.join(
    df_user_year, df_user_year.user_id == df_test.user_id, 'inner')

# Step 2: attach the item's normalised decade vector.
test_with_years = test_with_user_year.join(
    df_items_year, df_items_year.item_id == df_test.item_id, 'inner')

# Both sides call their vector `year_cat_norm`, so each one is aliased.
# NOTE(review): user_year_norm / item_year_norm are not among the
# VectorAssembler inputs used later in this notebook — confirm whether these
# joins are still needed.
test_year_stage_columns = [
    df_test.user_id,
    df_test.item_id,
    df_test.genres_vector_all,
    df_test.genres_vector_purchase,
    df_test.genres_norm,
    df_test.user_purchase_rate,
    df_test.item_purchase_rate,
    df_user_year.year_cat_norm.alias('user_year_norm'),
    df_items_year.year_cat_norm.alias('item_year_norm'),
]

df_test = test_with_years.select(*test_year_stage_columns).coalesce(10).cache()

df_test.show(5)

In [None]:
# Same input columns, in the same order, as the assembler used for training.
test_feature_columns = [
    "genres_vector_all",
    "genres_vector_purchase",
    "genres_norm",
    "user_purchase_rate",
    "item_purchase_rate",
]
assembler = VectorAssembler(inputCols=test_feature_columns, outputCol="features")

# The keys are kept so predictions can be matched back to the test pairs.
df_test = assembler.transform(df_test).select("user_id", "item_id", "features")
df_test.show(5)

In [44]:
# Score the test feature table with the fitted logistic regression model.
predictions = lr_model.transform(df_test)
# Disabled alternative; `gbtc_model` is not defined anywhere in this notebook:
# predictions = gbtc_model.transform(df_test)

In [45]:
# Number of scored rows per predicted value.
predictions.groupby('prediction').count().limit(10).toPandas()

Unnamed: 0,prediction,count
0,0.0,1851975
1,1.0,304865


In [46]:
# Keep a handle on the scored frame; `predictions` is rebound below.
scored = predictions
same_pair = (
    (df_user_test.user_id == scored.user_id)
    & (df_user_test.item_id == scored.item_id)
)

# Left-join from the complete test set so every requested pair is present;
# pairs with no scored row get purchase = 0.0.
predictions = (
    df_user_test
    .join(scored, same_pair, 'left')
    .select(
        df_user_test.user_id,
        df_user_test.item_id,
        scored.prediction.alias("purchase"),
    )
    .fillna(value=0.0, subset=["purchase"])
    .repartition(10)
    .cache()
)

# Print both row counts so they can be compared.
print(df_user_test.count())
print(predictions.count())
predictions.show(5)

2156840
2156840
+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
| 938008|  94183|     0.0|
| 928787|  99730|     0.0|
| 801678|  73229|     0.0|
| 878233|  78285|     0.0|
| 894258|  98731|     0.0|
+-------+-------+--------+
only showing top 5 rows



In [47]:
# Order the rows by user and item, then collect them into a pandas frame.
ordered_predictions = predictions.orderBy("user_id", "item_id")
df = ordered_predictions.toPandas()

# Written with pandas defaults, so the index is saved as the first column.
df.to_csv('lab03.csv')

In [48]:
# Stop the Spark session now that the results have been written.
spark.stop()