In [2]:
import os
import sys
# Cluster-specific settings: executor Python interpreter, Spark 2 client location and submit arguments.
spark_home = '/usr/hdp/current/spark2-client'
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": spark_home,
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

# Put the py4j archive first and the PySpark sources second on the import path.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f

In [None]:
from pyspark.ml import Pipeline, Transformer, Estimator

In [None]:
from pyspark.sql.window import Window

In [54]:
from pyspark.ml.feature import VectorAssembler, CountVectorizer

In [55]:
from pyspark.ml.classification import GBTClassifier, LogisticRegression, MultilayerPerceptronClassificationModel

In [4]:
# Create (or reuse) the Spark session for this lab from a default SparkConf.
spark_config = SparkConf()
spark = (
    SparkSession.builder
    .config(conf=spark_config)
    .appName("rysin_lab03")
    .getOrCreate()
)

In [5]:
# List the contents of the lab's HDFS directory.
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


In [8]:
# HDFS locations of the four lab inputs: train pairs, test pairs, item catalogue, viewing log.
path_to_train, path_to_test, path_to_items, path_to_watches = (
    '/labs/slaba03/laba03_train.csv',
    '/labs/slaba03/laba03_test.csv',
    '/labs/slaba03/laba03_items.csv',
    '/labs/slaba03/laba03_views_programmes.csv',
)

In [10]:
# Load the labelled (user_id, item_id, purchase) pairs with an explicit integer schema;
# the first line of the CSV is treated as a header.
train = (
    spark.read
    .format('csv')
    .schema(
        StructType()
        .add('user_id', IntegerType())
        .add('item_id', IntegerType())
        .add('purchase', IntegerType(), nullable=True)
    )
    .option("header", "true")
    .load(path_to_train)
)

In [11]:
# Preview the first 5 rows of the training DataFrame.
train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [12]:
# Load the pairs to be scored; same explicit schema as the training file,
# first CSV line treated as a header.
test = (
    spark.read
    .format('csv')
    .schema(
        StructType()
        .add('user_id', IntegerType())
        .add('item_id', IntegerType())
        .add('purchase', IntegerType(), nullable=True)
    )
    .option("header", "true")
    .load(path_to_test)
)

In [13]:
# Preview the first 5 rows of the test DataFrame.
test.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
|   1654|  95099|    null|
|   1654|  11265|    null|
+-------+-------+--------+
only showing top 5 rows



In [15]:
# Read the tab-delimited item catalogue with an explicit schema, then keep only the
# columns used as item features and replace missing year/genres with sentinel values.
items = (
    spark.read
    .format('csv')
    .schema(
        StructType()
        .add('item_id', IntegerType())
        .add('channel_id', IntegerType())
        .add('datetime_availability_start', StringType())
        .add('datetime_availability_stop', StringType())
        .add('datetime_show_start', StringType())
        .add('datetime_show_stop', StringType())
        .add('content_type', IntegerType())
        .add('title', StringType(), nullable=True)
        .add('year', FloatType(), nullable=True)
        .add('genres', StringType())
        .add('region_id', IntegerType())
    )
    .option("header", "true")
    .option("delimiter", "\\t")
    .load(path_to_items)
    .select(['item_id', 'content_type', 'year', 'genres'])
    .na.fill({'year': -999, 'genres': 'unknown'})
)

In [15]:
# Show the first 10 item rows without truncating long cell values.
items.show(10, truncate=False)

+-------+------------+------+------------------------------------------+
|item_id|content_type|year  |genres                                    |
+-------+------------+------+------------------------------------------+
|65667  |1           |2013.0|Эротика                                   |
|65669  |1           |2011.0|Эротика                                   |
|65668  |1           |2011.0|Эротика                                   |
|65671  |1           |2011.0|Эротика                                   |
|65670  |1           |2010.0|Эротика                                   |
|65809  |1           |2016.0|Комедии                                   |
|65810  |1           |2016.0|Комедии,Мелодрамы                         |
|326    |1           |2012.0|Ужасы,Триллеры,Драмы,Фантастика,Зарубежные|
|336    |1           |2012.0|Ужасы,Комедии,Фантастика,Зарубежные       |
|357    |1           |2012.0|Комедии,Мелодрамы,Наши                    |
+-------+------------+------+----------------------

In [16]:
# Load the viewing log (one row per user/item viewing interval) with an explicit schema;
# the first CSV line is treated as a header.
watches = (
    spark.read
    .format('csv')
    .schema(
        StructType()
        .add('user_id', IntegerType())
        .add('item_id', IntegerType())
        .add('ts_start', IntegerType())
        .add('ts_end', IntegerType())
        .add('item_type', StringType())
    )
    .option("header", "true")
    .load(path_to_watches)
)

In [17]:
# Preview the first 5 rows of the viewing log.
watches.show(5)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
+-------+-------+----------+----------+---------+
only showing top 5 rows



### EDA

In [17]:
# Number of distinct users, printed in this order: train set, test set, viewing log.
for _frame in (train, test, watches):
    print(_frame.select('user_id').distinct().count())

1941
1941
79385


In [18]:
# Number of distinct items, printed in this order: train set, test set, item catalogue.
for _frame in (train, test, items):
    print(_frame.select('item_id').distinct().count())

3704
3704
635568


In [19]:
# баланс классов
# Class balance of the label: number of training rows per `purchase` value.
# Uses `train`, the DataFrame loaded above (the notebook never defines `train_sdf`).
train.groupBy("purchase").count().collect()

[Row(purchase=0, count=5021720), Row(purchase=1, count=10904)]

### Фичи, связанные с числом просмотров пользователей и айтемов

In [47]:
class TargetEncoder(Estimator):
    """Target-statistics encoder for (user_id, item_id, purchase) data.

    ``fit`` computes, per ``user_id`` and per ``item_id``, the mean and the sum
    of the ``purchase`` column. ``transform`` joins those aggregates onto any
    DataFrame that has ``user_id`` and ``item_id`` columns. The object acts as
    its own fitted model (``fit`` returns ``self``), which lets it be used as a
    ``Pipeline`` stage.
    """

    # Columns added by transform(); nulls in them (ids not seen during fit) become 0.
    _ENCODED_COLUMNS = ['item_purchase_mean', 'item_purchase_count',
                        'user_purchase_mean', 'user_purchase_count']

    def __init__(self):
        # Initialise through this class's real base (Estimator), not Transformer.
        super(TargetEncoder, self).__init__()
        self.item_mean_purchase = None
        self.user_mean_purchase = None

    def fit(self, X, params=None):
        """Aggregate purchase statistics from a labelled DataFrame.

        :param X: DataFrame with ``user_id``, ``item_id`` and ``purchase`` columns.
        :param params: accepted only for compatibility with ``Estimator.fit``; ignored.
        :return: ``self``, holding the per-user and per-item aggregate DataFrames.
        """
        self.user_mean_purchase = (
            X.select(['user_id', 'purchase'])
             .groupBy('user_id')
             .agg(f.mean('purchase').alias('user_purchase_mean'),
                  # "count" here is the sum of the purchase labels per user
                  f.sum('purchase').alias('user_purchase_count'))
        )
        self.item_mean_purchase = (
            X.select(['item_id', 'purchase'])
             .groupBy('item_id')
             .agg(f.mean('purchase').alias('item_purchase_mean'),
                  # "count" here is the sum of the purchase labels per item
                  f.sum('purchase').alias('item_purchase_count'))
        )
        return self

    def transform(self, X, params=None):
        """Join the fitted per-item and per-user statistics onto ``X``.

        Left joins keep every input row; rows whose ``item_id`` or ``user_id``
        was not present during ``fit`` get 0 in the corresponding statistics
        instead of being dropped.

        :param X: DataFrame with ``user_id`` and ``item_id`` columns.
        :param params: accepted only for compatibility with ``Transformer.transform``; ignored.
        :return: ``X`` with the four statistic columns appended.
        :raises RuntimeError: if called before ``fit``.
        """
        if self.item_mean_purchase is None or self.user_mean_purchase is None:
            raise RuntimeError("TargetEncoder.transform() called before fit()")
        X = X.join(self.item_mean_purchase, on='item_id', how='left')
        X = X.join(self.user_mean_purchase, on='user_id', how='left')
        return X.na.fill(0, subset=self._ENCODED_COLUMNS)

In [66]:
# Fit the target-statistics stage on the labelled pairs, then apply the fitted
# pipeline to both the train and the test pairs.
target_features = Pipeline(stages=[TargetEncoder()])
target_features_model = target_features.fit(train)

X_train, X_test = (
    target_features_model.transform(pairs) for pairs in (train, test)
)

In [67]:
# Print the column names and types of the training frame after target encoding.
X_train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- item_purchase_mean: double (nullable = true)
 |-- item_purchase_count: long (nullable = true)
 |-- user_purchase_mean: double (nullable = true)
 |-- user_purchase_count: long (nullable = true)



### Фичи айтемов

In [68]:
# Attach the item catalogue columns to both frames (left join keeps every pair) and
# split the comma-separated `genres` string into the `genres_array` column.
X_train, X_test = (
    pairs.join(items, on='item_id', how='left')
         .withColumn("genres_array", f.split(f.col("genres"), ','))
    for pairs in (X_train, X_test)
)

In [69]:
# Count-vectorizer over genre tokens: reads `genres_array`, writes `genres_vector`.
cntv = CountVectorizer(inputCol="genres_array", 
                       outputCol="genres_vector")

In [70]:
# Fit on the training frame. Note: `cntv` is rebound from the estimator to the fitted
# model, so this cell cannot be re-run without re-running the previous one.
cntv = cntv.fit(X_train)

In [71]:
# Add the `genres_vector` column to the training frame.
X_train = cntv.transform(X_train)

In [72]:
# Add the `genres_vector` column to the test frame, using the model fitted on train.
X_test = cntv.transform(X_test)

### Просмотры передач

In [73]:
# Per-user viewing aggregates: total and mean of (ts_end - ts_start) over the viewing log.
# Nulls in numeric columns are replaced with 0 before aggregating.
user_watches = (
    watches
    .withColumn('watched_time', f.col('ts_end') - f.col('ts_start'))
    .na.fill(0)
    .groupBy(['user_id'])
    .agg(
        f.sum('watched_time').alias('sum_watched'),
        f.mean('watched_time').alias('mean_watched'),
    )
)

In [74]:
# Attach the per-user viewing aggregates (left join keeps users with no viewing rows).
# NOTE: `na.fill(0)` with a numeric value replaces nulls in every numeric column of the
# frame, not only in the two columns joined here.
X_train = X_train.join(user_watches, on='user_id', how='left').na.fill(0)

In [75]:
# Same join for the test frame.
# NOTE: `na.fill(0)` with a numeric value replaces nulls in every numeric column of the
# frame, not only in the two columns joined here.
X_test = X_test.join(user_watches, on='user_id', how='left').na.fill(0)

### ALS Embeddings

In [76]:
from pyspark.ml.recommendation import ALS

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf


# Spark UDF that converts an array column into an ML dense-vector column.
list_to_vector_udf = udf(lambda values: Vectors.dense(values), VectorUDT())

In [77]:
class UserItemEmbeddings(Estimator):
    """Learns ALS latent factors and exposes them as user/item embedding columns.

    ``fit`` trains ``pyspark.ml.recommendation.ALS`` on ``user_id`` / ``item_id`` /
    ``purchase`` and stores the user and item factor tables, converted to dense
    vectors. ``transform`` left-joins both tables onto a DataFrame. The object
    acts as its own fitted model (``fit`` returns ``self``).

    NOTE(review): the assembled feature vectors printed in this notebook show the
    trailing 40 slots as +/-0.0, which suggests the factors learned with the default
    explicit-feedback setting carry little signal; ``implicit_prefs=True`` may be
    a better fit for a 0/1 purchase label -- confirm experimentally.
    """

    def __init__(self, rank=20, max_iter=20, implicit_prefs=False, seed=None):
        """
        :param rank: number of latent factors (length of each embedding vector).
        :param max_iter: number of ALS iterations.
        :param implicit_prefs: passed to ALS as ``implicitPrefs``.
        :param seed: optional ALS seed; ``None`` leaves the ALS default in place.
        """
        # Initialise through this class's real base (Estimator), not Transformer.
        super(UserItemEmbeddings, self).__init__()
        self.rank = rank
        self.max_iter = max_iter
        self.implicit_prefs = implicit_prefs
        self.seed = seed
        self.user_emb = None
        self.item_emb = None

    def fit(self, user_item_intercations, params=None):
        """Train ALS and keep the resulting factor tables.

        :param user_item_intercations: DataFrame with ``user_id``, ``item_id`` and
            ``purchase`` columns.
        :param params: accepted only for compatibility with ``Estimator.fit``; ignored.
        :return: ``self``.
        """
        als = ALS(maxIter=self.max_iter, rank=self.rank,
                  userCol='user_id', itemCol='item_id', ratingCol='purchase',
                  implicitPrefs=self.implicit_prefs, seed=self.seed)
        als_fitted = als.fit(user_item_intercations)
        # Factor tables come as (id, features); rename the key and convert the
        # array column to a vector so it can be fed to VectorAssembler.
        self.user_emb = als_fitted.userFactors.select(
            f.col('id').alias('user_id'),
            list_to_vector_udf(f.col('features')).alias('user_embeddings'),
        )
        self.item_emb = als_fitted.itemFactors.select(
            f.col('id').alias('item_id'),
            list_to_vector_udf(f.col('features')).alias('item_embeddings'),
        )
        return self

    def transform(self, X, params=None):
        """Left-join the learned user and item embeddings onto ``X``.

        Rows whose id has no learned factor get a null embedding.

        :param X: DataFrame with ``user_id`` and ``item_id`` columns.
        :param params: accepted only for compatibility with ``Transformer.transform``; ignored.
        :return: ``X`` with ``user_embeddings`` and ``item_embeddings`` columns.
        :raises RuntimeError: if called before ``fit``.
        """
        if self.user_emb is None or self.item_emb is None:
            raise RuntimeError("UserItemEmbeddings.transform() called before fit()")
        X = X.join(self.user_emb, on='user_id', how='left')
        X = X.join(self.item_emb, on='item_id', how='left')

        return X

In [78]:
# Wrap the ALS embedding stage in a single-stage Pipeline.
emb_features_pippeline = Pipeline(stages=[UserItemEmbeddings()])

In [None]:
# Fit the embedding pipeline on the raw labelled pairs in `train`.
emb_features = emb_features_pippeline.fit(train)

In [None]:
# Append the user and item embedding columns to the training frame.
X_train = emb_features.transform(X_train)

In [None]:
# Append the user and item embedding columns to the test frame.
X_test = emb_features.transform(X_test)

In [37]:
# Inspect 5 rows of the training frame, one field per line, without truncation.
X_train.show(5, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 item_id         | 8389                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
 user_id         | 746713     

In [None]:
# Print the column names and types of the training frame with all features attached.
X_train.printSchema()

### Обучение и скоринг модели

In [39]:
# Columns fed to the model, grouped by where they come from.
features_list = (
    ['user_id', 'item_id']                              # raw identifiers
    + ['year', 'genres_vector', 'content_type']         # item catalogue
    + ['user_purchase_mean', 'user_purchase_count']     # purchase stats per user
    + ['item_purchase_mean', 'item_purchase_count']     # purchase stats per item
    + ['sum_watched', 'mean_watched']                   # viewing-log aggregates
    + ['user_embeddings', 'item_embeddings']            # ALS factors
)

In [None]:
# Assemble the columns named in `features_list` into a single `features` vector column.
features_assembler = VectorAssembler(inputCols=features_list, 
                                     outputCol="features")

In [44]:
# Sanity check: fetch the first 5 assembled feature vectors.
features_assembler.transform(X_train).select('features').take(5)

[Row(features=SparseVector(138, {0: 517612.0, 1: 326.0, 2: 2012.0, 3: 1.0, 4: 1.0, 6: 1.0, 13: 1.0, 15: 1.0, 87: 1.0, 88: 0.0004, 89: 1.0, 90: 0.0007, 91: 1.0, 92: 287400.0, 93: 3314.8853, 94: 246971.0, 95: 3741.9848, 96: 40429.0, 97: 2887.7857, 98: -0.0, 99: -0.0, 100: -0.0, 101: 0.0, 102: 0.0, 103: -0.0, 104: 0.0, 105: -0.0, 106: -0.0, 107: -0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: -0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: -0.0, 117: 0.0, 118: -0.0, 119: -0.0, 120: -0.0, 121: 0.0, 122: 0.0, 123: -0.0, 124: 0.0, 125: -0.0, 126: -0.0, 127: -0.0, 128: 0.0, 129: 0.0, 130: 0.0, 131: -0.0, 132: 0.0, 133: 0.0, 134: 0.0, 135: 0.0, 136: -0.0, 137: 0.0})),
 Row(features=SparseVector(138, {0: 522798.0, 1: 326.0, 2: 2012.0, 3: 1.0, 4: 1.0, 6: 1.0, 13: 1.0, 15: 1.0, 87: 1.0, 88: 0.0012, 89: 3.0, 90: 0.0007, 91: 1.0, 92: 6229271.0, 93: 3254.6371, 94: 6036928.0, 95: 3837.8436, 96: 192343.0, 97: 2671.4306, 98: -0.0, 99: -0.0, 100: -0.0, 101: 0.0, 102: 0.0, 103: -0.0, 104: 0.0, 105: -0.0, 106

In [None]:
# Gradient-boosted trees classifier reading `features` and predicting `purchase`;
# fixed seed, 50 boosting iterations, trees up to depth 7.
gbt = GBTClassifier(featuresCol="features", 
                    labelCol="purchase", 
                    seed=1234,
                    maxIter=50, 
                    maxDepth=7)

In [None]:
# Chain feature assembly and the classifier into one pipeline.
model_pipeline = Pipeline(stages=[features_assembler,
                                  gbt
                                 ])
  
# Train the whole pipeline on the prepared training frame.
model = model_pipeline.fit(X_train)

In [None]:
# Apply the fitted pipeline to the prepared test frame.
predictions = model.transform(X_test)

### Запись скоров в файл

In [None]:
# Collect ids and probability vectors to the driver as a pandas DataFrame,
# ordered by (user_id, item_id).
answer_df = (
    predictions
    .select(['user_id', 'item_id', 'probability'])
    .orderBy(['user_id', 'item_id'])
    .toPandas()
)

In [None]:
# Keep element 1 of each probability vector as the `purchase` score
# (presumably the probability of class 1 -- confirm against the model's label order).
answer_df['purchase'] = answer_df['probability'].apply(lambda x: x[1])

In [None]:
# Drop the raw probability vector column.
answer_df = answer_df.drop('probability', axis=1)

In [None]:
# Write the scores to a CSV in the working directory
# (with pandas defaults the row index is written as the first column).
answer_df.to_csv('lab03.csv')

### Тушим контекст

In [None]:
# Stop the Spark session.
spark.stop()