In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 5g pyspark-shell'
#os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.5
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
import pandas as pd
import numpy as np

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

# Исследуем данные

In [4]:
# USERS PURCHASES

schema = StructType([StructField("user_id", IntegerType()),
                    StructField("item_id", IntegerType()),
                    StructField("purchase", DoubleType())])

df_train = spark.read.csv('/labs/laba03/lab10_train.csv', header= True, schema=schema)
test = spark.read.csv('/labs/laba03/lab10_test.csv', header= True, schema=schema)

df_views = spark.read.csv('/labs/laba03/lab10_views_programmes.csv', header= True)
df_items = spark.read.csv('/labs/laba03/lab10_items.csv', header= True, sep= '\t')

In [5]:
print('Num partitions ',df_train.rdd.getNumPartitions())
print('Row count ', df_train.count())
df_train.show(5)

Num partitions  5
Row count  5032624
+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|     0.0|
|   1654|  89249|     0.0|
|   1654|  99982|     0.0|
|   1654|  89901|     0.0|
|   1654| 100504|     0.0|
+-------+-------+--------+
only showing top 5 rows



In [9]:
print('Num partitions ', test.rdd.getNumPartitions())
print('Row count ', test.count())
test.show(2)

Num partitions  5
Row count  2156840
+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
+-------+-------+--------+
only showing top 2 rows



In [10]:
print('Num partitions ', df_views.rdd.getNumPartitions())
print('Row count ', df_views.count())
df_views.show(2)

Num partitions  7
Row count  20845607
+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
+-------+-------+----------+----------+---------+
only showing top 2 rows



In [11]:
print('Num partitions ', df_items.rdd.getNumPartitions())
print('Row count ', df_items.count())
df_items.show(1, vertical=True, truncate=False)

Num partitions  5
Row count  635568
-RECORD 0--------------------------------------------------------------------
 item_id                     | 65667                                         
 channel_id                  | null                                          
 datetime_availability_start | 1970-01-01T00:00:00Z                          
 datetime_availability_stop  | 2018-01-01T00:00:00Z                          
 datetime_show_start         | null                                          
 datetime_show_stop          | null                                          
 content_type                | 1                                             
 title                       | на пробах только девушки (all girl auditions) 
 year                        | 2013.0                                        
 genres                      | Эротика                                       
 region_id                   | null                                          
only showing top 1 row



In [12]:
spark.conf.set("spark.sql.shuffle.partitions", 200)

# Посмотрим какое кол-во человек из обучающей выборки делали покупки

In [13]:
df_train.select('user_id').distinct().count()

1941

In [14]:
df_train[df_train['purchase'] == 1].select('user_id').distinct().count()

1675

### Посмотрим покупки одного пользователя

In [15]:
df_train[(df_train['user_id'] == 1654) & 
         (df_train['purchase'] == 1)].show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|   9897|     1.0|
|   1654|   7394|     1.0|
|   1654|   9064|     1.0|
|   1654|  73216|     1.0|
|   1654|  88816|     1.0|
+-------+-------+--------+



In [16]:
df_items[df_items['item_id'].isin([9897, 7394, 9064, 73216, 88816])]\
        .show(5, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------
 item_id                     | 7394                                          
 channel_id                  | null                                          
 datetime_availability_start | 1970-01-01T00:00:00Z                          
 datetime_availability_stop  | 2099-12-31T21:00:00Z                          
 datetime_show_start         | null                                          
 datetime_show_stop          | null                                          
 content_type                | 1                                             
 title                       | лиса и заяц                                   
 year                        | 1973.0                                        
 genres                      | Мультфильмы,Союзмультфильм,Наши               
 region_id                   | null                                          
-RECORD 1-------------------------------------------------------

# Посмотрим, как распределены между собой метки классов

In [17]:
df_train.groupBy("purchase").count().collect()

[Row(purchase=0.0, count=5021720), Row(purchase=1.0, count=10904)]

## Train-Validation Split

In [18]:
train = df_train.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=5757)

valid = df_train.join(train, on=["user_id", "item_id"], how="leftanti")

test = spark.read.csv('/labs/laba03/lab10_test.csv', header= True, schema=schema)

In [19]:
train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|     0.0|
|   1654|  89249|     0.0|
|   1654|  99982|     0.0|
|   1654|  89901|     0.0|
|   1654|  84350|     0.0|
+-------+-------+--------+
only showing top 5 rows



In [20]:
train.count()

4026087

In [21]:
valid.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|   7679|     0.0|
|   1654|  11060|     0.0|
|   1654|  67318|     0.0|
|   1654|  72891|     0.0|
|   1654|  87940|     0.0|
+-------+-------+--------+
only showing top 5 rows



# Создадим фичи, которые отражают склонность к покупке пользователя и "покупаемость"  item'а

In [22]:
from pyspark.sql.functions import col

train_purchases = train.groupBy('user_id')\
                .sum().select(col("sum(purchase)").alias("user_purchases"), col("user_id")).cache()

train_purchases.show(2)

+--------------+-------+
|user_purchases|user_id|
+--------------+-------+
|           1.0| 875232|
|           3.0| 878742|
+--------------+-------+
only showing top 2 rows



In [23]:
item_purchases = train.groupBy('item_id')\
                        .sum().select(col("sum(purchase)").alias("item_purchases"), col("item_id")).cache()

item_purchases.show(2)

+--------------+-------+
|item_purchases|item_id|
+--------------+-------+
|           1.0| 100483|
|           1.0|  88930|
+--------------+-------+
only showing top 2 rows



In [24]:
# Сколько было покупок у пользователя и сколько раз покупали item?
train = train.join(train_purchases, on='user_id', how='left')
valid = valid.join(train_purchases, on='user_id', how='left')
test = test.join(train_purchases, on='user_id', how='left')

train = train.join(item_purchases, on='item_id', how='left')
valid = valid.join(item_purchases, on='item_id', how='left')
test = test.join(item_purchases, on='item_id', how='left')

In [25]:
train_user_attempts = train.groupBy('user_id').count().select(col("count").alias("user_attempts"), col("user_id"))\
                            .cache()

train_item_attempts = train.groupBy('item_id').count().select(col("count").alias("item_attempts"), col("item_id"))\
                            .cache()

train_user_attempts.show(2)

+-------------+-------+
|user_attempts|user_id|
+-------------+-------+
|         2128| 875232|
|         2030| 878742|
+-------------+-------+
only showing top 2 rows



In [26]:
train_item_attempts.show(2)

+-------------+-------+
|item_attempts|item_id|
+-------------+-------+
|         1074| 100191|
|         1079|  81863|
+-------------+-------+
only showing top 2 rows



In [27]:
train = train.join(train_user_attempts, on='user_id', how='left')
valid = valid.join(train_user_attempts, on='user_id', how='left')
test = test.join(train_user_attempts, on='user_id', how='left')

train = train.join(train_item_attempts, on='item_id', how='left')
valid = valid.join(train_item_attempts, on='item_id', how='left')
test = test.join(train_item_attempts, on='item_id', how='left')

In [28]:
test.show(2)

+-------+-------+--------+--------------+--------------+-------------+-------------+
|item_id|user_id|purchase|user_purchases|item_purchases|user_attempts|item_attempts|
+-------+-------+--------+--------------+--------------+-------------+-------------+
|  94814|   1654|    null|           4.0|           1.0|         2014|         1096|
|  93629|   1654|    null|           4.0|           4.0|         2014|         1098|
+-------+-------+--------+--------------+--------------+-------------+-------------+
only showing top 2 rows



In [29]:
train = train.withColumn('user_addict', col('user_purchases') / col('user_attempts'))
valid = valid.withColumn('user_addict', col('user_purchases') / col('user_attempts'))
test = test.withColumn('user_addict', col('user_purchases') / col('user_attempts'))

In [30]:
train = train.withColumn('item_addict', col('item_purchases') / col('item_attempts'))
valid = valid.withColumn('item_addict', col('item_purchases') / col('item_attempts'))
test = test.withColumn('item_addict', col('item_purchases') / col('item_attempts'))

In [31]:
train.show(2)

+-------+-------+--------+--------------+--------------+-------------+-------------+--------------------+--------------------+
|item_id|user_id|purchase|user_purchases|item_purchases|user_attempts|item_attempts|         user_addict|         item_addict|
+-------+-------+--------+--------------+--------------+-------------+-------------+--------------------+--------------------+
|  74107|   1654|     0.0|           4.0|           1.0|         2014|         1073|0.001986097318768...|9.319664492078285E-4|
|  89249|   1654|     0.0|           4.0|           1.0|         2014|         1087|0.001986097318768...|9.199632014719411E-4|
+-------+-------+--------+--------------+--------------+-------------+-------------+--------------------+--------------------+
only showing top 2 rows



In [32]:
# На всякий случай заполняем пропуски
test = test.na.fill(0)
train = train.na.fill(0)
valid = valid.na.fill(0)

In [33]:
train_purchases.unpersist()
item_purchases.unpersist()
train_user_attempts.unpersist()
train_item_attempts.unpersist()

DataFrame[item_attempts: bigint, item_id: int]

In [35]:
from pyspark.ml.feature import VectorAssembler
# Выбираю колонки, которые войдут в features для GBT
cols = ['item_purchases', 'user_purchases', 'user_addict', 'item_addict']
assembler = VectorAssembler(inputCols=cols, outputCol="features")

train_data = assembler.transform(train).cache()
valid_data = assembler.transform(valid)
test_data = assembler.transform(test)

In [40]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol="purchase")

pipeline = Pipeline(stages=[
    gbt
])

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')
# score = evaluator.evaluate(predictions_valid)
# score

In [41]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [42]:
paramGrid = ParamGridBuilder().addGrid(gbt.maxDepth, [3, 4])\
                              .addGrid(gbt.minInstancesPerNode, [2, 3])\
                              .addGrid(gbt.maxBins, [50, 55])\
                              .build()

In [43]:
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=3, parallelism=3)

In [45]:
cv_model = crossval.fit(train_data)

In [46]:
cv_model.avgMetrics

[0.9192005117571451,
 0.9180166903128033,
 0.9192005117571451,
 0.9180166903128033,
 0.9335605971624095,
 0.932569587591231,
 0.9335605971624097,
 0.9324850247708811]

In [57]:
cv_model.getEstimatorParamMaps()[np.argmax(cv_model.avgMetrics)]

{Param(parent='GBTClassifier_577b570e32d7', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 4,
 Param(parent='GBTClassifier_577b570e32d7', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 3,
 Param(parent='GBTClassifier_577b570e32d7', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 50}

In [50]:
predictions_valid = cv_model.transform(valid_data)

In [51]:
evaluator.evaluate(predictions_valid)

0.8863325492598007

In [58]:
train_data.unpersist()

DataFrame[item_id: int, user_id: int, purchase: double, user_purchases: double, item_purchases: double, user_attempts: bigint, item_attempts: bigint, user_addict: double, item_addict: double, features: vector]

In [59]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol="purchase", maxDepth=4, minInstancesPerNode=3, maxBins=50)

gbt_model = gbt.fit(train_data)
predictions_valid = gbt_model.transform(valid_data)

In [60]:
gbt_model.featureImportances

SparseVector(4, {0: 0.3831, 1: 0.3993, 2: 0.0922, 3: 0.1253})

In [61]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')
score = evaluator.evaluate(predictions_valid)
score

0.8865449000585436

## Создадим фичу, используя жанры

In [62]:
df_items.show(vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------
 item_id                     | 65667                                                                                  
 channel_id                  | null                                                                                   
 datetime_availability_start | 1970-01-01T00:00:00Z                                                                   
 datetime_availability_stop  | 2018-01-01T00:00:00Z                                                                   
 datetime_show_start         | null                                                                                   
 datetime_show_stop          | null                                                                                   
 content_type                | 1                                                                                      
 title                       | на пробах только 

In [63]:
from pyspark.ml.feature import RegexTokenizer

items_genres_years = df_items.select('item_id', 'genres', 'year')
items_genres_years = items_genres_years.na.fill({'genres': u'_'})
items_genres_years = items_genres_years.na.fill({'year': u'1899'})
items_genres_years = items_genres_years.withColumn('year', items_genres_years.year.cast(IntegerType()))

tokenizer = RegexTokenizer(inputCol="genres", outputCol="genre_tokens", gaps=False, pattern=u"[_А-Яа-яёЁ]+", toLowercase=False)

items_genres_years_tk = tokenizer.transform(items_genres_years)

In [64]:
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="genre_tokens", outputCol="genre_vector")
cv_model = cv.fit(items_genres_years_tk)
items_features_vec = cv_model.transform(items_genres_years_tk)

In [65]:
items_features_vec.show(5)

+-------+-------+----+------------+---------------+
|item_id| genres|year|genre_tokens|   genre_vector|
+-------+-------+----+------------+---------------+
|  65667|Эротика|2013|   [Эротика]|(96,[22],[1.0])|
|  65669|Эротика|2011|   [Эротика]|(96,[22],[1.0])|
|  65668|Эротика|2011|   [Эротика]|(96,[22],[1.0])|
|  65671|Эротика|2011|   [Эротика]|(96,[22],[1.0])|
|  65670|Эротика|2010|   [Эротика]|(96,[22],[1.0])|
+-------+-------+----+------------+---------------+
only showing top 5 rows



In [66]:
train_genres = train.join(items_features_vec, on='item_id', how='left')
valid_genres = valid.join(items_features_vec, on='item_id', how='left')
test_genres = test.join(items_features_vec, on='item_id', how='left')

In [67]:
# Выбираю колонки, которые войдут в features для GBT
cols = ['item_purchases', 'user_purchases', 'user_addict', 'item_addict', 'year', 'genre_vector']
assembler = VectorAssembler(inputCols=cols, outputCol="features")

train_data = assembler.transform(train_genres).cache()
valid_data = assembler.transform(valid_genres)
test_data = assembler.transform(test_genres)

In [68]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol="purchase", maxDepth=4, minInstancesPerNode=3, maxBins=50)

gbt_model = gbt.fit(train_data)
predictions_valid = gbt_model.transform(valid_data)

In [69]:
gbt_model.featureImportances

SparseVector(101, {0: 0.3567, 1: 0.3587, 2: 0.111, 3: 0.1339, 5: 0.0011, 12: 0.0013, 21: 0.0072, 27: 0.0035, 41: 0.0004, 49: 0.0189, 61: 0.0049, 75: 0.0024})

In [70]:
evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')
score = evaluator.evaluate(predictions_valid)
score

0.8898306281434082

In [71]:
train_data.unpersist()

DataFrame[item_id: int, user_id: int, purchase: double, user_purchases: double, item_purchases: double, user_attempts: bigint, item_attempts: bigint, user_addict: double, item_addict: double, genres: string, year: int, genre_tokens: array<string>, genre_vector: vector, features: vector]

# Добавим вектор пользовательской истории

In [72]:
# Добавим вектор пользовательской истории
from pyspark.sql.functions import monotonically_increasing_id

items_count = train.groupBy('item_id').count().withColumnRenamed('count', 'item_count')
items_desc_count = items_count.orderBy(items_count.item_count.desc()).limit(500) 
items_desc_count = items_desc_count.coalesce(1)
items_desc_count = items_desc_count.withColumn("item_row_id", monotonically_increasing_id())

items_desc_count.cache()

DataFrame[item_id: int, item_count: bigint, item_row_id: bigint]

In [73]:
items_desc_count.show(2)

+-------+----------+-----------+
|item_id|item_count|item_row_id|
+-------+----------+-----------+
|  66185|      1172|          0|
|  94158|      1172|          1|
+-------+----------+-----------+
only showing top 2 rows



In [74]:
train_truncated = train.join(items_desc_count, on='item_id', how='inner')\
                    .select('user_id', 'item_row_id', 'purchase').cache()

train_truncated.show(2)

+-------+-----------+--------+
|user_id|item_row_id|purchase|
+-------+-----------+--------+
| 793430|        266|     0.0|
| 795620|        266|     0.0|
+-------+-----------+--------+
only showing top 2 rows



In [75]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.ml.linalg import VectorUDT

from pyspark.sql.functions import udf

In [76]:
# Now we build up a history vector

as_ml = udf(lambda v: v.asML(), VectorUDT())

train_matrix = train_truncated.rdd.map(lambda r: MatrixEntry(r[0], r[1], r[2]))
train_matrix = CoordinateMatrix(train_matrix)

train_row_mat_i = train_matrix.toIndexedRowMatrix()

train_mat_df = train_row_mat_i.rows.toDF().withColumnRenamed('index', 'user_id') \
                                    .withColumn("history_vec", as_ml("vector"))    

In [77]:
train_mat_df.show(10)

+-------+--------------------+--------------------+
|user_id|              vector|         history_vec|
+-------+--------------------+--------------------+
| 918750|(500,[3,6,7,8,9,1...|(500,[3,6,7,8,9,1...|
| 922400|(500,[2,3,4,5,7,8...|(500,[2,3,4,5,7,8...|
| 932650|(500,[1,2,3,4,5,8...|(500,[1,2,3,4,5,8...|
| 892100|(500,[0,1,2,3,5,7...|(500,[0,1,2,3,5,7...|
| 940600|(500,[1,3,9,11,15...|(500,[1,3,9,11,15...|
| 891250|(500,[0,1,2,3,4,5...|(500,[0,1,2,3,4,5...|
| 857500|(500,[0,3,4,5,6,8...|(500,[0,3,4,5,6,8...|
| 836300|(500,[0,1,2,3,7,9...|(500,[0,1,2,3,7,9...|
| 906250|(500,[0,2,4,8,12,...|(500,[0,2,4,8,12,...|
| 893850|(500,[3,4,5,6,7,8...|(500,[3,4,5,6,7,8...|
+-------+--------------------+--------------------+
only showing top 10 rows



In [78]:
train_truncated.unpersist()

DataFrame[user_id: int, item_row_id: bigint, purchase: double]

In [79]:
# Joining everything together

train_hist = train.join(train_mat_df, 'user_id', 'left') \
                    .join(items_features_vec, on='item_id', how='left') 

valid_hist = valid.join(train_mat_df, 'user_id', 'left') \
                    .join(items_features_vec, on='item_id', how='left') 

test_hist = test.join(train_mat_df, 'user_id', 'left') \
                    .join(items_features_vec, on='item_id', how='left')         

In [80]:
train_hist.show(2)

+-------+-------+--------+--------------+--------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+--------------------+--------------------+
|item_id|user_id|purchase|user_purchases|item_purchases|user_attempts|item_attempts|         user_addict|         item_addict|              vector|         history_vec|              genres|year|        genre_tokens|        genre_vector|
+-------+-------+--------+--------------+--------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+--------------------+--------------------+
|    326| 749587|     0.0|          34.0|           1.0|         2062|         1076|0.016488845780795344|9.293680297397769E-4|(500,[1,4,5,6,7,8...|(500,[1,4,5,6,7,8...|Ужасы,Триллеры,Др...|2012|[Ужасы, Триллеры,...|(96,[0,1,5,11,13]...|
|    326| 767447|     0.0|           1.0|           

In [81]:
assembler = VectorAssembler(inputCols=['item_purchases', 'user_purchases', 'user_addict', 'item_addict', 'history_vec'],
                                outputCol="features")

train_data = assembler.transform(train_hist).cache()
valid_data = assembler.transform(valid_hist)
test_data = assembler.transform(test_hist)

In [82]:
train_data.where(train_data.features.isNull()).count()

0

In [85]:
gbt = GBTClassifier(labelCol="purchase", maxDepth=4, minInstancesPerNode=3, maxBins=50)

gbt_model = gbt.fit(train_data)
predictions_valid = gbt_model.transform(valid_data)

In [86]:
evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')
score = evaluator.evaluate(predictions_valid)
score

0.8926062410810246

In [None]:
train_data.unpersist()

In [87]:
test_predictions = gbt_model.transform(test_data)

In [89]:
test_predictions.show(1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [94]:
predictions_pd = test_predictions.select("user_id", "item_id", col("probability").alias("purchase")).toPandas()
predictions_pd = predictions_pd.sort_values(by=['user_id', 'item_id'])
predictions_pd['purchase'] = predictions_pd['purchase'].apply(lambda x: x[1])
predictions_pd.to_csv('lab03.csv', index=False)

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [95]:
sc.stop()