In [9]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [10]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row

from pyspark.ml.feature import HashingTF, StopWordsRemover, OneHotEncoder, RegexTokenizer, VectorAssembler, CountVectorizer, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector
from pyspark.sql.window import Window
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())

In [11]:
PARTITION_NUM = 4

In [None]:
! hdfs dfs -ls /labs/slaba03/

In [4]:
! hdfs dfs -get /labs/slaba03/laba03_items.csv

In [None]:
! hdfs dfs -head /labs/slaba03/laba03_train.csv

In [12]:
train_schema = StructType([
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('purchase', IntegerType()),
])

test_schema = StructType([
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
])


items_schema = StructType([
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('datetime_availability_start', TimestampType()),
    StructField('datetime_availability_stop', TimestampType()),
    StructField('datetime_show_start', TimestampType()),
    StructField('datetime_show_stop', TimestampType()),
    StructField('content_type', IntegerType()),
    StructField('title', StringType()),
    StructField('year', IntegerType()),
    StructField('genres', StringType()),
    StructField('region_id', IntegerType()),
])

views_programmes_schema = StructType([
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('ts_start', TimestampType()),
    StructField('ts_end', TimestampType()),
    StructField('item_type', StringType()),
])

In [13]:
# читаем данные
train = spark.read.csv('/labs/slaba03/laba03_train.csv', header=True, schema=train_schema)
items = spark.read.csv('/labs/slaba03/laba03_items.csv', header=True, sep='\t', inferSchema=True)
views_programmes = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header=True, inferSchema=True)
test = spark.read.csv('/labs/slaba03/laba03_test.csv', header=True, schema=test_schema)

In [14]:
# преобразуем типы
items = items \
        .withColumn('channel_id', items['channel_id'].cast('int')) \
        .withColumn('year', items['year'].cast('int')) \
        .withColumn('region_id', items['region_id'].cast('int')) \
        .withColumn('genres_arr', F.split(F.col('genres'), ",")) \
        .drop('genres') \
        .withColumnRenamed('genres_arr', 'genres')

views_programmes = views_programmes \
                 .withColumn('ts_start', views_programmes['ts_start'].cast('timestamp')) \
                 .withColumn('ts_end', views_programmes['ts_end'].cast('timestamp'))

In [15]:
# репартиционируем
train = train.repartition(PARTITION_NUM)
items = items.repartition(PARTITION_NUM)
views_programmes = views_programmes.repartition(PARTITION_NUM)
test = test.repartition(PARTITION_NUM)

In [16]:
data = train.join(items, on=['item_id'], how='inner').repartition(PARTITION_NUM)

# убираем лишнее
data = data.drop(
    'datetime_availability_start',
    'datetime_availability_stop',
    'datetime_show_start',
    'datetime_show_stop',
    'content_type',
    'channel_id',
    'region_id',
)

# заполняем null в genres пустым массивом
fill = F.array().cast("array<string>")
data = data.withColumn('genres', F.when(F.col('genres').isNull(), fill).otherwise(F.col('genres')))

# заполняем null в year
data = data.na.fill(0, subset=['year'])

In [17]:
# готовим признаки

window_over_user = Window.partitionBy('user_id')

# user buy activity
user_actions = data.groupBy('user_id').count()
user_actions = user_actions \
            .withColumn('user_activity', F.format_number(F.col('count') / F.sum('count').over(Window.partitionBy()), 5).cast('double'))


# target encoding
# data = data.withColumn('te', F.sum('purchase').over(window_over_user) / F.count('purchase').over(window_over_user)) \
#                      .coalesce(PARTITION_NUM)

# как давно вышла программа
data = data.withColumn('item_age', F.year(F.current_date()) - F.col('year'))

# статистики количества просмотров и времени просмотра (мин)
view_stats = views_programmes \
        .withColumn('item_type_label', F.when(F.col('item_type') == 'live', 1).otherwise(0)) \
        .withColumn('live_cnt', F.sum('item_type_label').over(window_over_user)) \
        .withColumn('pvr_cnt', F.count('item_type_label').over(window_over_user) - F.col('live_cnt')) \
        .withColumn('all_cnt', F.count('item_type_label').over(window_over_user)) \
        .withColumn('live_share', F.col('live_cnt') / F.col('all_cnt')) \
        .withColumn('pvr_share', F.col('pvr_cnt') / F.col('all_cnt')) \
        .withColumn('watch_sec', F.col('ts_end').cast('long') - F.col('ts_start').cast('long')) \
        .withColumn('watch_min', F.col('watch_sec') / 60) \
        .withColumn('watch_abs', F.sum('watch_min').over(window_over_user)) \
        .withColumn('watch_avg', F.avg('watch_min').over(window_over_user))

#view_stats.show(20, False, True)
# убираем дубли
view_stats = view_stats.dropDuplicates(['user_id']).select('user_id', 'live_cnt', 'pvr_cnt', 'watch_abs', 'watch_avg', 'live_share')

# собираем признаки в train
data = data.join(view_stats, on=['user_id'])
data = data.join(user_actions, on=['user_id']).repartition(PARTITION_NUM)

In [42]:
stat = data.select('user_id', 'item_id', 'purchase').sample(fraction=0.33)
user_purchase_freq = stat.groupby('user_id').agg(F.mean('purchase').alias('user_purchase_freq')).cache()
item_purchase_freq = stat.groupby('item_id').agg(F.mean('purchase').alias('item_purchase_freq')).cache()

data = data.join(user_purchase_freq, on=['user_id'], how='left') \
           .join(item_purchase_freq, on=['item_id'], how='left') \
           .na.fill(-1, subset=['user_purchase_freq', 'item_purchase_freq']) \
           .repartition(PARTITION_NUM)

In [47]:
data.show(10, False, True)

-RECORD 0----------------------------------------------------------------
 item_id            | 88933                                              
 user_id            | 776188                                             
 purchase           | 0                                                  
 title              | волк и баран                                       
 year               | 2012                                               
 genres             | [Мультфильмы, Наши]                                
 item_age           | 10                                                 
 live_cnt           | 241                                                
 pvr_cnt            | 61                                                 
 watch_abs          | 28472.499999999967                                 
 watch_avg          | 94.2798013245032                                   
 live_share         | 0.7980132450331126                                 
 count              | 2603            

In [49]:
# трансформеры и эстиматоры

# для жанров
cv = CountVectorizer(inputCol='genres', outputCol='genres_ohe', binary=True)

# сливаем все вместе
assembler = VectorAssembler(inputCols=['user_purchase_freq', 'item_purchase_freq', 'genres_ohe', 'live_cnt', 'pvr_cnt', 'watch_abs', 'watch_avg', 'live_share', 'item_age', 'year'], outputCol='features')

In [51]:
# готовим тест

data_test = test.join(items, on=['item_id'], how='inner').repartition(4)

# убираем лишнее
data_test = data_test.drop(
    'datetime_availability_start',
    'datetime_availability_stop',
    'datetime_show_start',
    'datetime_show_stop',
    'content_type',
    'channel_id',
    'region_id',
)
# заполняем null в genres пустым массивом
data_test = data_test.withColumn('genres', F.when(F.col('genres').isNull(), fill).otherwise(F.col('genres')))
# заполняем null в year
data_test = data_test.na.fill(0, subset=['year'])

# готовим признаки
user_actions = data_test.groupBy('user_id').count()
user_actions = user_actions \
            .withColumn('user_activity', F.format_number(F.col('count') / F.sum('count').over(Window.partitionBy()), 5).cast('double'))

data_test = data_test.withColumn('item_age', F.year(F.current_date()) - F.col('year'))

# собираем признаки в test
# в view_stats отсутствует часть user_id, заполним нулями
#cols = data_test.columns
#cols.remove('genres')
#data_test.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in cols]).show()
# +-------+-------+-----+----+--------+--------+-------+---------+---------+
# |user_id|item_id|title|year|item_age|live_cnt|pvr_cnt|watch_abs|watch_avg|
# +-------+-------+-----+----+--------+--------+-------+---------+---------+
# |      0|      0|    0|   0|       0|    3342|   3342|     3342|     3342|
# +-------+-------+-----+----+--------+--------+-------+---------+---------+


data_test = data_test.join(view_stats, on=['user_id'], how='left')
data_test = data_test.join(user_actions, on=['user_id'])
data_test = data_test.na.fill(0, subset=['live_cnt', 'pvr_cnt', 'watch_abs', 'watch_avg', 'live_share'])

data_test = data_test \
            .join(user_purchase_freq, on=['user_id'], how='left') \
            .join(item_purchase_freq, on=['item_id'], how='left') \
            .na.fill(-1, subset=['user_purchase_freq', 'item_purchase_freq']) \
            .repartition(PARTITION_NUM)

In [60]:
pipeline = Pipeline(stages=[cv, assembler])
pipe_model = pipeline.fit(data)
data = pipe_model.transform(data)
data_test = pipe_model.transform(data_test)

In [None]:
cols = data_test.columns
cols.remove('genres')
#cols.remove('genres_ohe')
#cols.remove('features')
data_test.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in cols]).show()

In [61]:
train_f = data.select('user_id', 'item_id', 'features', 'purchase')
test_f = data_test.select('user_id', 'item_id', 'features')

In [62]:
gbt = GBTClassifier(labelCol="purchase", featuresCol="features", maxDepth=7)
ml_model = gbt.fit(train_f)
predictions = ml_model.transform(test_f)

In [63]:
predictions.filter(F.col('prediction') == 1).show()

+-------+-------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------------------+--------------------+--------------------+----------+
| 588378|  91225|(92,[0,1,2,11,20,...|[-0.2505149737274...|[0.37729865786829...|       1.0|
| 747028|  93667|(92,[0,1,2,3,5,12...|[-0.5659504410010...|[0.24381047048918...|       1.0|
| 747028|  74708|(92,[0,1,6,7,14,8...|[-0.3896588192714...|[0.31446696920847...|       1.0|
| 747028|   8257|(92,[0,1,21,65,72...|[-0.4309730063296...|[0.29693292825085...|       1.0|
| 747028|   9059|(92,[0,1,85,86,87...|[-0.3798517280839...|[0.31871065254867...|       1.0|
| 747028|  93518|(92,[0,1,2,12,14,...|[-0.3537831922133...|[0.33013680309417...|       1.0|
| 747028|  74359|(92,[0,1,2,12,33,...|[-0.3537831922133...|[0.33013680309417...|       1.0|
| 747028|  88925|(92,[0,1,8,16,85,...|[-0.2673498931949...|[0.36942140613654...|

In [64]:
second_element = F.udf(lambda v: float(v[1]), FloatType())
results = predictions \
        .select('user_id', 'item_id', 'probability') \
        .withColumn('purchase', second_element('probability')) \
        .drop('probability') \
        .sort(['user_id', 'item_id'], ascending=[1, 1])

In [65]:
results.toPandas().to_csv('lab03.csv', header=True)

In [66]:
spark.stop()