In [1]:
import os
import sys

# Cluster-specific settings: the Python interpreter used by executors, the
# Spark installation, and the resources requested for this session. They must
# be in the environment before any pyspark module is imported.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 5g --executor-cores 3 --driver-memory 5g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Validate before the value is used to build paths: os.path.join(None, ...)
# would otherwise fail with an unrelated TypeError.
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the bundled pyspark and py4j importable from the Spark installation.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, StructType, StructField, IntegerType, StringType

# Run Spark's interactive shell bootstrap in this namespace (the banner it
# prints reports that the session is available as `spark`). The `with` block
# closes the script's file handle once it has been read.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())


import pyspark.sql.functions as f

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.sql.functions import col, desc, pandas_udf, PandasUDFType, udf, regexp_replace, when, asc, lit, broadcast
from pyspark.sql.types import StructType, IntegerType, StructField, DateType, StringType, TimestampType, FloatType, ArrayType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

In [3]:
#! hdfs dfs -head /labs/slaba03/laba03_train.csv

In [4]:
from pyspark.sql.types import  StructType, StructField, IntegerType, DateType, StringType, TimestampType, FloatType, ArrayType

In [5]:
# Explicit schema for the training CSV: three integer columns.
schema = (
    StructType()
    .add("user_id", IntegerType())
    .add("item_id", IntegerType())
    .add("purchase", IntegerType())
)

In [6]:
# Load the training set with the explicit schema; the first line is a header.
dataset_train = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/labs/slaba03/laba03_train.csv")
)

In [7]:
# Peek at two rows where a purchase happened.
dataset_train.where("purchase=1").show(2)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|   9897|       1|
|   1654|   7394|       1|
+-------+-------+--------+
only showing top 2 rows



In [8]:
# Explicit schema for the test CSV: the same keys as the training set, no label.
test_schema = (
    StructType()
    .add('user_id', IntegerType())
    .add('item_id', IntegerType())
)

In [9]:
# Load the test set (header row present) with the explicit schema.
dataset_test = spark.read.csv(
    '/labs/slaba03/laba03_test.csv',
    schema=test_schema,
    header=True,
)

In [10]:
# Replace null ids with 0, then look at two test rows for one user.
dataset_test = dataset_test.fillna(0)
dataset_test.filter("user_id=1654").show(n=2)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
+-------+-------+
only showing top 2 rows



In [11]:
#! hdfs dfs -head /labs/slaba03/laba03_items.csv

In [12]:
# Explicit schema for the items catalogue. The datetime columns are read as
# plain strings; `year` is read as a float.
items_schema = (
    StructType()
    .add('item_id', IntegerType())
    .add('channel_id', IntegerType())
    .add('datetime_availability_start', StringType())
    .add('datetime_availability_stop', StringType())
    .add('datetime_show_start', StringType())
    .add('datetime_show_stop', StringType())
    .add('content_type', IntegerType())
    .add('title', StringType())
    .add('year', FloatType())
    .add('genres', StringType())
    .add('region_id', IntegerType())
)

In [13]:
# Load the items catalogue: tab-separated, with a header row.
dataset_items = spark.read.csv(
    '/labs/slaba03/laba03_items.csv',
    schema=items_schema,
    sep="\t",
    header=True,
)

In [14]:
# Replace nulls in the numeric columns with 0 (an integer fill value is not
# applied to string columns such as `genres`).
dataset_items = dataset_items.na.fill(0)
# Filter on content_type while the column is still present, then project.
# Filtering after the select only works because the analyzer silently re-adds
# the dropped column.
dataset_items_i = dataset_items.where("content_type=1").select("item_id", "genres")
dataset_items_i.show(2)

+-------+-------+
|item_id| genres|
+-------+-------+
|  65667|Эротика|
|  65669|Эротика|
+-------+-------+
only showing top 2 rows



In [None]:
# Number of distinct raw `genres` strings among the selected items.
dataset_items_i.select("genres").distinct().count()

In [None]:
# Frequency of every raw `genres` string among the selected items.
dataset_items_c = dataset_items_i.groupBy("genres").count()


In [None]:
# Peek at two rows of the per-genre-string counts.
dataset_items_c.show(2)

In [None]:
# Pull the `genres` column to the driver as a plain Python list
# (one entry per item row).
genres = [row[0] for row in dataset_items_i.select('genres').collect()]
genres

In [None]:
# Collapse the collected list to its unique genre strings.
genres = list(set(genres))


genres

In [None]:
# Vocabulary of individual genre tokens: every raw string is split on commas.
# Null genres arrive from Spark as None (the integer na.fill does not touch
# string columns) and are skipped instead of crashing on `.split`.
genr = {
    token
    for genre_string in genres
    if genre_string is not None
    for token in genre_string.split(',')
}

In [None]:
# Display the set of individual genre tokens.
genr

In [15]:
# (output column, substrings) pairs: the column is 1 when `genres` contains
# ANY of the substrings, otherwise 0. Order and names are kept exactly as they
# were (including the two names that contain a space) so the resulting column
# layout is unchanged for every downstream cell.
GENRE_FLAGS = [
    ('fairy_tale', ("казк",)),
    ('animation', ("Анимация",)),
    ('art_house', ("Арт",)),
    ('anime', ("Аниме",)),
    ('biography', ("Биография",)),
    ('action_movie', ("Боевик",)),
    ('western', ("Вестерн",)),
    ('video_game', ("Видеоигры",)),
    ('games', ("Игры",)),
    ('military', ("Военны",)),
    ('detective', ("Детективы",)),
    ('family', ("Семейны", "Для всей семьи")),
    ('adults', ("Для взрослых",)),
    ('erotica', ("Эротика",)),
    ('entertainment', ("Развлекательные",)),
    ('our', ("Наши", "Русские")),
    ('foreign', ("Зарубежные",)),
    ('crime', ("Криминал",)),
    ('comedy', ("Комеди",)),
    ('drama', ("Драм",)),
    ('cartoon', ("Мультфильм",)),
    ('adventure', ("Приключени",)),
    ('general', ("General",)),
    ('history', ("Исторически",)),
    ('ussr', ("Советские",)),
    ('thriller', ("Триллер",)),
    ('horror', ("Ужасы",)),
    ('mystical', ("Мистические",)),
    ('fantastic', ("Фантасти",)),
    ('short_film', ("Короткометр",)),
    ('full_length', ("Полнометражные",)),
    ('music', ("Музыкальн",)),
    ('sports', ("Спорт",)),
    ('melodrama', ("Мелодрам",)),
    ('serials', ("Сериалы",)),
    ('film_adaptations', ("Экранизации",)),
    ('romantic', ("Романтические",)),
    ('developing', ("Развивающие",)),
    ('want_know', ("Хочу всё знать",)),
    ('educational', ("Познавательные",)),
    ('ussr_movie', ("Советское кино",)),
    ('documentary', ("Документальны",)),
    ('cooking', ("Кулинария",)),
    ('health', ("О здоровье",)),
    ('other', ("Прочие",)),
    ('animal', ("Про животных",)),
    ('musical', ("Мюзиклы",)),
    ('hunting', ("Охота и рыбалка",)),
    ('broadcast', ("Передачи",)),
    ('nauchpop', ("Научная фантастика",)),
    ('humorous', ("Юмористические",)),
    ('reality_show', ("Реалити-шоу",)),
    ('western cartoons', ("Западные мультфильмы",)),
    ('union cartoons', ("Союзмультфильм",)),
    ('animated_series', ("Мультсериалы",)),
    ('fantasy', ("Фэнтези",)),
    ('small', ("Для самых маленьких",)),
    ('movie', ("Фильмы",)),
    ('russian_cartoon', ("Русские мультфильмы",)),
    ('films_performances', ("Фильмы-спектакли",)),
    ('childish', ("Детские", "Для детей")),
]


def _genre_flag(name, needles):
    """Build a 0/1 column `name` that is 1 when `genres` contains any needle.

    A null `genres` value yields 0, because a null condition falls through to
    `otherwise(0)`.
    """
    condition = col('genres').contains(needles[0])
    for needle in needles[1:]:
        condition = condition | col('genres').contains(needle)
    return when(condition, 1).otherwise(0).alias(name)


# One projection instead of 61 nested ones. Selecting item_id/genres explicitly
# (rather than '*') keeps the cell idempotent: re-running it does not pile up
# duplicate flag columns.
dataset_items_i = dataset_items_i.select(
    'item_id',
    'genres',
    *[_genre_flag(name, needles) for name, needles in GENRE_FLAGS]
)


In [16]:
# Sanity check: show one item flagged as a fairy tale.
dataset_items_i.where("fairy_tale=1").show(1)

+-------+--------------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+
|item_id|              genres|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|military|detective|family|adults|erotica|entertainment|our|foreign|crime|comedy|drama|cartoon|adventure|general|history|ussr|thriller|horror|mystical|fantastic|short_film|full_length|music|sports|melodrama|serials|film_adaptations|romantic|developing|want_know|educational|uss

In [17]:
# Item-level feature table: the genre flags keyed by item_id, with the raw
# genres string removed and exact duplicate rows collapsed.
features = dataset_items_i.drop("genres").dropDuplicates()
features.show(n=1)

+-------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+
|item_id|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|military|detective|family|adults|erotica|entertainment|our|foreign|crime|comedy|drama|cartoon|adventure|general|history|ussr|thriller|horror|mystical|fantastic|short_film|full_length|music|sports|melodrama|serials|film_adaptations|romantic|developing|want_know|educational|ussr_movie|documentary|cooking|health|other|a

In [18]:
# Take a seeded sample of about half of each `purchase` class from the
# training data and cache it.
sample = dataset_train.sampleBy('purchase', fractions={0: 0.5, 1: 0.5}, seed=42).cache()

In [None]:
# Per-item standard deviation of `purchase` in the sample. GroupedData has no
# `std` shortcut, so the aggregate goes through agg(); groupBy already yields
# one row per item_id, so no de-duplication is needed afterwards.
sample.groupBy('item_id').agg(f.stddev('purchase')).show(1)

In [19]:
def _mean_purchase_by(key, alias):
    """Mean of `purchase` in `sample` for each value of `key`, as column `alias`.

    groupBy already returns exactly one row per key, so no de-duplication is
    needed afterwards.
    """
    return (sample.groupBy(key)
                  .mean('purchase')
                  .select(key, col('avg(purchase)').alias(alias)))


# Build each aggregate once and reuse it for both the train and test joins.
# NOTE(review): `sample` is drawn from dataset_train itself, so for sampled
# rows the row's own label contributes to these means - confirm this is intended.
user_purchase_mean = _mean_purchase_by('user_id', 'avg(purchase)_user')
item_purchase_mean = _mean_purchase_by('item_id', 'avg(purchase)_item')

dataset_train = dataset_train.join(user_purchase_mean, on='user_id', how='left')
dataset_train = dataset_train.join(item_purchase_mean, on='item_id', how='left')

dataset_test = dataset_test.join(user_purchase_mean, on='user_id', how='left')
dataset_test = dataset_test.join(item_purchase_mean, on='item_id', how='left')

# Users/items absent from the sample get a mean of 0 after the left joins.
dataset_train = dataset_train.na.fill(0)
dataset_test = dataset_test.na.fill(0)

In [20]:
# Peek at one training row with the two mean-purchase columns attached.
dataset_train.show(1)

+-------+-------+--------+-------------------+--------------------+
|item_id|user_id|purchase| avg(purchase)_user|  avg(purchase)_item|
+-------+-------+--------+-------------------+--------------------+
|   8389| 754230|       0|0.02843601895734597|0.007541478129713424|
+-------+-------+--------+-------------------+--------------------+
only showing top 1 row



In [21]:
# Peek at one test row with the two mean-purchase columns attached.
dataset_test.show(1)

+-------+-------+--------------------+--------------------+
|item_id|user_id|  avg(purchase)_user|  avg(purchase)_item|
+-------+-------+--------------------+--------------------+
|   8389| 761341|7.739938080495357E-4|0.007541478129713424|
+-------+-------+--------------------+--------------------+
only showing top 1 row



In [22]:
# Attach the per-item genre flags to the training rows. Items with no row in
# `features` come out of the left join as nulls, which are then set to 0.
dataset_train_f = (
    dataset_train
    .join(features, on='item_id', how='left')
    .fillna(0)
)

dataset_train_f.show(n=1)

+-------+-------+--------+-------------------+--------------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+
|item_id|user_id|purchase| avg(purchase)_user|  avg(purchase)_item|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|military|detective|family|adults|erotica|entertainment|our|foreign|crime|comedy|drama|cartoon|adventure|general|history|ussr|thriller|horror|mystical|fantastic|short_film|full_length|music|sports|melodr

In [23]:
# Attach the per-item genre flags to the test rows. Items with no row in
# `features` come out of the left join as nulls, which are then set to 0.
dataset_test_f = (
    dataset_test
    .join(features, on='item_id', how='left')
    .fillna(0)
)

In [24]:
# Peek at one fully-featured test row.
dataset_test_f.show(1)

+-------+-------+--------------------+--------------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+
|item_id|user_id|  avg(purchase)_user|  avg(purchase)_item|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|military|detective|family|adults|erotica|entertainment|our|foreign|crime|comedy|drama|cartoon|adventure|general|history|ussr|thriller|horror|mystical|fantastic|short_film|full_length|music|sports|melodrama|serials|film

In [25]:
# Training part: a seeded sample of about 80% of each `purchase` class, cached.
train = dataset_train_f.sampleBy('purchase', fractions={0: 0.8, 1: 0.8}, seed=42).cache()
# Validation part: every row of dataset_train_f whose (user_id, item_id) pair
# does not occur in `train` (anti-join on the two keys), cached.
# NOTE(review): this assumes (user_id, item_id) pairs are unique in the
# training data - confirm.
val = dataset_train_f.join(train, on=['user_id', 'item_id'], how='leftanti').cache()

In [26]:
# Every column except the label goes into the feature vector, in frame order
# (this includes the raw item_id and user_id columns).
feature_columns = [name for name in dataset_train_f.columns if name != 'purchase']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [27]:
# Add the assembled `features` vector column to the training part.
dataset = assembler.transform(train)

In [28]:
# Peek at one assembled training row.
dataset.show(1)

+-------+-------+--------+-------------------+--------------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+--------------------+
|item_id|user_id|purchase| avg(purchase)_user|  avg(purchase)_item|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|military|detective|family|adults|erotica|entertainment|our|foreign|crime|comedy|drama|cartoon|adventure|general|history|ussr|thriller|horror|mystical|fantastic|short_film|full_lengt

In [29]:
# Gradient-boosted trees classifier on the assembled vector, predicting
# `purchase`, with 10 boosting iterations and maximum tree depth 10.
gbt = GBTClassifier(featuresCol='features', labelCol='purchase', maxIter=10, maxDepth=10)

In [30]:
# Fit the classifier on the assembled training part.
model = gbt.fit(dataset)

In [31]:
# Assemble the same feature vector for the validation part.
val_vector = assembler.transform(val)

In [32]:
# Score the validation part with the fitted model.
valid = model.transform(val_vector)

In [33]:
# Peek at one scored validation row.
valid.show(1)

+-------+-------+--------+--------------------+------------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|purchase|  avg(purchase)_user|avg(purchase)_item|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|military|detective|family|adults|erotica|entertainment|our|foreign|crime|comedy|drama|cartoon|adventure|general|history|ussr|thril

In [34]:
# ROC AUC evaluator against the `purchase` label; the model's `probability`
# column is supplied as the raw-prediction input.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName='areaUnderROC')

In [35]:

# Area under the ROC curve on the validation part.
evaluator.evaluate(valid)


0.9229835176643045

In [36]:
# Contribution of each feature to the model's decision; indices follow the
# order of the assembler's input columns.
model.featureImportances

SparseVector(65, {0: 0.1058, 1: 0.2843, 2: 0.1771, 3: 0.2023, 4: 0.0016, 5: 0.0, 6: 0.0026, 7: 0.0006, 8: 0.0013, 9: 0.0098, 10: 0.0033, 11: 0.0013, 12: 0.0012, 13: 0.004, 14: 0.0084, 15: 0.0093, 16: 0.0015, 17: 0.0081, 18: 0.0003, 19: 0.0095, 20: 0.0146, 21: 0.0072, 22: 0.0091, 23: 0.0094, 24: 0.0098, 25: 0.0117, 27: 0.0032, 28: 0.0009, 29: 0.0104, 30: 0.0065, 31: 0.005, 32: 0.0074, 33: 0.0004, 34: 0.0075, 35: 0.002, 36: 0.0027, 37: 0.0056, 38: 0.0035, 39: 0.0004, 40: 0.0005, 41: 0.0008, 42: 0.0011, 43: 0.0001, 44: 0.0006, 45: 0.0051, 48: 0.0001, 49: 0.0025, 50: 0.0008, 51: 0.0, 52: 0.0027, 53: 0.0, 54: 0.0003, 55: 0.0007, 56: 0.004, 57: 0.0026, 58: 0.0032, 59: 0.0068, 60: 0.0057, 61: 0.0011, 62: 0.002, 64: 0.0098})

In [37]:
# UDF that extracts the predicted probability of the second class (index 1)
# from a probability vector. Indexing the vector by position works for dense
# and sparse vectors alike, whereas `.values` of a sparse vector holds only
# its stored entries.
best_pred = udf(lambda s: float(s[1]), FloatType())

In [38]:
# Assemble features for the test set and score it with the fitted model.
out_vector = assembler.transform(dataset_test_f)
out_predict = model.transform(out_vector)

In [39]:
# Store the second-class probability as the `purchase` score of each test row.
out_predict = out_predict.withColumn('purchase', best_pred(col('probability')))

In [40]:
# Show the first five predictions ordered by user and item, untruncated.
out_predict.select('user_id', 'item_id', 'purchase', 'probability').sort('user_id', 'item_id').show(5, truncate=False)

+-------+-------+----------+----------------------------------------+
|user_id|item_id|purchase  |probability                             |
+-------+-------+----------+----------------------------------------+
|1654   |336    |0.06629539|[0.9337046069842695,0.06629539301573051]|
|1654   |678    |0.06627692|[0.9337230768532409,0.06627692314675915]|
|1654   |691    |0.06627692|[0.9337230768532409,0.06627692314675915]|
|1654   |696    |0.06818159|[0.9318184143716487,0.0681815856283513] |
|1654   |763    |0.06627692|[0.9337230768532409,0.06627692314675915]|
+-------+-------+----------+----------------------------------------+
only showing top 5 rows



In [43]:
# The checker sometimes fails because the number of rows in the file does not
# match, so compare the prediction count with the test-set count.
print('Count on predictions data: ', out_predict.count())
print('Count on check data: ', dataset_test.count())

Count on predictions data:  2156840
Count on check data:  2156840


In [44]:
# Save the results to a file: sort by user and item in Spark, collect to a
# pandas DataFrame on the driver, and write a comma-separated CSV.
# index=True also writes the pandas row index as the first column.
out_predict.select('user_id', 'item_id', 'purchase').sort('user_id', 'item_id').toPandas().to_csv('lab03_100.csv', sep=',', index=True)

In [None]:
# Shut down the Spark session.
spark.stop()