In [306]:
# List the contents of the lab's HDFS directory.
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


In [307]:
import os
import sys
# Environment variables read by PySpark: the Python interpreter to use,
# the Spark installation directory and the spark-submit arguments.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's Python sources and its bundled py4j archive at the front of
# the import path; the py4j entry is inserted last, so it ends up first.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [308]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration; the only setting applied is the application name.
conf = SparkConf().set("spark.app.name", "alexey gurov lab3")

# Obtain a session built from that configuration (getOrCreate returns an
# existing session when one is already running).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [361]:
import pyspark.sql.functions as f
from pyspark.sql.functions import udf

from pyspark.ml.feature import HashingTF
import re
import numpy as np
from numpy import NaN

from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

from pyspark.sql.types import StructType, StructField, StringType, LongType, \
                              IntegerType, DateType, FloatType, ArrayType, DoubleType

In [310]:
# Display the SparkSession object as the cell output.
spark

In [473]:
@udf(returnType=ArrayType(StringType()))
def split(x):
    """Split a string cell into tokens on single space characters.

    Args:
        x: The cell value: a string, or None when the cell is NULL.

    Returns:
        The list of tokens produced by ``x.split(' ')``. A NULL cell yields
        an empty list instead of raising ``AttributeError`` inside the UDF.
    """
    if x is None:
        return []
    return x.split(' ')

@udf(returnType=StringType())
def get_type(x):
    """Describe the Python type of a cell value as text.

    Args:
        x: Any cell value as it is received by the Python UDF.

    Returns:
        ``str(type(x))``, e.g. ``"<class 'str'>"``. The UDF is declared with
        a string return type, so the type object is converted to text
        rather than returned as-is.
    """
    return str(type(x))

@udf(returnType=ArrayType(DoubleType()))
def get_features(a, b, c, d, e):
    """Flatten three scalar values and two vectors into one list of floats.

    The previous body ignored ``d`` and ``e`` and passed ``a`` and ``b`` to
    both ``float()`` and ``Vectors.dense()``, which cannot work for the same
    value; the vectors are now taken from ``d`` and ``e``.

    Args:
        a, b, c: Values convertible with ``float()``.
        d, e: Objects exposing ``toArray()``.
            NOTE(review): presumably pyspark.ml.linalg vectors such as the
            HashingTF outputs -- confirm against the intended caller.

    Returns:
        ``[a, b, c]`` as floats, followed by the elements of ``d`` and then
        the elements of ``e``.
    """
    scalars = [float(a), float(b), float(c)]
    d_values = [float(v) for v in d.toArray()]
    e_values = [float(v) for v in e.toArray()]
    return scalars + d_values + e_values

@udf(returnType=ArrayType(FloatType()))
def add_sparce(a, b):
    """Concatenate the dense representations of two vectors.

    Each argument is converted with ``Vectors.dense(...).toArray()`` and its
    elements are appended, as Python floats, to one list: first the elements
    of ``a``, then those of ``b``.
    """
    merged = []
    for vector in (a, b):
        dense_values = Vectors.dense(vector).toArray()
        merged.extend(float(value) for value in dense_values)
    return merged


@udf(returnType=ArrayType(DoubleType()))
def add_features(a, b, c):
    """Return the three arguments, each converted to float, as a list."""
    return [float(value) for value in (a, b, c)]


@udf(returnType=IntegerType())
def to_int(x):
    """Convert a numeric value or numeric string to an integer.

    The value goes through ``float()`` first so that decimal strings such
    as ``"2013.0"`` are accepted; the fractional part is truncated.

    Args:
        x: A number, a numeric string, or None when the cell is NULL.

    Returns:
        ``int(float(x))``, or None for a NULL cell (previously a NULL raised
        ``TypeError`` inside the UDF).
    """
    if x is None:
        return None
    return int(float(x))

# UDF built from a five-argument lambda; no return type is passed to f.udf.
# NOTE(review): `d` and `e` are never used, `a` and `b` are passed to both
# float() and Vectors.dense(), and Vectors.sparse() receives one list as its
# only argument. This looks unfinished -- confirm the intent before using it.
my_func = f.udf(lambda a, b, c, d, e: Vectors.sparse(list([float(a), float(b), float(c)]) + list(map(float, Vectors.dense(a).toArray())) + list(map(float, Vectors.dense(b).toArray()))))

# Считываем данные

In [312]:
# Print the beginning of the items file to inspect its header and layout.
!hdfs dfs -head /labs/slaba03/laba03_items.csv

item_id	channel_id	datetime_availability_start	datetime_availability_stop	datetime_show_start	datetime_show_stop	content_type	title	year	genres	region_id
65667		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	на пробах только девушки (all girl auditions)	2013.0	Эротика	
65669		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	скуби ду: эротическая пародия (scooby doo: a xxx parody)	2011.0	Эротика	
65668		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	горячие девочки для горячих девочек (hot babes 4 hot babes)	2011.0	Эротика	
65671		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	соблазнительницы женатых мужчин (top heavy homewreckers)	2011.0	Эротика	
65670		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	секретные секс-материалы ii: темная секс пародия (the sex files ii: a dark xxx parody)	2010.0	Эротика	
65809		1970-01-01T00:00:00Z	2099-12-31

In [313]:
# Schema of the training file: three integer columns.
data_train_schema = StructType(
    [StructField(column_name, IntegerType())
     for column_name in ("user_id", "item_id", "purchase")]
)

In [314]:
# Read the training CSV (first line is a header) with the explicit schema.
data_train = (
    spark.read
    .schema(data_train_schema)
    .option('header', True)
    .csv('/labs/slaba03/laba03_train.csv')
)

In [315]:
# Preview the first five training rows.
data_train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [316]:
# Schema of the test file: two integer columns (no `purchase` label).
data_test_schema = StructType(
    [StructField(column_name, IntegerType())
     for column_name in ("user_id", "item_id")]
)

In [317]:
# Read the test CSV (first line is a header) with the explicit schema.
data_test = (
    spark.read
    .schema(data_test_schema)
    .option('header', True)
    .csv('/labs/slaba03/laba03_test.csv')
)

In [436]:
# (rows, columns) of the test set. The rows are counted on the cluster
# instead of collecting the whole DataFrame to the driver with toPandas()
# just to read its shape.
(data_test.count(), len(data_test.columns))

(2156840, 2)

In [318]:
# Preview the first five test rows.
data_test.show(5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



In [319]:
# Explicit schema for the tab-separated items file.
# NOTE(review): the items reader in this notebook is called without a
# schema, so this definition is currently unused -- confirm whether it
# should be passed to the reader.
data_items_schema = StructType([
    StructField("item_id", IntegerType()),
    StructField("channel_id", IntegerType()),
    # NOTE(review): the file stores these four fields as full timestamps
    # (e.g. "1970-01-01T00:00:00Z"); verify that DateType parses them.
    StructField("datetime_availability_start", DateType()),
    StructField("datetime_availability_stop", DateType()),
    StructField("datetime_show_start", DateType()),
    StructField("datetime_show_stop", DateType()),
    StructField("content_type", IntegerType()),
    StructField("title", StringType()),
    # The file stores years as decimals such as "2013.0", which do not
    # parse as integers, so the column is declared as a double.
    StructField("year", DoubleType()),
    StructField("genres", StringType()),
    StructField("region_id", IntegerType()),
])

In [320]:
# Read the tab-separated items file (with header; no schema is supplied)
# and rename its key column so it does not clash with `item_id` in the
# train/test frames when they are joined.
data_items = (
    spark.read
    .option('header', True)
    .option('sep', '\t')
    .csv('/labs/slaba03/laba03_items.csv')
    .withColumnRenamed('item_id', 'items_item_id')
)

In [321]:
# Preview the first five item rows without truncating long values.
data_items.show(5, truncate=False)

+-------------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------------------------------------+------+-------+---------+
|items_item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|title                                                                                 |year  |genres |region_id|
+-------------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------------------------------------+------+-------+---------+
|65667        |null      |1970-01-01T00:00:00Z       |2018-01-01T00:00:00Z      |null               |null              |1           |на пробах только девушки (all girl auditions)                                         |2013.0|Эротика|null     |
|65669        |n

In [331]:
# Preview the first five titles in full to see what the text looks like.
data_items.select('title').show(5, truncate=False)

+--------------------------------------------------------------------------------------+
|title                                                                                 |
+--------------------------------------------------------------------------------------+
|на пробах только девушки (all girl auditions)                                         |
|скуби ду: эротическая пародия (scooby doo: a xxx parody)                              |
|горячие девочки для горячих девочек (hot babes 4 hot babes)                           |
|соблазнительницы женатых мужчин (top heavy homewreckers)                              |
|секретные секс-материалы ii: темная секс пародия (the sex files ii: a dark xxx parody)|
+--------------------------------------------------------------------------------------+
only showing top 5 rows



In [555]:
# Read the programme-views CSV (with header; no schema is supplied).
data_views_programmes = (
    spark.read
    .option('header', True)
    .csv('/labs/slaba03/laba03_views_programmes.csv')
)

In [556]:
# Preview the first five view records.
data_views_programmes.show(5)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
+-------+-------+----------+----------+---------+
only showing top 5 rows



# Подготовка и обработка данных

In [488]:
# Attach item metadata to each training pair (default join type), keep only
# the columns used later, restrict to rows whose content_type equals 1 and
# replace missing genres / year values.
data_train_united = (
    data_train
    .join(data_items, f.col('item_id') == f.col('items_item_id'))
    .select('user_id', 'item_id', 'content_type', 'title', 'year', 'genres', 'purchase')
    .where(f.col('content_type') == 1)
    .fillna({'genres': '', 'year': 0})
)

In [508]:
# Attach item metadata to each test pair while keeping EVERY test pair.
# The content_type restriction is applied to the items table before the
# join: filtering on a right-side column after a left outer join would
# discard the unmatched rows and silently turn it into an inner join,
# leaving some test pairs without a prediction.
# Pairs without matching metadata get an empty title/genres and year 0 so
# the downstream tokenising and integer-conversion UDFs receive no NULLs.
data_test_united = (
    data_test
    .join(
        data_items.filter(f.col('content_type') == 1),
        f.col('item_id') == f.col('items_item_id'),
        'left_outer',
    )
    .select('user_id', 'item_id', 'content_type', 'title', 'year', 'genres')
    .fillna({'title': '', 'genres': '', 'year': 0})
)

In [489]:
# Preview the joined training data.
data_train_united.show()

+-------+-------+------------+--------------------+------+--------------------+--------+
|user_id|item_id|content_type|               title|  year|              genres|purchase|
+-------+-------+------------+--------------------+------+--------------------+--------+
| 520446|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 556825|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 566701|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 613775|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 619378|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 625678|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 632495|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 636572|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|       0|
| 639612|   8389|    

In [396]:
# Preview the joined test data.
data_test_united.show()

+-------+-------+------------+--------------------+------+--------------------+
|user_id|item_id|content_type|               title|  year|              genres|
+-------+-------+------------+--------------------+------+--------------------+
|   1654|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 510087|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 517612|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 522798|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 523860|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 529632|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 566758|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 575248|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 588378|   8389|           1|пес в сапогах (су...|1981.0|Мультфильмы,Детск...|
| 625638|   8389|           1|пес в сапо

In [381]:
# Count training rows per distinct `genres` string and bring the (small)
# aggregate to the driver as a pandas DataFrame.
# NOTE(review): the name `d` is reused later for unrelated DataFrames.
d = data_train_united.groupBy('genres').count().toPandas()

In [382]:
# (rows, columns) of the per-genres aggregate; the row count is the number
# of distinct `genres` values.
d.shape

(1075, 2)

In [490]:
# Hashing term-frequency transformers for the tokenised title and genres
# columns; each produces a vector with 100 features.
title_ht = HashingTF(
    numFeatures=100,
    inputCol='title_split',
    outputCol="title_features",
)
genres_ht = HashingTF(
    numFeatures=100,
    inputCol='genres_split',
    outputCol="genres_features",
)

In [491]:
# Training rows with `year` converted to an integer and the title / genres
# strings split into token lists for the hashing transformers.
d = data_train_united.select(
    'user_id',
    'item_id',
    to_int(f.col('year')).alias('year'),
    split(f.col('title')).alias('title_split'),
    split(f.col('genres')).alias('genres_split'),
    'purchase',
)

In [492]:
# Hash the title tokens, then the genres tokens, and keep only the key
# columns, the two hashed vectors and the label.
data_train_united_transformed = (
    genres_ht
    .transform(title_ht.transform(d))
    .select('user_id', 'item_id', 'year', 'title_features', 'genres_features', 'purchase')
)

In [509]:
# Test rows with `year` converted to an integer and the title / genres
# strings split into token lists for the hashing transformers.
d = data_test_united.select(
    'user_id',
    'item_id',
    to_int(f.col('year')).alias('year'),
    split(f.col('title')).alias('title_split'),
    split(f.col('genres')).alias('genres_split'),
)

In [510]:
# Hash the title tokens, then the genres tokens, and keep only the key
# columns and the two hashed vectors.
data_test_united_transformed = (
    genres_ht
    .transform(title_ht.transform(d))
    .select('user_id', 'item_id', 'year', 'title_features', 'genres_features')
)

In [496]:
# Preview the hashed training features.
data_train_united_transformed.show()

+-------+-------+----+--------------------+---------------+--------+
|user_id|item_id|year|      title_features|genres_features|purchase|
+-------+-------+----+--------------------+---------------+--------+
| 520446|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 556825|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 566701|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 613775|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 619378|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 625678|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 632495|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 636572|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 639612|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 668112|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 703514|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|       0|
| 711308|   8389|1981|(100,[20,58,

In [497]:
# Preview the hashed test features.
data_test_united_transformed.show()

+-------+-------+----+--------------------+---------------+
|user_id|item_id|year|      title_features|genres_features|
+-------+-------+----+--------------------+---------------+
| 869272|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 869294|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 869465|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 869669|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 869892|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 869909|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 870514|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 870906|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 870998|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 871082|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 871154|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 871348|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 871411|   8389|1981|(100,[20,58,84,87...|(100,[5],[1.0])|
| 871442|   8389|1981|(100,[20,58,84,87.

###  Дополнительные параметры для обучения модели

In [553]:
# Total of the `purchase` column per item over the training set, cached
# because it is joined into both the train and the test feature frames.
sum_item_purchase = (
    data_train
    .groupBy('item_id')
    .agg(f.sum('purchase').alias('sum_item_purchased'))
    .cache()
)

In [554]:
# Total of the `purchase` column per user over the training set, cached
# because it is joined into both the train and the test feature frames.
sum_user_purchase = (
    data_train
    .groupBy('user_id')
    .agg(f.sum('purchase').alias('sum_user_purchased'))
    .cache()
)

In [571]:
# Total watch time per item: the sum of (ts_end - ts_start) over all view
# records of that item.
# The sum is truncated to a whole number with a native cast instead of a
# Python UDF, and only the final DataFrame is cached. Previously an
# intermediate DataFrame was cached and then orphaned when the name was
# rebound. A 64-bit integer is used so large totals cannot exceed the
# column type.
sum_item_watch_time = (
    data_views_programmes
    .select('item_id', (f.col('ts_end') - f.col('ts_start')).alias('item_watch_time'))
    .groupBy('item_id')
    .agg(f.sum('item_watch_time').cast('long').alias('sum_item_watch_time'))
    .cache()
)

In [572]:
# Total watch time per user: the sum of (ts_end - ts_start) over all view
# records of that user.
# The sum is truncated to a whole number with a native cast instead of a
# Python UDF, and only the final DataFrame is cached. Previously an
# intermediate DataFrame was cached and then orphaned when the name was
# rebound. A 64-bit integer is used so large totals cannot exceed the
# column type.
sum_user_watch_time = (
    data_views_programmes
    .select('user_id', (f.col('ts_end') - f.col('ts_start')).alias('user_watch_time'))
    .groupBy('user_id')
    .agg(f.sum('user_watch_time').cast('long').alias('sum_user_watch_time'))
    .cache()
)

In [578]:
# Training feature frame: the hashed item features enriched with the
# per-item / per-user purchase and watch-time aggregates. Left outer joins
# keep every training row; aggregates missing after the join become 0.
# `title_features` is not selected here.
X_train = (
    data_train_united_transformed
    .join(sum_item_purchase, on='item_id', how='left_outer')
    .join(sum_user_purchase, on='user_id', how='left_outer')
    .join(sum_item_watch_time, on='item_id', how='left_outer')
    .join(sum_user_watch_time, on='user_id', how='left_outer')
    .select(
        'user_id', 'item_id', 'year', 'genres_features',
        'sum_item_purchased', 'sum_user_purchased',
        'sum_item_watch_time', 'sum_user_watch_time',
        'purchase',
    )
    .fillna(dict.fromkeys(
        ['sum_item_purchased', 'sum_user_purchased',
         'sum_item_watch_time', 'sum_user_watch_time'],
        0,
    ))
    .cache()
)

In [579]:
# Test feature frame: the hashed item features enriched with the per-item /
# per-user purchase and watch-time aggregates. Left outer joins keep every
# row of the input; aggregates missing after the join become 0.
# `title_features` is not selected here.
X_test = (
    data_test_united_transformed
    .join(sum_item_purchase, on='item_id', how='left_outer')
    .join(sum_user_purchase, on='user_id', how='left_outer')
    .join(sum_item_watch_time, on='item_id', how='left_outer')
    .join(sum_user_watch_time, on='user_id', how='left_outer')
    .select(
        'user_id', 'item_id', 'year', 'genres_features',
        'sum_item_purchased', 'sum_user_purchased',
        'sum_item_watch_time', 'sum_user_watch_time',
    )
    .fillna(dict.fromkeys(
        ['sum_item_purchased', 'sum_user_purchased',
         'sum_item_watch_time', 'sum_user_watch_time'],
        0,
    ))
    .cache()
)

### Строим модель

In [582]:
from pyspark.ml.classification import GBTClassifier

In [583]:
# Names of the columns assembled into the model's feature vector, in order.
feature_list = [
    'user_id', 'item_id', 'year', 'genres_features',
    'sum_item_purchased', 'sum_user_purchased',
    'sum_item_watch_time', 'sum_user_watch_time',
]

In [584]:
# Combine the columns named in `feature_list` into a single `features` vector column.
feature_assembler = VectorAssembler(inputCols=feature_list, outputCol='features')

In [585]:
# Gradient-boosted trees classifier predicting `purchase` from `features`;
# the seed is fixed so repeated runs use the same random state.
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='purchase',
    maxIter=40,
    maxDepth=4,
    minInstancesPerNode=1,
    seed=42,
)

In [586]:
# Two-stage pipeline: assemble the feature vector, then run the classifier.
model_pipeline = Pipeline(stages=[feature_assembler, gbt])

In [588]:
# Fit the whole pipeline on the training feature frame.
model = model_pipeline.fit(X_train)

### Делаем предсказания и записываем в файл

In [589]:
# Apply the fitted pipeline to the test feature frame.
predictions = model.transform(X_test)

In [512]:
# Check what kind of object the pipeline returned.
type(predictions)

pyspark.sql.dataframe.DataFrame

In [590]:
# Preview the first five prediction rows.
predictions.show(5)

+-------+-------+----+---------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|year|genres_features|sum_item_purchased|sum_user_purchased|sum_item_watch_time|sum_user_watch_time|            features|       rawPrediction|         probability|prediction|
+-------+-------+----+---------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+--------------------+----------+
|   1654|   8389|1981|(100,[5],[1.0])|                 8|                 5|                  0|             365191|(107,[0,1,2,8,103...|[1.78690582979145...|[0.97271652947498...|       0.0|
| 510087|   8389|1981|(100,[5],[1.0])|                 8|                 6|                  0|            2337212|(107,[0,1,2,8,103...|[1.77074800513445...|[0.97184567426074...|       0.0|
| 517612|   8389|1981|(100,[5],[1.0])|       

In [591]:
# Collect the keys and the probability vector to the driver as a pandas
# DataFrame, sorted by user and then by item.
answer_df = (
    predictions
    .select('user_id', 'item_id', 'probability')
    .orderBy('user_id', 'item_id')
    .toPandas()
)

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [592]:
# Take element 1 of each probability vector as the `purchase` score.
answer_df['purchase'] = answer_df['probability'].map(lambda proba: proba[1])

In [593]:
# The probability vector is no longer needed once the score is extracted.
answer_df = answer_df.drop(columns='probability')

In [594]:
# Write the result to lab03.csv in the current working directory.
# NOTE(review): `index` is left at its default, so the DataFrame index is
# written as an extra first column -- confirm this is the expected format.
answer_df.to_csv('lab03.csv')

In [595]:
# Stop the Spark session now that the results have been written.
spark.stop()