In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).appName("Sabilov lab3").getOrCreate()

In [3]:
dict_length = 10000
col_vector = []

In [4]:
!hdfs dfs -ls /labs/slaba03

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


In [5]:
!hdfs dfs -head /labs/slaba03/laba03_items.csv

item_id	channel_id	datetime_availability_start	datetime_availability_stop	datetime_show_start	datetime_show_stop	content_type	title	year	genres	region_id
65667		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	на пробах только девушки (all girl auditions)	2013.0	Эротика	
65669		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	скуби ду: эротическая пародия (scooby doo: a xxx parody)	2011.0	Эротика	
65668		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	горячие девочки для горячих девочек (hot babes 4 hot babes)	2011.0	Эротика	
65671		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	соблазнительницы женатых мужчин (top heavy homewreckers)	2011.0	Эротика	
65670		1970-01-01T00:00:00Z	2018-01-01T00:00:00Z			1	секретные секс-материалы ii: темная секс пародия (the sex files ii: a dark xxx parody)	2010.0	Эротика	
65809		1970-01-01T00:00:00Z	2099-12-31

In [6]:
# загружаем данные
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, DoubleType, TimestampType
train_schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

test_schema = StructType(fields=[
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('purchase', DoubleType())
])

items_schema = StructType(fields=[
    StructField('item_id', IntegerType()),
    StructField('channel_id', IntegerType()),
    StructField('datetime_availability_start', TimestampType()),
    StructField('datetime_availability_stop', TimestampType()),
    StructField('datetime_show_start', TimestampType()),
    StructField('datetime_show_stop', TimestampType()),
    StructField('content_type', IntegerType()),
    StructField('title', StringType()),
    StructField('year', DoubleType()),
    StructField('genres', StringType()),
    StructField('region_id', IntegerType())
])

views_schema = StructType(fields=[
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('ts_start', LongType()),
    StructField('ts_end', LongType()),
    StructField('item_type', StringType())
])


train = spark.read.csv('/labs/slaba03/laba03_train.csv', header=True, schema=train_schema)
test = spark.read.csv('/labs/slaba03/laba03_test.csv', header=True, schema=test_schema)
items = spark.read.csv('/labs/slaba03/laba03_items.csv', sep='\t', header=True, schema=items_schema)
views = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header=True, schema=views_schema)
# train.show(5)
# test.show(5)
# items.show(3, vertical=True, truncate=False)
# views.show(5)

In [7]:
# train = train.sampleBy("purchase", fractions={0: 0.01, 1: 0.01}, seed=42)
# test = test.sample(False, 0.01, 43)
# print(train.count())
# print(test.count())

In [8]:
# filtering paid content
# items = items.filter(items.content_type == 1)
col_vector += ['content_type']

In [9]:
views = views.filter(views.item_id >= 326)

In [10]:
import pyspark.sql.functions as f

user_views = views.groupBy('user_id').agg(f.count('item_id').alias('items_viewed'))

In [11]:
item_views = views.groupBy('item_id').agg(f.count('user_id').alias('users_viewed'))

In [12]:
views = views.join(user_views, 'user_id', how='left').join(item_views, 'item_id', how='left')

In [13]:
views = views.withColumn('item_view_time', f.col('ts_end') - f.col('ts_start'))

In [14]:
user_sum_view = views.groupBy('user_id', 'items_viewed').agg(f.sum('item_view_time').alias('item_sum_view'))

user_mean_time_view = user_sum_view.withColumn('user_mean_time_view', f.col('item_sum_view') / (f.col('items_viewed') + 0.000001))

In [15]:
# item_type labling
from itertools import chain
from pyspark.sql.functions import create_map, lit
from pyspark.ml.feature import StringIndexer

views = views.fillna('9999', subset=['item_type'])

item_types = views.select('item_type').distinct().rdd.flatMap(lambda x: x).collect()
item_types_dict = dict(zip(item_types, range(len(item_types))))

mapping_expr = create_map([lit(x) for x in chain(*item_types_dict.items())])

views = views.withColumn('item_type_lbl', mapping_expr[views['item_type']]).drop('item_type')


mean_view_type = views.groupBy('user_id').agg(f.mean('item_type_lbl').alias('mean_view_type'))

In [16]:
#join all to train and test df
train = train.join(items, on='item_id', how='left')
test = test.join(items, on='item_id', how='left')

In [17]:
train = train.join(user_mean_time_view, on='user_id', how='left')
test = test.join(user_mean_time_view, on='user_id', how='left')
train = train.join(mean_view_type, on='user_id', how='left')
test = test.join(mean_view_type, on='user_id', how='left')

In [18]:
# train.show(2, False, True)
# test.show(2, False, True)

In [19]:
# print(train.count())
# train = train.dropDuplicates(subset=['user_id', 'item_id'])
# print(train.count())

In [20]:
# repartition dfs
train = train.coalesce(3)
test = test.coalesce(3)
# print(train.rdd.getNumPartitions())
# print(test.rdd.getNumPartitions())

In [21]:
# df's count
# print(train.count())
# print(test.count())

In [22]:
# # calculate the duration of viewing video by user
# import pyspark.sql.functions as f
# train = train.withColumn('viewing_time', f.col('ts_end') - f.col('ts_start')).drop('ts_end', 'ts_start')
# test = test.withColumn('viewing_time', f.col('ts_end') - f.col('ts_start')).drop('ts_end', 'ts_start')
# # train.show(2, False, True)
# # test.show(2, False, True)
# col_vector += ['viewing_time']

In [23]:
# # item_type labling
# from itertools import chain
# from pyspark.sql.functions import create_map, lit
# from pyspark.ml.feature import StringIndexer

# train = train.fillna('9999', subset=['item_type'])
# test = test.fillna('9999', subset=['item_type'])

# item_types = train.select('item_type').distinct().rdd.flatMap(lambda x: x).collect()
# item_types_dict = dict(zip(item_types, range(len(item_types))))

# mapping_expr = create_map([lit(x) for x in chain(*item_types_dict.items())])

# train = train.withColumn('item_type_lbl', mapping_expr[train['item_type']]).drop('item_type')
# # train.show(2, False, True)

# item_types = test.select('item_type').distinct().rdd.flatMap(lambda x: x).collect()
# item_types_dict = dict(zip(item_types, range(len(item_types))))

# mapping_expr = create_map([lit(x) for x in chain(*item_types_dict.items())])

# test = test.withColumn('item_type_lbl', mapping_expr[test['item_type']]).drop('item_type')
# # test.show(2, False, True)
# #-------------------------------------------------------------------------------------------
# # train_indexer = StringIndexer(inputCol="item_type", outputCol="item_type_lbl") 
# # train = train_indexer.fit(train).transform(train)

# # test_indexer = StringIndexer(inputCol="item_type", outputCol="item_type_lbl")
# # test = test_indexer.fit(test).transform(test)

# col_vector += ['item_type_lbl']

In [24]:
# genres labling
train = train.fillna('Без жанра', subset=['genres'])
train = train.fillna(-9999, subset=['year'])
genres = train.select('genres').distinct().rdd.flatMap(lambda x: x).collect()
genres_dict = dict(zip(genres, range(len(genres))))

mapping_expr = create_map([lit(x) for x in chain(*genres_dict.items())])

train = train.withColumn('genres_lbl', mapping_expr[train['genres']]).drop('genres')
# train.show(2, False, True)

test = test.fillna('Без жанра', subset=['genres'])
test = test.fillna(-9999, subset=['year'])
genres = test.select('genres').distinct().rdd.flatMap(lambda x: x).collect()
genres_dict = dict(zip(genres, range(len(genres))))

mapping_expr = create_map([lit(x) for x in chain(*genres_dict.items())])

test = test.withColumn('genres_lbl', mapping_expr[test['genres']]).drop('genres')
# test.show(2, False, True)

#-------------------------------------------------------------------------------------------
# train_indexer = StringIndexer(inputCol="genres", outputCol="genres_lbl") 
# train = train_indexer.fit(train).transform(train)

# test_indexer = StringIndexer(inputCol="genres", outputCol="genres_lbl")
# test = test_indexer.fit(test).transform(test)

col_vector += ['genres_lbl']

In [50]:
# print(items.select('region_id').distinct().rdd.flatMap(lambda x: x).collect())
# print(items.select('channel_id').distinct().rdd.flatMap(lambda x: x).collect())

In [51]:
# calculate tfidf on title col
from pyspark.ml.feature import HashingTF, IDF

train = train.withColumn('splited', f.split('title', '[^a-zA-ZА-Яа-я0-9+]{1,}'))

ht = HashingTF(inputCol='splited', outputCol='tf', numFeatures=dict_length, binary=True)
train = ht.transform(train)

idf = IDF(inputCol='tf', outputCol='tfidf').fit(train)
train = idf.transform(train)
# train.show(2, False, True)

test = test.withColumn('splited', f.split('title', '[^a-zA-ZА-Яа-я0-9+]{1,}'))

test = ht.transform(test)
test = idf.transform(test)
# test.show(2, False, True)
col_vector += ['tfidf']


# word2vec
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="splited", outputCol="word2vec")
model = word2Vec.fit(train)
train = model.transform(train).drop('tf', 'splited', 'title', 'region_id', 'channel_id')

test = model.transform(test).drop('tf', 'splited', 'title', 'region_id', 'channel_id')

In [25]:
# calculate item availability
import datetime

available = f.when(
    (train.datetime_availability_stop > datetime.datetime(2021, 1, 1, 0, 0)) &
    (train.datetime_availability_start < datetime.datetime(2021, 1, 1, 0, 0)),
    1
).otherwise(0)
train = train.withColumn('item_availability', available).drop('datetime_availability_stop', 
                                                              'datetime_availability_start')
# train.show(10, False, True)

available = f.when(
    (test.datetime_availability_stop > datetime.datetime(2021, 1, 1, 0, 0)) &
    (test.datetime_availability_start < datetime.datetime(2021, 1, 1, 0, 0)),
    1
).otherwise(0)
test = test.withColumn('item_availability', available).drop('datetime_availability_stop', 
                                                            'datetime_availability_start')
# test.show(10, False, True)
col_vector += ['item_availability']

In [53]:
# # items features vectorization
# from pyspark.ml.feature import VectorAssembler

# col_vector += ['year']

# train = train.fillna(-9999, subset=['year', 'viewing_time'])
# test = test.fillna(-9999, subset=['year', 'viewing_time'])

# train_assembler = VectorAssembler(inputCols=col_vector, 
#                                   outputCol="features")
# # drop_cols = ['datetime_availability_start', 
# #              'datetime_availability_stop',
# #              'datetime_show_start',
# #              'datetime_show_stop'] + col_vector
# train = train_assembler.transform(train).drop(*drop_cols)
# train.show(5, truncate=False)

# test_assembler = VectorAssembler(inputCols=col_vector, 
#                                   outputCol="features")
# test = test_assembler.transform(test).drop(*drop_cols)
# test.show(5, truncate=False)

In [54]:
# train.cache()
# test.cache()
# train.show(2, False, True)
# test.show(2, False, True)

In [26]:
# time to now
train = train.withColumn('years_to_now', f.lit(2021) - f.col('year'))
test = test.withColumn('years_to_now', f.lit(2021) - f.col('year'))
# train.show(1, False, True)
# test.show(1, False, True)
col_vector += ['years_to_now']

In [27]:


train = train.fillna(-9999, subset=[
    'content_type', 
    'year', 
    'items_viewed', 
    'user_mean_time_view', 
    'mean_view_type',
    'genres_lbl',
    'item_availability',
    'years_to_now'
                                   ])

test = test.fillna(-9999, subset=[
    'content_type', 
    'year', 
    'items_viewed', 
    'user_mean_time_view', 
    'mean_view_type',
    'genres_lbl',
    'item_availability',
    'years_to_now'
])


In [57]:
# # feature col without tfidf
# from pyspark.ml.feature import VectorAssembler
# col_vector = ['content_type', 
#               'year', 
#               'items_viewed', 
#               'user_mean_time_view',
#               'mean_view_type',
#               'genres_lbl',
#               'item_availability',
#               'years_to_now'
#              ]
# train_assembler = VectorAssembler(inputCols=col_vector, 
#                                   outputCol="features")
# train = train_assembler.transform(train)
# # train.show(1, vertical=True, truncate=False)

# test_assembler = VectorAssembler(inputCols=col_vector, 
#                                   outputCol="features")
# test = test_assembler.transform(test)
# # test.show(1, vertical=True, truncate=False)

In [58]:
# feature col with tfidf
from pyspark.ml.feature import VectorAssembler

col_vector = ['content_type', 
              'year', 
              'items_viewed', 
              'user_mean_time_view',
              'mean_view_type',
              'genres_lbl',
              'item_availability',
              'years_to_now',
              'tfidf'
             ]
train_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_tfidf")
train = train_assembler.transform(train)
# train.show(1, vertical=True, truncate=False)

test_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_tfidf")
test = test_assembler.transform(test)
# test.show(1, vertical=True, truncate=False)


col_vector = ['content_type', 
              'year', 
              'items_viewed', 
              'user_mean_time_view',
              'mean_view_type',
              'genres_lbl',
              'item_availability',
              'years_to_now',
              'word2vec'
             ]
train_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_w2v")
train = train_assembler.transform(train)
# train.show(1, vertical=True, truncate=False)

test_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_w2v")
test = test_assembler.transform(test)
# test.show(1, vertical=True, truncate=False)

In [88]:
# from pyspark.ml.feature import Normalizer
# normalizer = Normalizer(p=2.0, inputCol="features", outputCol="features_norm")
# train = normalizer.transform(train)
# test = normalizer.transform(test)

In [112]:
from pyspark.ml.linalg import Vectors

tfidf = f.when(
    (train.content_type == 0),
    Vectors.sparse(dict_length, [0], [0])
).otherwise(train.tfidf)
train.withColumn('tfidf_ed', tfidf).show()

AttributeError: 'SparseVector' object has no attribute '_get_object_id'

In [29]:
train = train.withColumn('year_edited', f.col('year') * f.col('content_type'))
train = train.withColumn('items_viewed_edited', f.col('items_viewed') * f.col('content_type'))
train = train.withColumn('user_mean_time_view_edited', f.col('user_mean_time_view') * f.col('content_type'))
train = train.withColumn('mean_view_type_edited', f.col('mean_view_type') * f.col('content_type'))
train = train.withColumn('genres_lbl_edited', f.col('genres_lbl') * f.col('content_type'))
train = train.withColumn('item_availability_edited', f.col('item_availability') * f.col('content_type'))
train = train.withColumn('years_to_now_edited', f.col('years_to_now') * f.col('content_type'))
# train = train.withColumn('tfidf_edited', f.col('tfidf') * f.col('content_type'))

test = test.withColumn('year_edited', f.col('year') * f.col('content_type'))
test = test.withColumn('items_viewed_edited', f.col('items_viewed') * f.col('content_type'))
test = test.withColumn('user_mean_time_view_edited', f.col('user_mean_time_view') * f.col('content_type'))
test = test.withColumn('mean_view_type_edited', f.col('mean_view_type') * f.col('content_type'))
test = test.withColumn('genres_lbl_edited', f.col('genres_lbl') * f.col('content_type'))
test = test.withColumn('item_availability_edited', f.col('item_availability') * f.col('content_type'))
test = test.withColumn('years_to_now_edited', f.col('years_to_now') * f.col('content_type'))
# test = test.withColumn('tfidf_edited', f.col('tfidf') * f.col('content_type'))

In [28]:
train = train.withColumn('year_edited', f.col('year') * f.col('content_type'))
train = train.withColumn('items_viewed_edited', f.col('items_viewed') * f.col('content_type'))
train = train.withColumn('user_mean_time_view_edited', f.col('user_mean_time_view') * f.col('content_type'))
train = train.withColumn('mean_view_type_edited', f.col('mean_view_type') * f.col('content_type'))
train = train.withColumn('genres_lbl_edited', f.col('genres_lbl') * f.col('content_type'))
train = train.withColumn('item_availability_edited', f.col('item_availability') * f.col('content_type'))
train = train.withColumn('years_to_now_edited', f.col('years_to_now') * f.col('content_type'))
# train = train.withColumn('tfidf_edited', f.col('tfidf') * f.col('content_type'))

test = test.withColumn('year_edited', f.col('year') * f.col('content_type'))
test = test.withColumn('items_viewed_edited', f.col('items_viewed') * f.col('content_type'))
test = test.withColumn('user_mean_time_view_edited', f.col('user_mean_time_view') * f.col('content_type'))
test = test.withColumn('mean_view_type_edited', f.col('mean_view_type') * f.col('content_type'))
test = test.withColumn('genres_lbl_edited', f.col('genres_lbl') * f.col('content_type'))
test = test.withColumn('item_availability_edited', f.col('item_availability') * f.col('content_type'))
test = test.withColumn('years_to_now_edited', f.col('years_to_now') * f.col('content_type'))
# test = test.withColumn('tfidf_edited', f.col('tfidf') * f.col('content_type'))

col_vector = ['year_edited',
              'items_viewed_edited',
              'user_mean_time_view_edited',
              'mean_view_type_edited',
              'genres_lbl_edited',
              'item_availability_edited',
              'years_to_now_edited',
              'tfidf'
             ]

train_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_tfidf_edited")
train = train_assembler.transform(train)

test_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_tfidf_edited")
test = test_assembler.transform(test)


col_vector = ['year_edited',
              'items_viewed_edited',
              'user_mean_time_view_edited',
              'mean_view_type_edited',
              'genres_lbl_edited',
              'item_availability_edited',
              'years_to_now_edited',
              'word2vec'
             ]

train_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_w2v_edited")
train = train_assembler.transform(train)

test_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features_w2v_edited")
test = test_assembler.transform(test)

NameError: name 'VectorAssembler' is not defined

### Fiting model

In [61]:
print(col_vector)

['year_edited', 'items_viewed_edited', 'user_mean_time_view_edited', 'mean_view_type_edited', 'genres_lbl_edited', 'item_availability_edited', 'years_to_now_edited', 'word2vec']


In [62]:
# Model
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier

lr = LogisticRegression(featuresCol='features_w2v_edited', labelCol="purchase", maxIter=20)

# rf = RandomForestClassifier(featuresCol='features_all', labelCol="purchase", numTrees=50, maxDepth=9, seed=42)

In [63]:
# Evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", 
                                          labelCol="purchase", 
                                          metricName='areaUnderROC')

In [64]:
# %%time
# model = lr.fit(train)
# preds = model.transform(test)
# preds.show(5)

In [65]:
# from pyspark.ml.classification import GBTClassifier

# gbt = GBTClassifier(maxIter=50, 
#                     maxDepth=7, 
#                     featuresCol='features', 
#                     labelCol="purchase", 
#                     subsamplingRate=0.7, 
#                     seed=42)

In [66]:
%%time
# Cros-val lr fitting
train.cache()
test.cache()
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [20])\
                              .build()

crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=5, parallelism=3)

cv_model = crossval.fit(train)
print('average metrics - ', cv_model.avgMetrics)


predictions_lr = cv_model.transform(test)

KeyboardInterrupt: 

In [67]:
# %%time
# # Cros-val rf fitting

# paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10])\
#                               .build()
# crossval = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid,
#                               evaluator=evaluator, numFolds=5, parallelism=3)


# cv_model = crossval.fit(train)
# print('average metrics - ', cv_model.avgMetrics)


# predictions_rf = cv_model.transform(test)

In [68]:
predictions = predictions_lr.orderBy(['user_id', 'item_id'], ascending=[1, 1])

NameError: name 'predictions_lr' is not defined

In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

predictions = predictions.withColumn('proba', to_array(col('probability')))

predictions.select(f.col('user_id'), 
                     f.col('item_id'),
                     f.col('proba').getItem(1).alias('purchase')
                    ).toPandas().to_csv('/data/home/farhad.sabilov/lab03.csv')

In [None]:
# train.write.csv('train_1000000.csv')
# test.write.csv('test_1000000.csv')

5032624
5032624


In [43]:
spark.stop()

In [31]:
from pyspark.sql.functions import col

train_purchases = train.groupBy('user_id')\
                .sum().select(col("sum(purchase)").alias("user_purchases"), col("user_id")).cache()
item_purchases = train.groupBy('item_id')\
                        .sum().select(col("sum(purchase)").alias("item_purchases"), col("item_id")).cache()


train = train.join(train_purchases, on='user_id', how='left')
test = test.join(train_purchases, on='user_id', how='left')

train = train.join(item_purchases, on='item_id', how='left')
test = test.join(item_purchases, on='item_id', how='left')

In [32]:

train_user_attempts = train.groupBy('user_id').count().select(col("count").alias("user_attempts"), col("user_id"))\
                            .cache()

train_item_attempts = train.groupBy('item_id').count().select(col("count").alias("item_attempts"), col("item_id"))\
                            .cache()


train = train.join(train_user_attempts, on='user_id', how='left')
test = test.join(train_user_attempts, on='user_id', how='left')

train = train.join(train_item_attempts, on='item_id', how='left')
test = test.join(train_item_attempts, on='item_id', how='left')


train = train.withColumn('user_addict', col('user_purchases') / col('user_attempts'))
test = test.withColumn('user_addict', col('user_purchases') / col('user_attempts'))

train = train.withColumn('item_addict', col('item_purchases') / col('item_attempts'))
test = test.withColumn('item_addict', col('item_purchases') / col('item_attempts'))

In [None]:
train = train.withColumn('items_viewed_edited', f.col('items_viewed') * f.col('content_type'))
train = train.withColumn('user_mean_time_view_edited', f.col('user_mean_time_view') * f.col('content_type'))
train = train.withColumn('mean_view_type_edited', f.col('mean_view_type') * f.col('content_type'))
train = train.withColumn('genres_lbl_edited', f.col('genres_lbl') * f.col('content_type'))
train = train.withColumn('item_availability_edited', f.col('item_availability') * f.col('content_type'))
train = train.withColumn('years_to_now_edited', f.col('years_to_now') * f.col('content_type'))
# train = train.withColumn('tfidf_edited', f.col('tfidf') * f.col('content_type'))

test = test.withColumn('items_viewed_edited', f.col('items_viewed') * f.col('content_type'))
test = test.withColumn('user_mean_time_view_edited', f.col('user_mean_time_view') * f.col('content_type'))
test = test.withColumn('mean_view_type_edited', f.col('mean_view_type') * f.col('content_type'))
test = test.withColumn('genres_lbl_edited', f.col('genres_lbl') * f.col('content_type'))
test = test.withColumn('item_availability_edited', f.col('item_availability') * f.col('content_type'))
test = test.withColumn('years_to_now_edited', f.col('years_to_now') * f.col('content_type'))
# test = test.withColumn('tfidf_edited', f.col('tfidf') * f.col('content_type'))

In [34]:
train = train.withColumn('item_purchases_edited', f.col('item_purchases') * f.col('content_type'))
train = train.withColumn('user_purchases_edited', f.col('user_purchases') * f.col('content_type'))
train = train.withColumn('user_addict_edited', f.col('user_addict') * f.col('content_type'))
train = train.withColumn('item_addict_edited', f.col('item_addict') * f.col('content_type'))

test = test.withColumn('item_purchases_edited', f.col('item_purchases') * f.col('content_type'))
test = test.withColumn('user_purchases_edited', f.col('user_purchases') * f.col('content_type'))
test = test.withColumn('user_addict_edited', f.col('user_addict') * f.col('content_type'))
test = test.withColumn('item_addict_edited', f.col('item_addict') * f.col('content_type'))

In [33]:
train.show(1, False, True)

-RECORD 0--------------------------------------------------
 item_id                    | 8389                         
 user_id                    | 746713                       
 purchase                   | 0                            
 channel_id                 | null                         
 datetime_show_start        | null                         
 datetime_show_stop         | null                         
 content_type               | 1                            
 title                      | пес в сапогах (сурдоперевод) 
 year                       | 1981.0                       
 region_id                  | null                         
 items_viewed               | 3                            
 item_sum_view              | 1481                         
 user_mean_time_view        | 493.6665021111659            
 mean_view_type             | 0.0                          
 genres_lbl                 | 550                          
 item_availability          | 1         

In [36]:
from pyspark.ml.feature import VectorAssembler

col_vector = ['items_viewed_edited',
              'user_mean_time_view_edited',
              'mean_view_type_edited',
              'genres_lbl_edited',
              'item_availability_edited',
              'years_to_now_edited',
              'item_purchases_edited',
              'user_purchases_edited',
              'user_addict_edited',
              'item_addict_edited'
             ]

train_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features")
train = train_assembler.transform(train)

test_assembler = VectorAssembler(inputCols=col_vector, 
                                  outputCol="features")
test = test_assembler.transform(test)



from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol="purchase", maxIter=10)

pipeline = Pipeline(stages=[
    lr
])


from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')

In [40]:
train.cache()
test.cache()
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [10])\
                              .build()

crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, 
                          evaluator=evaluator, numFolds=3, parallelism=3)

cv_model = crossval.fit(train)

predictions_valid = cv_model.transform(test)

In [41]:
predictions = predictions_valid.orderBy(['user_id', 'item_id'], ascending=[1, 1])

In [42]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

predictions = predictions.withColumn('proba', to_array(col('probability')))

predictions.select(f.col('user_id'), 
                     f.col('item_id'),
                     f.col('proba').getItem(1).alias('purchase')
                    ).toPandas().to_csv('/data/home/farhad.sabilov/lab03.csv')

In [43]:
# train.show(2, False, True)
# test.show(2, False, True)

In [44]:
# print(train.count())
# train = train.dropDuplicates(subset=['user_id', 'item_id'])
# print(train.count())

In [45]:
# repartition dfs
train = train.coalesce(3)
test = test.coalesce(3)
# print(train.rdd.getNumPartitions())
# print(test.rdd.getNumPartitions())

In [46]:
# df's count
# print(train.count())
# print(test.count())

In [47]:
# # calculate the duration of viewing video by user
# import pyspark.sql.functions as f
# train = train.withColumn('viewing_time', f.col('ts_end') - f.col('ts_start')).drop('ts_end', 'ts_start')
# test = test.withColumn('viewing_time', f.col('ts_end') - f.col('ts_start')).drop('ts_end', 'ts_start')
# # train.show(2, False, True)
# # test.show(2, False, True)
# col_vector += ['viewing_time']

In [48]:
# # item_type labling
# from itertools import chain
# from pyspark.sql.functions import create_map, lit
# from pyspark.ml.feature import StringIndexer

# train = train.fillna('9999', subset=['item_type'])
# test = test.fillna('9999', subset=['item_type'])

# item_types = train.select('item_type').distinct().rdd.flatMap(lambda x: x).collect()
# item_types_dict = dict(zip(item_types, range(len(item_types))))

# mapping_expr = create_map([lit(x) for x in chain(*item_types_dict.items())])

# train = train.withColumn('item_type_lbl', mapping_expr[train['item_type']]).drop('item_type')
# # train.show(2, False, True)

# item_types = test.select('item_type').distinct().rdd.flatMap(lambda x: x).collect()
# item_types_dict = dict(zip(item_types, range(len(item_types))))

# mapping_expr = create_map([lit(x) for x in chain(*item_types_dict.items())])

# test = test.withColumn('item_type_lbl', mapping_expr[test['item_type']]).drop('item_type')
# # test.show(2, False, True)
# #-------------------------------------------------------------------------------------------
# # train_indexer = StringIndexer(inputCol="item_type", outputCol="item_type_lbl") 
# # train = train_indexer.fit(train).transform(train)

# # test_indexer = StringIndexer(inputCol="item_type", outputCol="item_type_lbl")
# # test = test_indexer.fit(test).transform(test)

# col_vector += ['item_type_lbl']

In [49]:
# genres labling
train = train.fillna('Без жанра', subset=['genres'])
train = train.fillna(-9999, subset=['year'])
genres = train.select('genres').distinct().rdd.flatMap(lambda x: x).collect()
genres_dict = dict(zip(genres, range(len(genres))))

mapping_expr = create_map([lit(x) for x in chain(*genres_dict.items())])

train = train.withColumn('genres_lbl', mapping_expr[train['genres']]).drop('genres')
# train.show(2, False, True)

test = test.fillna('Без жанра', subset=['genres'])
test = test.fillna(-9999, subset=['year'])
genres = test.select('genres').distinct().rdd.flatMap(lambda x: x).collect()
genres_dict = dict(zip(genres, range(len(genres))))

mapping_expr = create_map([lit(x) for x in chain(*genres_dict.items())])

test = test.withColumn('genres_lbl', mapping_expr[test['genres']]).drop('genres')
# test.show(2, False, True)

#-------------------------------------------------------------------------------------------
# train_indexer = StringIndexer(inputCol="genres", outputCol="genres_lbl") 
# train = train_indexer.fit(train).transform(train)

# test_indexer = StringIndexer(inputCol="genres", outputCol="genres_lbl")
# test = test_indexer.fit(test).transform(test)

col_vector += ['genres_lbl']