### Laba 3. Content Recommendation

- В laba03_train.csv содержатся факты покупки (колонка purchase) пользователями (колонка user_id) телепередач (колонка item_id). Такой формат файла вам уже знаком.

- laba03_test.csv — тестовый датасет без указанного целевого признака purchase, который вам и предстоит предсказать.


- laba03_items.csv — дополнительные данные по items. В данном файле много лишней или ненужной информации, так что задача её фильтрации и отбора ложится на вас. Поля в файле, на которых хотелось бы остановиться:
    - item_id — primary key. Соответствует item_id в предыдущем файле.
    - content_type — тип телепередачи (1 — платная, 0 — бесплатная). Вас интересуют платные передачи.
    - title — название передачи, текстовое поле.
    - year — год выпуска передачи, число.
    - genres — поле с жанрами передачи, разделёнными через запятую.


- Дополнительный файл laba03_views_programmes.csv по просмотрам передач с полями:
    - ts_start — время начала просмотра.
    - ts_end — время окончания просмотра.
    - item_type — тип просматриваемого контента:
        - live — просмотр "вживую", в момент показа контента в эфире.
        - pvr — просмотр в записи, после показа контента в эфире.

In [1]:
import os
import sys
import re
import json
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 6 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
#exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, LongType, StructType, StructField, StringType, ArrayType
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col


conf = SparkConf()
conf.set("spark.app.name", "karavaev_andrei_spark_lab_2_second") 
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [2]:
import numpy as np
from statistics import mode, mean

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, concat
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier

from pyspark.sql.types import *
import pyspark.sql.functions as f
from pyspark.sql.functions import countDistinct

from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.sql.functions import isnan, when, count, col

# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [100]:
items_schema = StructType(fields=[
    StructField("item_id", StringType()),
    StructField("channel_id", StringType()),
    StructField("datetime_availability_start", TimestampType()),
    StructField("datetime_availability_stop", TimestampType()),
    StructField("datetime_show_start", TimestampType()),
    StructField("datetime_show_stop", TimestampType()),
    StructField("content_type", StringType()),
    StructField("title", StringType()),
    StructField("year", FloatType()),
    StructField("genres", StringType()),
    StructField("region_id", StringType())
])

items = spark.read.csv("/labs/slaba03/laba03_items.csv", header=True, sep='\t')

In [101]:
data_schema = StructType(fields=[
    StructField('user_id', LongType()),
    StructField('item_id', LongType()),
    StructField('purchase', IntegerType()) 
])

train = spark.read.csv("/labs/slaba03/laba03_train.csv", schema=data_schema, header=True, sep=',')
test = spark.read.csv("/labs/slaba03/laba03_test.csv", schema=data_schema, header=True, sep=',')

In [102]:
views_schema = StructType(fields=[
    StructField('user_id', LongType()),
    StructField('item_id', LongType()),
    StructField('ts_start', IntegerType()),
    StructField('ts_end', IntegerType()),
    StructField('item_type', StringType())
])

views = spark.read.csv("/labs/slaba03/laba03_views_programmes.csv", schema=views_schema, header=True, sep=',')

In [103]:
print('items', (items.count(), len(items.columns)))
print('test', (test.count(), len(test.columns)))
print('train', (train.count(), len(train.columns)))
print('views', (items.count(), len(views.columns)))

items (635568, 11)
test (2156840, 3)
train (5032624, 3)
views (635568, 5)


In [104]:
items = items.select(['item_id', 'channel_id', 'region_id', 'year', 'title', 'genres'])

#### Checking nulls

In [105]:
items.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in items.columns]).show()

+-------+----------+---------+------+-----+------+
|item_id|channel_id|region_id|  year|title|genres|
+-------+----------+---------+------+-----+------+
|      0|      3704|   362264|631868|    0|    33|
+-------+----------+---------+------+-----+------+



In [106]:
items = items.fillna({'genres':'Приключения,Триллер', 'year': 2010.0})

In [107]:
items.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in items.columns]).show()

+-------+----------+---------+----+-----+------+
|item_id|channel_id|region_id|year|title|genres|
+-------+----------+---------+----+-----+------+
|      0|      3704|   362264|   0|    0|     0|
+-------+----------+---------+----+-----+------+



In [108]:
train.dtypes

[('user_id', 'bigint'), ('item_id', 'bigint'), ('purchase', 'int')]

### Feature Egineering

In [109]:
print(train.select('user_id').distinct().count(), train.select('item_id').distinct().count())

1941 3704


In [110]:
user_items = train.groupBy('user_id').sum()\
                  .select(f.col('user_id'), f.col('sum(purchase)').cast(IntegerType()).alias('user_items'))

item_users = train.groupBy('item_id').sum()\
                  .select(f.col('item_id'), f.col('sum(purchase)').cast(IntegerType()).alias('item_users'))

In [111]:
train = train.join(user_items, 'user_id', how='left')
train = train.join(item_users, 'item_id', how='left')
#train = train.join(user_views_features, 'user_id', how='left')

test = test.join(user_items, 'user_id', how='left')
test = test.join(item_users, 'item_id', how='left')
#test = test.join(user_views_features, 'user_id', how='left')

In [112]:
print(train.rdd.getNumPartitions())
print(test.rdd.getNumPartitions())

train = train.coalesce(4)
test = test.coalesce(4)

print(train.rdd.getNumPartitions())
print(test.rdd.getNumPartitions())

200
200
4
4


In [113]:
print(train.columns)
print(test.columns)

['item_id', 'user_id', 'purchase', 'user_items', 'item_users']
['item_id', 'user_id', 'purchase', 'user_items', 'item_users']


### Vectorize genres feature ("multilabel binarizer")

In [114]:
split_udf = udf(lambda x: str(x.replace(',', ' ')), StringType())

items = items.withColumn('genres_split', split_udf(items.genres))

In [115]:
tokenizer = Tokenizer(inputCol='genres_split', outputCol='words')
items = tokenizer.transform(items)

hasher = HashingTF(inputCol='words', outputCol='genres_dummies', numFeatures=85)
items = hasher.transform(items)

In [116]:
items.where('item_id=396').select('genres_dummies').collect()

[Row(genres_dummies=SparseVector(85, {1: 1.0, 8: 1.0, 55: 1.0, 56: 1.0, 57: 1.0}))]

In [117]:
items.where('item_id=396').select('words').collect()

[Row(words=['детективы', 'триллеры', 'драмы', 'фантастика', 'зарубежные'])]

In [118]:
items.columns

['item_id',
 'channel_id',
 'region_id',
 'year',
 'title',
 'genres',
 'genres_split',
 'words',
 'genres_dummies']

### Vectorize title feature ("TFIDF")

In [119]:
from pyspark.ml.feature import StopWordsRemover
import nltk
nltk.download("stopwords")


stopwordList = nltk.corpus.stopwords.words('russian')
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="stopWordsRem", stopWords=stopwordList)

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/andrey.karavaev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [120]:
from pyspark.ml.feature import StopWordsRemover
import nltk
nltk.download("stopwords")


tokenizer = Tokenizer(inputCol='title', outputCol='title_words')
items = tokenizer.transform(items)

stopwordList = nltk.corpus.stopwords.words('russian')
remover = StopWordsRemover(inputCol='title_words', outputCol='title_cleaned', stopWords=stopwordList)
# remover = StopWordsRemover(inputCol='title_words', outputCol='title_cleaned')
items = remover.transform(items)

hasher = HashingTF(inputCol='title_cleaned', outputCol='TF_title', numFeatures=1000)
items = hasher.transform(items)

idf = IDF(inputCol='TF_title', outputCol='TFIDF')
idfModel = idf.fit(items)
items = idfModel.transform(items)

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/andrey.karavaev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Pipeline

In [123]:
items.columns

['item_id',
 'channel_id',
 'region_id',
 'year',
 'title',
 'genres',
 'genres_split',
 'words',
 'genres_dummies',
 'title_words',
 'title_cleaned',
 'TF_title',
 'TFIDF']

In [124]:
#items = items.withColumn('content_type', items.content_type.cast(IntegerType()))
#items = items.select('item_id', 'year', 'genres_dummies', 'TFIDF')
items = items.select('item_id', 'year', 'genres_dummies')

In [125]:
# items = items.withColumn('content_type', items.content_type.cast(IntegerType()))
# items = items.select('item_id', 'year', 'content_type', 'genres_dummies', 'TFIDF')

In [126]:
train_sample = train.join(items, 'item_id', how='left')
test_sample = test.join(items, 'item_id', how='left')

print(train_sample.rdd.getNumPartitions())
print(test_sample.rdd.getNumPartitions())
train_sample = train_sample.coalesce(4)
test_sample = test_sample.coalesce(4)
print(train_sample.rdd.getNumPartitions())
print(test_sample.rdd.getNumPartitions())

4
4
4
4


In [127]:
train_sample.take(1)

[Row(item_id=3764, user_id=773825, purchase=0, user_items=2, item_users=3, year='2013.0', genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}))]

In [128]:
print('loc')

loc


In [129]:
train_sample = train_sample.withColumn("year", train_sample.year.astype(LongType()))

In [130]:
test_sample = test_sample.withColumn("year", test_sample.year.astype(LongType()))

In [131]:
train_sample.take(1)

[Row(item_id=3764, user_id=741210, purchase=0, user_items=1, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}))]

In [132]:
test_sample.take(1)

[Row(item_id=3764, user_id=876215, purchase=None, user_items=23, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}))]

In [133]:
print(train_sample.columns)
print(test_sample.columns)

['item_id', 'user_id', 'purchase', 'user_items', 'item_users', 'year', 'genres_dummies']
['item_id', 'user_id', 'purchase', 'user_items', 'item_users', 'year', 'genres_dummies']


In [134]:
print(train_sample.columns)
print(test_sample.columns)

['item_id', 'user_id', 'purchase', 'user_items', 'item_users', 'year', 'genres_dummies']
['item_id', 'user_id', 'purchase', 'user_items', 'item_users', 'year', 'genres_dummies']


In [135]:
train_sample.cache()
test_sample.cache()

DataFrame[item_id: bigint, user_id: bigint, purchase: int, user_items: int, item_users: int, year: bigint, genres_dummies: vector]

In [136]:
train_sample.take(1)

[Row(item_id=3764, user_id=728960, purchase=0, user_items=1, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}))]

In [137]:
print(train_sample.count())
print(test_sample.count())

5032624
2156840


# ALS model

In [44]:
als = ALS(
         userCol="user_id", 
         itemCol="item_id",
         ratingCol="purchase", 
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [20]) \
            .addGrid(als.regParam, [.05]) \
            .build()

# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(
           metricName="rmse", 
           labelCol="purchase", 
           predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

Num models to be tested:  1


In [45]:
train_sample.columns

['item_id', 'user_id', 'purchase', 'features']

In [46]:
#Fit cross validator to the 'train' dataset
model = cv.fit(train_sample)
#Extract best model from the cv model above
best_model = model.bestModel

In [47]:
factors_sdf = best_model.userFactors
factors_sdf = factors_sdf.withColumnRenamed('id', 'user_id')
factors_sdf = factors_sdf.withColumnRenamed('features', 'user_feat')

In [48]:
items_factors_sdf = best_model.itemFactors
items_factors_sdf = items_factors_sdf.withColumnRenamed('id', 'item_id')
items_factors_sdf = items_factors_sdf.withColumnRenamed('features', 'item_feat')

In [138]:
train_sample = train_sample.join(factors_sdf, on='user_id', how='left')
train_sample = train_sample.join(items_factors_sdf, on='item_id', how='left')

In [139]:
test_sample = test_sample.join(factors_sdf, on='user_id', how='left')
test_sample = test_sample.join(items_factors_sdf, on='item_id', how='left')

In [140]:
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

In [141]:
train_sample = train_sample.withColumn("user_feat_vector", list_to_vector_udf(train_sample["user_feat"]))
train_sample = train_sample.withColumn("item_feat_vector", list_to_vector_udf(train_sample["item_feat"]))

In [142]:
test_sample = test_sample.withColumn("user_feat_vector", list_to_vector_udf(test_sample["user_feat"]))
test_sample = test_sample.withColumn("item_feat_vector", list_to_vector_udf(test_sample["item_feat"]))

In [143]:
test_sample.take(1)

[Row(item_id=3764, user_id=842257, purchase=None, user_items=21, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}), user_feat=[1.3482138718151973e-15, 2.151358018665753e-15, 3.6773655884172855e-15, 5.2052550536193804e-15, 3.0427869273009665e-15, 2.087026925112973e-15, 2.0908597491992986e-15, 6.019219411096399e-15, 2.710286261438659e-15, 3.866496610594383e-15, 7.0087372761225026e-15, 6.973334687059062e-15, 4.02161925993922e-15, 5.327658090761205e-15, 1.9604200634539253e-15, 3.7767140828007424e-15, 2.7520670085949247e-15, 2.6954894428830737e-15, 2.7227043997201175e-15, 5.819863854212995e-15], item_feat=[2.5648847974712227e-15, 4.09221522092813e-15, 6.995592171814063e-15, 9.89247407406045e-15, 5.786538189936222e-15, 3.968942280949475e-15, 3.974367526976639e-15, 1.1443913172778583e-14, 5.154312798105612e-15, 7.351346433177343e-15, 1.3328286194711544e-14, 1.325588351244614e-14, 7.647659733950578e-15, 1.0133072935695087e-14, 3.728722890075208e-15, 

In [144]:
train_sample.take(1)

[Row(item_id=3764, user_id=863262, purchase=0, user_items=2, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}), user_feat=[2.6974865348144625e-16, 4.305267829964228e-16, 7.357663816656202e-16, 1.0431258391039992e-15, 6.090166890758419e-16, 4.178016482111438e-16, 4.187074440691139e-16, 1.2054968670158466e-15, 5.426501224112506e-16, 7.743041573042043e-16, 1.4031506288290983e-15, 1.3970574973230245e-15, 8.052225539448808e-16, 1.0663646115283945e-15, 3.9242289998506967e-16, 7.556658855141431e-16, 5.509393456517588e-16, 5.393783517733625e-16, 5.448954479357442e-16, 1.1653298235032188e-15], item_feat=[2.5648847974712227e-15, 4.09221522092813e-15, 6.995592171814063e-15, 9.89247407406045e-15, 5.786538189936222e-15, 3.968942280949475e-15, 3.974367526976639e-15, 1.1443913172778583e-14, 5.154312798105612e-15, 7.351346433177343e-15, 1.3328286194711544e-14, 1.325588351244614e-14, 7.647659733950578e-15, 1.0133072935695087e-14, 3.728722890075208e-15, 7.1858

In [145]:
def normalize_spark_vectors(sdf, columns):
    for column in columns:
        normalizer = Normalizer(inputCol=column, outputCol=f"{column}_norm")
        sdf = normalizer.transform(sdf)
    return sdf

In [146]:
embedding_names = ['user_feat_vector', 'item_feat_vector']
embedding_names_norm = [f'{column}_norm' for column in embedding_names]

In [147]:
train_sample = normalize_spark_vectors(train_sample, embedding_names)

In [148]:
test_sample = normalize_spark_vectors(test_sample, embedding_names)

In [149]:
def dense_to_array(sdf, columns):
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
    for column in columns:
        sdf = sdf.withColumn(column, to_array(column))
    return sdf

In [150]:
train_sample = dense_to_array(train_sample, embedding_names_norm)

In [151]:
test_sample = dense_to_array(test_sample, embedding_names_norm)

In [152]:
train_sample = train_sample.withColumn('feature_array', concat(col(embedding_names_norm[0]), col(embedding_names_norm[1])))

In [153]:
test_sample = test_sample.withColumn('feature_array', concat(col(embedding_names_norm[0]), col(embedding_names_norm[1])))

In [154]:
test_sample = test_sample.withColumn("feature_array", list_to_vector_udf(test_sample["feature_array"]))

In [155]:
train_sample = train_sample.withColumn("feature_array", list_to_vector_udf(train_sample["feature_array"]))

In [163]:
train_sample.take(1)

[Row(item_id=3764, user_id=728960, purchase=0, user_items=1, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}), user_feat=[1.5540002616528336e-16, 2.479953650882981e-16, 4.238632806794916e-16, 6.004172929579611e-16, 3.5076870484507976e-16, 2.4061868749955843e-16, 2.410827556755354e-16, 6.941094071987721e-16, 3.125062149143533e-16, 4.458511971990292e-16, 8.080770020375686e-16, 8.042728182527719e-16, 4.637057517546434e-16, 6.141966131856572e-16, 2.2601798354104203e-16, 4.3533988890051773e-16, 3.1730063315404944e-16, 3.1070894334917767e-16, 3.1385522076197417e-16, 6.710418413088406e-16], item_feat=[2.5648847974712227e-15, 4.09221522092813e-15, 6.995592171814063e-15, 9.89247407406045e-15, 5.786538189936222e-15, 3.968942280949475e-15, 3.974367526976639e-15, 1.1443913172778583e-14, 5.154312798105612e-15, 7.351346433177343e-15, 1.3328286194711544e-14, 1.325588351244614e-14, 7.647659733950578e-15, 1.0133072935695087e-14, 3.728722890075208e-15, 7.1858

In [157]:
test_sample.take(1)

[Row(item_id=3764, user_id=842257, purchase=None, user_items=21, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}), user_feat=[1.3482138718151973e-15, 2.151358018665753e-15, 3.6773655884172855e-15, 5.2052550536193804e-15, 3.0427869273009665e-15, 2.087026925112973e-15, 2.0908597491992986e-15, 6.019219411096399e-15, 2.710286261438659e-15, 3.866496610594383e-15, 7.0087372761225026e-15, 6.973334687059062e-15, 4.02161925993922e-15, 5.327658090761205e-15, 1.9604200634539253e-15, 3.7767140828007424e-15, 2.7520670085949247e-15, 2.6954894428830737e-15, 2.7227043997201175e-15, 5.819863854212995e-15], item_feat=[2.5648847974712227e-15, 4.09221522092813e-15, 6.995592171814063e-15, 9.89247407406045e-15, 5.786538189936222e-15, 3.968942280949475e-15, 3.974367526976639e-15, 1.1443913172778583e-14, 5.154312798105612e-15, 7.351346433177343e-15, 1.3328286194711544e-14, 1.325588351244614e-14, 7.647659733950578e-15, 1.0133072935695087e-14, 3.728722890075208e-15, 

In [164]:
assembler = VectorAssembler(inputCols=['user_items', 'item_users', 'year', 'genres_dummies'], outputCol='features_base')
train_sample = assembler.transform(train_sample)
test_sample = assembler.transform(test_sample)

In [165]:
train_sample.take(1)

[Row(item_id=3764, user_id=814771, purchase=0, user_items=16, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}), user_feat=[5.6330692665417805e-16, 8.99016735655836e-16, 1.5364685325292417e-15, 2.1776109578329527e-15, 1.2716780922836669e-15, 8.723806199591221e-16, 8.741912058234374e-16, 2.5168925459545044e-15, 1.133046433268042e-15, 1.6166513182942939e-15, 2.9297861676319016e-15, 2.91665715694946e-15, 1.6812781322247841e-15, 2.226678941192684e-15, 8.194108852069824e-16, 1.5780432389210877e-15, 1.1503843506653903e-15, 1.1263372970510776e-15, 1.1378106759589923e-15, 2.4331129406580552e-15], item_feat=[2.5648847974712227e-15, 4.09221522092813e-15, 6.995592171814063e-15, 9.89247407406045e-15, 5.786538189936222e-15, 3.968942280949475e-15, 3.974367526976639e-15, 1.1443913172778583e-14, 5.154312798105612e-15, 7.351346433177343e-15, 1.3328286194711544e-14, 1.325588351244614e-14, 7.647659733950578e-15, 1.0133072935695087e-14, 3.728722890075208e-15, 7.

In [166]:
test_sample.take(1)

[Row(item_id=3764, user_id=867836, purchase=None, user_items=0, item_users=3, year=2013, genres_dummies=SparseVector(85, {8: 1.0, 46: 1.0, 55: 1.0, 57: 1.0}), user_feat=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], item_feat=[2.5648847974712227e-15, 4.09221522092813e-15, 6.995592171814063e-15, 9.89247407406045e-15, 5.786538189936222e-15, 3.968942280949475e-15, 3.974367526976639e-15, 1.1443913172778583e-14, 5.154312798105612e-15, 7.351346433177343e-15, 1.3328286194711544e-14, 1.325588351244614e-14, 7.647659733950578e-15, 1.0133072935695087e-14, 3.728722890075208e-15, 7.185813325394277e-15, 5.234126595143017e-15, 5.127598225982159e-15, 5.178115694472879e-15, 1.1065357207991691e-14], user_feat_vector=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), item_feat_vector=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [167]:
#gb = GBTClassifier(labelCol='purchase', featuresCol='features', maxIter=60)
#gb = RandomForestClassifier(labelCol='purchase', featuresCol='features_new', numTrees=100)
gb = LogisticRegression(labelCol='purchase', featuresCol='features_base', maxIter=60)

In [90]:
train_sample.take(1)

[Row(item_id=3764, user_id=773825, purchase=0, features=SparseVector(88, {0: 2.0, 1: 3.0, 2: 2013.0, 11: 1.0, 49: 1.0, 58: 1.0, 60: 1.0}), user_feat=[3.0421656815737147e-16, 4.854413583350176e-16, 8.297943444700176e-16, 1.1744736500479474e-15, 6.866311650657007e-16, 4.709204607620003e-16, 4.718453678008428e-16, 1.3581581444950812e-15, 6.115110422868282e-16, 8.724307007821285e-16, 1.5814528641711414e-15, 1.573366346502823e-15, 9.074088204598764e-16, 1.2021310757208132e-15, 4.4233174883340795e-16, 8.521239329046539e-16, 6.209504303905894e-16, 6.082189957977652e-16, 6.144054597467146e-16, 1.313249728181079e-15], item_feat=[2.5648847974712227e-15, 4.09221522092813e-15, 6.995592171814063e-15, 9.89247407406045e-15, 5.786538189936222e-15, 3.968942280949475e-15, 3.974367526976639e-15, 1.1443913172778583e-14, 5.154312798105612e-15, 7.351346433177343e-15, 1.3328286194711544e-14, 1.325588351244614e-14, 7.647659733950578e-15, 1.0133072935695087e-14, 3.728722890075208e-15, 7.185813325394277e-15, 5.

In [168]:
model = gb.fit(train_sample)

In [None]:
model

In [75]:
predict_test = model.transform(test_sample)

In [76]:
predict = predict_test.select(f.col('user_id').cast(IntegerType()), f.col('item_id').cast(IntegerType()), 
                              'probability')

In [77]:
extract_udf = udf(lambda x: float(x[1]), FloatType())

In [78]:
predict = predict.withColumn('proba', extract_udf(predict.probability))

In [79]:
predict = predict.select('user_id', 'item_id', 'proba').orderBy(['user_id', 'item_id'])

In [80]:
predict.show()

+-------+-------+------------+
|user_id|item_id|       proba|
+-------+-------+------------+
|   1654|    336|3.0465933E-4|
|   1654|    678|2.5438727E-4|
|   1654|    691|3.4608628E-4|
|   1654|    696|0.0023978103|
|   1654|    763|0.0025053618|
|   1654|    795| 0.004865705|
|   1654|    861|0.0026504558|
|   1654|   1137|0.0034343926|
|   1654|   1159|0.0029824148|
|   1654|   1428|0.0016794871|
|   1654|   1685|0.0014787787|
|   1654|   1686|0.0014529492|
|   1654|   1704|0.0024808447|
|   1654|   2093|3.0876778E-4|
|   1654|   2343|0.0017559158|
|   1654|   2451| 3.578144E-4|
|   1654|   2469|   0.0050459|
|   1654|   2603|0.0021950793|
|   1654|   2609|2.4468554E-4|
|   1654|   2621|0.0022136455|
+-------+-------+------------+
only showing top 20 rows



In [81]:
predict_df = predict.toPandas()
predict_df.columns = ['user_id', 'item_id', 'purchase']


In [82]:
predict_df = predict_df.sort_values(['user_id', 'item_id'])
predict_df.reset_index(drop=True, inplace=True)

In [83]:
predict_df['purchase'].max()

0.9741041

In [84]:
predict_df.to_csv('/data/home/andrey.karavaev/lab03.csv')

In [85]:
predict_df

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.000305
1,1654,678,0.000254
2,1654,691,0.000346
3,1654,696,0.002398
4,1654,763,0.002505
5,1654,795,0.004866
6,1654,861,0.002650
7,1654,1137,0.003434
8,1654,1159,0.002982
9,1654,1428,0.001679


In [3]:
spark.stop()

NameError: name 'spark' is not defined