In [1]:
import os
import sys
# Point PySpark at the cluster's interpreter and Spark distribution and size the
# executors/driver; this happens before any pyspark module is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Each entry is inserted at position 0 in turn, so the py4j archive ends up
# ahead of Spark's python directory on sys.path.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Name the application, then reuse an existing session or start a new one.
conf = SparkConf().set("spark.app.name", "ML app")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

In [4]:
# User purchases: train and test share one explicit (user_id, item_id, purchase) schema.
purchase_fields = [
    ("user_id", IntegerType()),
    ("item_id", IntegerType()),
    ("purchase", DoubleType()),
]
schema = StructType([StructField(name, dtype) for name, dtype in purchase_fields])

df_train = spark.read.csv('/labs/slaba03/laba03_train.csv', schema=schema, header=True)
test = spark.read.csv('/labs/slaba03/laba03_test.csv', schema=schema, header=True)

# Auxiliary tables are read with header-derived column names and no explicit
# schema; the items file is tab-separated.
df_views = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header=True)
df_items = spark.read.csv('/labs/slaba03/laba03_items.csv', sep='\t', header=True)

In [5]:
# Stratified sample on the purchase label (fraction 0.8 per class, fixed seed)
# becomes the training split.
stratum_fractions = {0: 0.8, 1: 0.8}
train = df_train.sampleBy("purchase", stratum_fractions, 5757)

# Rows of df_train whose (user_id, item_id) pair is absent from the sample are
# held out for validation.
valid = df_train.join(train, ["user_id", "item_id"], "leftanti")

In [6]:
from pyspark.sql.functions import col

# Per-user sum of the purchase column over the training split, cached for reuse.
user_purchases_column = col("sum(purchase)").alias("user_purchases")
train_purchases = (
    train
    .groupBy('user_id')
    .sum()
    .select(user_purchases_column, col("user_id"))
    .cache()
)

train_purchases.show(2)

+--------------+-------+
|user_purchases|user_id|
+--------------+-------+
|          56.0| 754230|
|           1.0| 761341|
+--------------+-------+
only showing top 2 rows



In [7]:
# Per-item sum of the purchase column over the training split, cached for reuse.
item_purchases_column = col("sum(purchase)").alias("item_purchases")
item_purchases = (
    train
    .groupBy('item_id')
    .sum()
    .select(item_purchases_column, col("item_id"))
    .cache()
)

item_purchases.show(2)

+--------------+-------+
|item_purchases|item_id|
+--------------+-------+
|           1.0|  95940|
|           1.0|  74757|
+--------------+-------+
only showing top 2 rows



In [8]:
# How many purchases did each user make, and how many times was each item bought?
# Every split receives the training-derived totals: first the per-user total
# (joined on user_id), then the per-item total (joined on item_id).
train, valid, test = [
    frame.join(train_purchases, on='user_id', how='left')
         .join(item_purchases, on='item_id', how='left')
    for frame in (train, valid, test)
]

In [9]:
# Row counts of the training split per user and per item ("attempts"), cached for reuse.
user_attempts_column = col("count").alias("user_attempts")
train_user_attempts = (
    train
    .groupBy('user_id')
    .count()
    .select(user_attempts_column, col("user_id"))
    .cache()
)

item_attempts_column = col("count").alias("item_attempts")
train_item_attempts = (
    train
    .groupBy('item_id')
    .count()
    .select(item_attempts_column, col("item_id"))
    .cache()
)

train_user_attempts.show(2)

+-------------+-------+
|user_attempts|user_id|
+-------------+-------+
|         2089| 754230|
|         2058| 761341|
+-------------+-------+
only showing top 2 rows



In [10]:
# Attach the training-derived attempt counts to every split: per-user counts
# first (joined on user_id), then per-item counts (joined on item_id).
train, valid, test = [
    frame.join(train_user_attempts, on='user_id', how='left')
         .join(train_item_attempts, on='item_id', how='left')
    for frame in (train, valid, test)
]

In [11]:
# user_addict = the user's purchase total divided by the user's attempt count.
user_purchase_rate = col('user_purchases') / col('user_attempts')
train, valid, test = [
    frame.withColumn('user_addict', user_purchase_rate)
    for frame in (train, valid, test)
]

In [12]:
# item_addict = the item's purchase total divided by the item's attempt count.
item_purchase_rate = col('item_purchases') / col('item_attempts')
train, valid, test = [
    frame.withColumn('item_addict', item_purchase_rate)
    for frame in (train, valid, test)
]

In [13]:
# Release the cached per-user / per-item lookup frames.
for cached_lookup in (train_purchases, item_purchases, train_user_attempts):
    cached_lookup.unpersist()
# Kept as a bare final expression so the cell still displays its result.
train_item_attempts.unpersist()

DataFrame[item_attempts: bigint, item_id: int]

In [14]:
from pyspark.ml.feature import VectorAssembler
# Columns that make up the feature vector for GBT.
cols = ['item_purchases', 'user_purchases', 'user_addict', 'item_addict']
assembler = VectorAssembler(inputCols=cols, outputCol="features")

# The per-user / per-item statistics were left-joined from the training split,
# so a user or item in valid/test that never occurs in train has nulls in these
# columns, and VectorAssembler rejects null inputs. Fill them with 0.0 (no
# purchases observed in train). The fill is limited to the feature columns so
# that other columns, such as the label, are left untouched.
train_data = assembler.transform(train.na.fill(0.0, subset=cols)).cache()
valid_data = assembler.transform(valid.na.fill(0.0, subset=cols))
test_data = assembler.transform(test.na.fill(0.0, subset=cols))

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve, measured against the "purchase" label column.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol="purchase",
)

In [16]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees on the "purchase" label, wrapped in a single-stage pipeline.
gbt = GBTClassifier(labelCol="purchase")
pipeline = Pipeline(stages=[gbt])

In [22]:
# Drops the cache that was requested for train_data when it was assembled.
# NOTE(review): this cell runs before the cell that calls gbt.fit(train_data),
# so the cache is released before training reads it — consider moving this call
# after the fit; confirm the intended order.
train_data.unpersist()

DataFrame[item_id: int, user_id: int, purchase: double, user_purchases: double, item_purchases: double, user_attempts: bigint, item_attempts: bigint, user_addict: double, item_addict: double, features: vector]

In [23]:
from pyspark.ml.classification import GBTClassifier

# Hyperparameters for the gradient-boosted trees classifier.
gbt_params = {
    "labelCol": "purchase",
    "maxDepth": 4,
    "minInstancesPerNode": 3,
    "maxBins": 50,
}
gbt = GBTClassifier(**gbt_params)

# Fit on the assembled training split, then score the held-out validation split.
gbt_model = gbt.fit(train_data)
predictions_valid = gbt_model.transform(valid_data)

In [24]:
# Area under the ROC curve of the validation predictions against the "purchase" label.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol="purchase",
)
score = evaluator.evaluate(predictions_valid)
score

0.8841512308258217

In [25]:
# Score the assembled test set with the fitted GBT model; the "probability"
# column of this frame is what gets exported in the next cell.
test_predictions = gbt_model.transform(test_data)

In [26]:
# Collect (user_id, item_id, probability vector) to pandas, ordered by user then item.
predictions_pd = (
    test_predictions
    .select("user_id", "item_id", col("probability").alias("purchase"))
    .toPandas()
    .sort_values(by=['user_id', 'item_id'])
)
# Keep only element 1 of each probability vector as the purchase score.
predictions_pd['purchase'] = predictions_pd['purchase'].map(lambda probability: probability[1])
predictions_pd.to_csv('lab03.csv', index=False)