In [1]:
import os
import sys
# Bootstrap an interactive PySpark session inside the notebook kernel.
# Submit options: request 5 executors with 5 GB of memory each.
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 5g pyspark-shell'
# Alternative kept for reference: run with the cluster's default resources.
#os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j bridge importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell start-up script in this namespace (the recorded output
# reports "SparkSession available as 'spark'").  The file is opened with a
# context manager so the handle is closed instead of being leaked.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    shell_source = shell_file.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
import pandas as pd
import numpy as np

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

In [48]:
# USERS PURCHASES
# Explicit schema for the labelled logs: integer user/item ids plus a
# double-typed `purchase` column.
schema = (
    StructType()
    .add("user_id", IntegerType())
    .add("item_id", IntegerType())
    .add("purchase", DoubleType())
)

# Train and test share the same reader options.
labelled_csv_options = dict(header=True, schema=schema)
df_train = spark.read.csv('/labs/slaba03/laba03_train.csv', **labelled_csv_options)
test = spark.read.csv('/labs/slaba03/laba03_test.csv', **labelled_csv_options)

# Auxiliary tables are read without an explicit schema; the items file is tab-separated.
df_views = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header=True)
df_items = spark.read.csv('/labs/slaba03/laba03_items.csv', header=True, sep='\t')

In [49]:
# Sanity check: confirm the explicit schema was applied to the training frame.
df_train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: double (nullable = true)



In [50]:
# Draw 80% of each `purchase` class (0 and 1) into the training part,
# using a fixed seed for the sampling.
split_fractions = {label: 0.8 for label in (0, 1)}
train = df_train.sampleBy("purchase", fractions=split_fractions, seed=5757)

# Every (user_id, item_id) pair that was not drawn into `train` goes to validation.
valid = df_train.join(train, on=["user_id", "item_id"], how="leftanti")



In [51]:
from pyspark.sql.functions import col

# Sum of `purchase` per user, computed on the training part only.
train_purchases = (
    train.groupBy('user_id')
    .sum('purchase')
    .withColumnRenamed("sum(purchase)", "user_purchases")
    .select("user_purchases", "user_id")
    .cache()
)

train_purchases.show(2)

+--------------+-------+
|user_purchases|user_id|
+--------------+-------+
|          56.0| 754230|
|           1.0| 761341|
+--------------+-------+
only showing top 2 rows



In [52]:
# Sum of `purchase` per item, computed on the training part only.
item_purchases = (
    train.groupBy('item_id')
    .sum('purchase')
    .withColumnRenamed("sum(purchase)", "item_purchases")
    .select("item_purchases", "item_id")
    .cache()
)

item_purchases.show(2)

+--------------+-------+
|item_purchases|item_id|
+--------------+-------+
|           1.0|  95940|
|           1.0|  74757|
+--------------+-------+
only showing top 2 rows



In [53]:
# How many purchases did each user make, and how many times was each item purchased?
# Attach the per-user totals first, then the per-item totals, to every split.
# NOTE(review): both totals were computed from `train`, so for training rows they
# include the row's own `purchase` value — confirm this label leakage is acceptable.
train, valid, test = (
    frame.join(train_purchases, on='user_id', how='left')
         .join(item_purchases, on='item_id', how='left')
    for frame in (train, valid, test)
)

In [54]:
# Number of training rows per user.
train_user_attempts = (
    train.groupBy('user_id')
    .count()
    .withColumnRenamed("count", "user_attempts")
    .select("user_attempts", "user_id")
    .cache()
)

# Number of training rows per item.
train_item_attempts = (
    train.groupBy('item_id')
    .count()
    .withColumnRenamed("count", "item_attempts")
    .select("item_attempts", "item_id")
    .cache()
)

train_user_attempts.show(2)

+-------------+-------+
|user_attempts|user_id|
+-------------+-------+
|         2089| 754230|
|         2058| 761341|
+-------------+-------+
only showing top 2 rows



In [55]:
# Attach the per-user and then the per-item attempt counts to every split.
train, valid, test = (
    frame.join(train_user_attempts, on='user_id', how='left')
         .join(train_item_attempts, on='item_id', how='left')
    for frame in (train, valid, test)
)

In [56]:
# user_addict = a user's purchase total divided by the number of training rows for that user.
user_addict_ratio = col('user_purchases') / col('user_attempts')
train, valid, test = (
    frame.withColumn('user_addict', user_addict_ratio)
    for frame in (train, valid, test)
)

In [58]:
# item_addict = an item's purchase total divided by the number of training rows for that item.
item_addict_ratio = col('item_purchases') / col('item_attempts')
train, valid, test = (
    frame.withColumn('item_addict', item_addict_ratio)
    for frame in (train, valid, test)
)

In [59]:
# Drop the cached aggregate frames now that they have been joined into the splits.
for cached_frame in (train_purchases, item_purchases, train_user_attempts):
    cached_frame.unpersist()
# Kept as the final expression so the cell still displays this frame's repr.
train_item_attempts.unpersist()

DataFrame[item_attempts: bigint, item_id: int]

In [60]:
from pyspark.ml.feature import VectorAssembler
# Columns that make up the feature vector for the GBT model.
cols = ['item_purchases', 'user_purchases', 'user_addict', 'item_addict']
assembler = VectorAssembler(inputCols=cols, outputCol="features")

# The statistics were attached with left joins, so a user or item that never
# appears in the training part ends up with nulls in these columns.
# VectorAssembler raises on null input by default, so replace missing
# statistics with 0.0 before assembling.  Rows without nulls are unaffected.
train_data = assembler.transform(train.na.fill(0.0, subset=cols)).cache()
valid_data = assembler.transform(valid.na.fill(0.0, subset=cols))
test_data = assembler.transform(test.na.fill(0.0, subset=cols))

In [61]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees classifier predicting `purchase`; all other
# parameters are left at their defaults and tuned through the grid later.
gbt = GBTClassifier(labelCol="purchase")

# Single-stage pipeline wrapping the classifier.
pipeline = Pipeline(stages=[gbt])

In [62]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score models by area under the ROC curve against the `purchase` label.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol="purchase",
)


In [63]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [64]:
# Hyper-parameter grid: 2 x 2 x 2 = 8 combinations.
grid_axes = [
    (gbt.maxDepth, [3, 4]),
    (gbt.minInstancesPerNode, [2, 3]),
    (gbt.maxBins, [50, 55]),
]

grid_builder = ParamGridBuilder()
for grid_param, grid_values in grid_axes:
    grid_builder = grid_builder.addGrid(grid_param, grid_values)
paramGrid = grid_builder.build()

In [65]:
# 3-fold cross-validation over the parameter grid, with parallelism set to 3.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=3,
)

# Expensive step: the recorded run of this cell ended in KeyboardInterrupt.
cv_model = crossval.fit(train_data)

KeyboardInterrupt: 

In [None]:
# Average cross-validation metric for each parameter combination in the grid.
# NOTE(review): the recorded cross-validation fit ended in KeyboardInterrupt,
# so `cv_model` may be undefined when this cell runs.
cv_model.avgMetrics

In [None]:
# Score the validation part with the cross-validated model.
# NOTE(review): depends on `cv_model`, whose recorded fit was interrupted.
predictions_valid = cv_model.transform(valid_data)

In [None]:
# Area under ROC of the cross-validated model on the validation part.
evaluator.evaluate(predictions_valid)

In [None]:
# Release the cached training features.
# NOTE(review): a later cell calls `gbt.fit(train_data)` again; if this cell has
# run first, that frame is no longer cached unless it is cached again.
train_data.unpersist()

In [66]:
from pyspark.ml.classification import GBTClassifier

# Final model with fixed hyper-parameters, all taken from the values listed in
# the search grid (the recorded cross-validation run was interrupted).
gbt = GBTClassifier(labelCol="purchase", maxDepth=4, minInstancesPerNode=3, maxBins=50)

# An earlier cell unpersists `train_data`.  Boosting makes repeated passes over
# its input, so make sure the frame is cached before fitting.
train_data.cache()

gbt_model = gbt.fit(train_data)
predictions_valid = gbt_model.transform(valid_data)

In [67]:
# Area under the ROC curve of the final model on the held-out validation part.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol="purchase",
)
score = evaluator.evaluate(predictions_valid)
score

0.885847373338315

In [68]:
# Apply the final GBT model to the assembled test features.
test_predictions = gbt_model.transform(test_data)

In [69]:
from pyspark.sql.functions import udf

# Take element 1 of the `probability` vector (the score for class 1) on the
# executors, so only plain doubles are collected to the driver.  This replaces
# collecting vector objects and running a row-wise pandas `apply`.
positive_probability = udf(lambda vector: float(vector[1]), DoubleType())

predictions_pd = (
    test_predictions
    .select("user_id", "item_id", positive_probability(col("probability")).alias("purchase"))
    .toPandas()
    .sort_values(by=['user_id', 'item_id'])
)

# Submission file: user_id, item_id, purchase, with no index column.
predictions_pd.to_csv('lab03.csv', index=False)

In [70]:
# Shut down the Spark session once the submission file has been written.
spark.stop()