In [1]:
import os
import sys
# Environment for the PySpark launcher. These paths and the executor count are
# specific to the cluster this notebook was written for.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'
spark_home = os.environ.get('SPARK_HOME', None)

if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the Spark-bundled pyspark and py4j packages importable ahead of anything
# else on sys.path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell bootstrap in this namespace; per the banner it
# prints, this makes the session available as `spark`. The file is opened in a
# `with` block so the handle is closed deterministically instead of leaking.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json
from pyspark import SparkContext
# Default Spark configuration; nothing is overridden here.
conf = SparkConf()

# Configure the builder step by step (conf first, then the app name, same
# order as before) and fetch the active session or create a new one.
session_builder = SparkSession.builder.config(conf=conf)
session_builder = session_builder.appName("test")
spark = session_builder.getOrCreate()

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,ByteType
# Explicit schema for the train/test CSVs: two integer ids plus a byte-sized
# purchase column. Fields are nullable (the StructType.add default).
schema = (
    StructType()
    .add("user_id", IntegerType())
    .add("item_id", IntegerType())
    .add("purchase", ByteType())
)

In [4]:
# Train and test share the explicit (user_id, item_id, purchase) schema.
df_train = spark.read.schema(schema).options(header=True).csv('/labs/slaba03/laba03_train.csv')
test = spark.read.schema(schema).options(header=True).csv('/labs/slaba03/laba03_test.csv')

# Auxiliary tables are read without an explicit schema; the items file is
# tab-separated.
df_views = spark.read.options(header=True).csv('/labs/slaba03/laba03_views_programmes.csv')
df_items = spark.read.options(header=True, sep='\t').csv('/labs/slaba03/laba03_items.csv')

In [5]:
# Stratified sample on the label: keep 80% of each purchase class for training,
# with a fixed seed.
sampling_fractions = {0: 0.8, 1: 0.8}
train = df_train.sampleBy("purchase", fractions=sampling_fractions, seed=5757)

# Validation set = every (user_id, item_id) pair of df_train that was not
# sampled into train.
valid = df_train.join(train, how="leftanti", on=["user_id", "item_id"])

In [6]:

from pyspark.sql.functions import col

# Per-user sum of the purchase column over the training sample, cached because
# it is joined into several frames later.
train_choice = (
    train
    .groupBy("user_id")
    .agg(F.sum("purchase").alias("user_choice"))
    .select("user_choice", "user_id")
    .cache()
)



In [7]:
# Per-item sum of the purchase column over the training sample, cached because
# it is joined into several frames later.
item_choice = (
    train
    .groupBy("item_id")
    .agg(F.sum("purchase").alias("item_choice"))
    .select("item_choice", "item_id")
    .cache()
)



In [8]:
def _attach_choice_counts(frame):
    """Left-join the per-user, then the per-item purchase sums onto `frame`."""
    with_user = frame.join(train_choice, on='user_id', how='left')
    return with_user.join(item_choice, on='item_id', how='left')


# All three frames receive the same train-derived columns.
train, valid, test = map(_attach_choice_counts, (train, valid, test))

In [9]:
# Number of training rows per user, cached for the joins that follow.
train_user_attempts = (
    train
    .groupBy('user_id')
    .count()
    .withColumnRenamed("count", "user_attempts")
    .select("user_attempts", "user_id")
    .cache()
)

# Number of training rows per item, cached for the joins that follow.
train_item_attempts = (
    train
    .groupBy('item_id')
    .count()
    .withColumnRenamed("count", "item_attempts")
    .select("item_attempts", "item_id")
    .cache()
)

train_user_attempts.show(2)

+-------------+-------+
|user_attempts|user_id|
+-------------+-------+
|         2089| 754230|
|         2059| 780033|
+-------------+-------+
only showing top 2 rows



In [10]:
def _attach_attempt_rates(frame):
    """Join the train-derived attempt counts onto `frame` and add rate columns.

    The user counts are joined first, then the item counts. `user_addict` and
    `item_addict` are the purchase sums divided by the matching attempt counts.
    """
    with_counts = (
        frame
        .join(train_user_attempts, on='user_id', how='left')
        .join(train_item_attempts, on='item_id', how='left')
    )
    return (
        with_counts
        .withColumn('user_addict', col('user_choice') / col('user_attempts'))
        .withColumn('item_addict', col('item_choice') / col('item_attempts'))
    )


# Apply the identical enrichment to all three frames.
train, valid, test = map(_attach_attempt_rates, (train, valid, test))

In [11]:
# Preview the training frame with the joined count and rate columns.
train.show()

+-------+-------+--------+-----------+-----------+-------------+-------------+--------------------+-------------------+
|item_id|user_id|purchase|user_choice|item_choice|user_attempts|item_attempts|         user_addict|        item_addict|
+-------+-------+--------+-----------+-----------+-------------+-------------+--------------------+-------------------+
|   8389| 556825|       0|          6|          5|         2063|         1061| 0.00290838584585555|0.00471253534401508|
|   8389| 566701|       0|         10|          5|         2104|         1061|0.004752851711026616|0.00471253534401508|
|   8389| 613775|       0|          1|          5|         2044|         1061|4.892367906066536E-4|0.00471253534401508|
|   8389| 619378|       0|          0|          5|         2051|         1061|                 0.0|0.00471253534401508|
|   8389| 625678|       0|         13|          5|         2046|         1061| 0.00635386119257087|0.00471253534401508|
|   8389| 632495|       0|          6|  

In [12]:
# Preview the validation frame with the joined count and rate columns.
valid.show()

+-------+-------+--------+-----------+-----------+-------------+-------------+--------------------+--------------------+
|item_id|user_id|purchase|user_choice|item_choice|user_attempts|item_attempts|         user_addict|         item_addict|
+-------+-------+--------+-----------+-----------+-------------+-------------+--------------------+--------------------+
|   7679|   1654|       0|          4|          1|         2014|         1062|0.001986097318768...|9.416195856873823E-4|
|  67318|   1654|       0|          4|          1|         2014|         1093|0.001986097318768...|9.149130832570906E-4|
|  10820| 510087|       0|          4|          1|         2026|         1075|0.001974333662388...|9.302325581395349E-4|
|  67040| 510087|       0|          4|          0|         2026|         1098|0.001974333662388...|                 0.0|
|  72905| 510087|       0|          4|          0|         2026|         1059|0.001974333662388...|                 0.0|
|  77562| 510087|       0|      

In [13]:
# Preview the test frame with the joined count and rate columns.
test.show()

+-------+-------+--------+-----------+-----------+-------------+-------------+--------------------+--------------------+
|item_id|user_id|purchase|user_choice|item_choice|user_attempts|item_attempts|         user_addict|         item_addict|
+-------+-------+--------+-----------+-----------+-------------+-------------+--------------------+--------------------+
|  94814|   1654|    null|          4|          0|         2014|         1097|0.001986097318768...|                 0.0|
|  93629|   1654|    null|          4|          4|         2014|         1099|0.001986097318768...|0.003639672429481347|
|   9980|   1654|    null|          4|          1|         2014|         1089|0.001986097318768...|9.182736455463728E-4|
|  95099|   1654|    null|          4|          0|         2014|         1113|0.001986097318768...|                 0.0|
|  11265|   1654|    null|          4|          6|         2014|         1091|0.001986097318768...|0.005499541704857928|
|  88896|   1654|    null|      

In [14]:
# Release the cached aggregate frames now that they have been joined in.
for cached_frame in (train_choice, item_choice, train_user_attempts):
    cached_frame.unpersist()
# Kept as the cell's trailing expression so its return value is still displayed.
train_item_attempts.unpersist()

DataFrame[item_attempts: bigint, item_id: int]

In [15]:
from pyspark.ml.feature import VectorAssembler
# Columns packed into the model's input vector.
features = ['item_choice', 'user_choice', 'user_addict', 'item_addict']
vecassembler = VectorAssembler(inputCols=features, outputCol="features")

# valid and test were LEFT-joined against aggregates computed on train only, so a
# user or item absent from the training sample yields null feature values.
# VectorAssembler raises on null inputs by default, so nulls in the feature
# columns are replaced with 0 ("nothing observed in train") before assembling.
# Rows that already have values are unaffected.
train_data = vecassembler.transform(train.fillna(0, subset=features)).cache()
valid_data = vecassembler.transform(valid.fillna(0, subset=features))
test_data = vecassembler.transform(test.fillna(0, subset=features))

In [16]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees on the assembled features, predicting `purchase`.
gbt_params = {
    "labelCol": "purchase",
    "maxDepth": 4,
    "minInstancesPerNode": 3,
    "maxBins": 50,
}
gbt = GBTClassifier(**gbt_params)

# Fit on the training vectors, then score both the training and validation sets.
gbt_model = gbt.fit(train_data)
predictions_train, predictions_valid = (
    gbt_model.transform(frame) for frame in (train_data, valid_data)
)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve against the `purchase` label.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol="purchase")

# Validation is evaluated first, then training, as before.
score_v, score_tr = (
    evaluator.evaluate(scored) for scored in (predictions_valid, predictions_train)
)

In [21]:
# Training AUC on the first line, validation AUC on the second.
print(score_tr, score_v, sep="\n")

0.9354502642651171
0.8930571310198385


In [22]:
# Score the assembled test features with the fitted GBT model.
test_predictions = gbt_model.transform(test_data)

In [23]:
# Collect ids and the probability vector to pandas, ordered by user then item.
predictions_pd = (
    test_predictions
    .select("user_id", "item_id", col("probability").alias("purchase"))
    .toPandas()
    .sort_values(by=['user_id', 'item_id'])
)

# Keep only element 1 of each probability vector as the submitted score.
predictions_pd['purchase'] = predictions_pd['purchase'].map(lambda prob: prob[1])

predictions_pd.to_csv('lab03.csv', index=False)

In [24]:
# Shut down the Spark session; no Spark work can run after this cell.
spark.stop()