In [1]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# how the driver should be submitted (three executors).
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Make the bundled pyspark and py4j importable; the py4j archive ends up first on sys.path.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Name the application so it can be identified, then start (or reuse) the session.
conf = SparkConf().setAppName("pankov lab03")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
from pyspark.sql.types import DoubleType

In [4]:
import pyspark.sql.functions as f

In [43]:
# Load the four lab inputs. No schema is supplied, so every column is read as a string.
# The items file is tab-separated; the other three are comma-separated.
items = spark.read.options(sep='\t', header=True).csv('/labs/slaba03/laba03_items.csv')
test = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_test.csv')
train = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_train.csv')
views_programmes = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_views_programmes.csv')

In [6]:
# Cache the items table for reuse and peek at one row.
# first() fetches a single row instead of collecting the whole table to the driver.
items = items.cache()
items.first()

Row(item_id='65667', channel_id=None, datetime_availability_start='1970-01-01T00:00:00Z', datetime_availability_stop='2018-01-01T00:00:00Z', datetime_show_start=None, datetime_show_stop=None, content_type='1', title='на пробах только девушки (all girl auditions)', year='2013.0', genres='Эротика', region_id=None)

In [44]:
# Cache the test pairs for reuse and peek at one row.
# first() fetches a single row instead of collecting the whole table to the driver.
test = test.cache()
test.first()

Row(user_id='1654', item_id='94814', purchase=None)

In [8]:
# Cache the training pairs for reuse and peek at one row.
# first() fetches a single row; collecting the multi-million-row training set
# to the driver just to look at one record is wasteful and can exhaust driver memory.
train = train.cache()
train.first()

Row(user_id='1654', item_id='74107', purchase='0')

In [9]:
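# Unused hold-out split, kept for reference only (the model below is scored with cross-validation):
#(trainingData, testData) = train.randomSplit([0.7, 0.3])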

In [10]:
# Enrich each (user, item) training pair with the item's metadata.
# Inner join: training rows whose item_id is missing from `items` are dropped.
joined_df = train.join(items, on='item_id', how='inner')

In [11]:
# Cache the joined frame: the cells below reuse it several times.
joined_df = joined_df.cache()

In [12]:
# Preview the first rows of the joined train/items frame.
joined_df.show()

+-------+-------+--------+----------+---------------------------+--------------------------+-------------------+------------------+------------+-------+------+-------+---------+
|item_id|user_id|purchase|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|  title|  year| genres|region_id|
+-------+-------+--------+----------+---------------------------+--------------------------+-------------------+------------------+------------+-------+------+-------+---------+
| 100140| 903319|       0|      null|       1970-01-01T00:00:00Z|      2099-12-31T21:00:00Z|               null|              null|           1|поездка|2014.0|Комедии|     null|
| 100140| 903337|       0|      null|       1970-01-01T00:00:00Z|      2099-12-31T21:00:00Z|               null|              null|           1|поездка|2014.0|Комедии|     null|
| 100140| 903348|       0|      null|       1970-01-01T00:00:00Z|      2099-12-31T21:00:00Z|               nul

In [13]:
# Inspect column types of the joined frame.
joined_df.printSchema()

root
 |-- item_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- purchase: string (nullable = true)
 |-- channel_id: string (nullable = true)
 |-- datetime_availability_start: string (nullable = true)
 |-- datetime_availability_stop: string (nullable = true)
 |-- datetime_show_start: string (nullable = true)
 |-- datetime_show_stop: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: string (nullable = true)



In [14]:
# Number of training rows that matched an item in the join.
joined_df.count()

5032624

In [15]:
# Class balance of the target: number of rows per `purchase` value.
joined_df.groupby('purchase').count().show()

+--------+-------+
|purchase|  count|
+--------+-------+
|       0|5021720|
|       1|  10904|
+--------+-------+



In [16]:
def prepare_dataset(df):
    """Compute per-item and per-user purchase totals and their share of all purchases.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame with ``item_id``, ``user_id`` and ``purchase`` columns.
        ``purchase`` is summed, so it must be numeric or a numeric string.

    Returns
    -------
    tuple of (pyspark.sql.DataFrame, pyspark.sql.DataFrame)
        ``item_total_buys`` with columns ``item_id``, ``item_total_buys``,
        ``purchase_item_dolya`` and ``user_total_buys`` with columns ``user_id``,
        ``user_total_buys``, ``purchase_user_dolya``. The ``*_dolya`` ("share")
        columns are doubles: the group's purchases divided by the overall
        purchase count, or null when that overall count is zero or null.
    """
    purchase = f.col('purchase').cast('double')

    # Overall number of purchases; this runs a Spark job and brings one scalar to the driver.
    total_purchase = df.agg(f.sum(purchase)).collect()[0][0]
    total = f.lit(total_purchase).cast('double')

    item_total_buys = df.groupby(['item_id']).agg(f.sum(purchase).alias('item_total_buys'))
    user_total_buys = df.groupby(['user_id']).agg(f.sum(purchase).alias('user_total_buys'))

    # Native column division instead of a Python UDF: the result is a double column
    # (an untyped UDF yields strings), no rows are shipped to Python workers, and a
    # zero or missing total produces null rather than raising inside the UDF.
    item_total_buys = item_total_buys.withColumn(
        'purchase_item_dolya', f.col('item_total_buys') / total)
    user_total_buys = user_total_buys.withColumn(
        'purchase_user_dolya', f.col('user_total_buys') / total)
    return item_total_buys, user_total_buys

In [17]:
# Build the per-item and per-user purchase statistics from the joined training data.
item_purchase, user_purchase = prepare_dataset(joined_df)

In [18]:
# Preview the per-item purchase statistics.
item_purchase.show()

+-------+---------------+--------------------+
|item_id|item_total_buys| purchase_item_dolya|
+-------+---------------+--------------------+
| 100140|            0.0|                 0.0|
| 100263|            1.0|9.170946441672781E-5|
| 100735|            1.0|9.170946441672781E-5|
|   1159|            2.0|1.834189288334556...|
|   2136|            1.0|9.170946441672781E-5|
|  60351|            2.0|1.834189288334556...|
|    691|            0.0|                 0.0|
|  74605|            0.0|                 0.0|
|   7711|            1.0|9.170946441672781E-5|
|  77371|            1.0|9.170946441672781E-5|
|  81824|            3.0|2.751283932501834E-4|
|  88649|            2.0|1.834189288334556...|
|  93545|            2.0|1.834189288334556...|
|  97128|            2.0|1.834189288334556...|
|  98725|            1.0|9.170946441672781E-5|
| 100402|            1.0|9.170946441672781E-5|
|  11205|            9.0|8.253851797505503E-4|
|  11236|            2.0|1.834189288334556...|
|   4975|    

In [19]:
# Preview the per-user purchase statistics.
user_purchase.show()

+-------+---------------+--------------------+
|user_id|user_total_buys| purchase_user_dolya|
+-------+---------------+--------------------+
| 921852|            1.0|9.170946441672781E-5|
| 927169|            9.0|8.253851797505503E-4|
| 929499|            7.0|6.419662509170946E-4|
| 930508|            7.0|6.419662509170946E-4|
| 867363|            1.0|9.170946441672781E-5|
| 882935|            2.0|1.834189288334556...|
| 889974|            0.0|                 0.0|
| 891250|            0.0|                 0.0|
| 902451|            1.0|9.170946441672781E-5|
| 748042|            4.0|3.668378576669112...|
| 837166|            1.0|9.170946441672781E-5|
| 855465|            4.0|3.668378576669112...|
| 905618|            0.0|                 0.0|
| 781373|            1.0|9.170946441672781E-5|
| 830843|            1.0|9.170946441672781E-5|
| 906395|           25.0|0.002292736610418...|
| 920785|            0.0|                 0.0|
| 922481|           14.0|0.001283932501834...|
| 938008|    

In [20]:
# Attach the item-level and user-level purchase statistics to every training pair.
# Left joins keep all training rows even if a key has no statistics.
train = (
    train
    .join(item_purchase, on='item_id', how='left')
    .join(user_purchase, on='user_id', how='left')
    .cache()
)

In [21]:
# The share columns and the label must be doubles before they are fed to the model.
for numeric_col in ('purchase_item_dolya', 'purchase_user_dolya', 'purchase'):
    train = train.withColumn(numeric_col, f.col(numeric_col).cast(DoubleType()))

In [22]:
# Preview the training frame with the purchase statistics attached.
train.show()

+-------+-------+--------+---------------+--------------------+---------------+--------------------+
|user_id|item_id|purchase|item_total_buys| purchase_item_dolya|user_total_buys| purchase_user_dolya|
+-------+-------+--------+---------------+--------------------+---------------+--------------------+
| 867363| 100277|     0.0|            0.0|                 0.0|            1.0|9.170946441672781E-5|
| 867363|   1870|     0.0|            1.0|9.170946441672781E-5|            1.0|9.170946441672781E-5|
| 867363|   2696|     0.0|            1.0|9.170946441672781E-5|            1.0|9.170946441672781E-5|
| 867363|  66998|     0.0|            2.0|1.834189288334556...|            1.0|9.170946441672781E-5|
| 867363|  74106|     0.0|            6.0|5.502567865003668E-4|            1.0|9.170946441672781E-5|
| 867363|  74415|     0.0|            7.0|6.419662509170946E-4|            1.0|9.170946441672781E-5|
| 867363|  89081|     0.0|            1.0|9.170946441672781E-5|            1.0|9.1709464416

In [23]:
# Confirm the label and feature columns are now doubles.
train.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- purchase: double (nullable = true)
 |-- item_total_buys: double (nullable = true)
 |-- purchase_item_dolya: double (nullable = true)
 |-- user_total_buys: double (nullable = true)
 |-- purchase_user_dolya: double (nullable = true)



In [24]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [25]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
# Every column except the label and the two id columns is used as a model feature.
featuresCols = list(train.columns)
for excluded_col in ('purchase', 'user_id', 'item_id'):
    featuresCols.remove(excluded_col)

# Concatenate all feature columns into a single vector column named "features".
vectorAssembler = VectorAssembler(outputCol="features", inputCols=featuresCols)

# Categorical indexing is currently disabled (it expected a "rawFeatures" input column):
#vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=4)

In [27]:
# Gradient-boosted trees classifier: reads the assembled "features" vector and learns "purchase".
gbt = GBTClassifier(featuresCol="features", labelCol="purchase")

In [28]:
# Single-point grid (maxDepth=2, maxIter=10): only this one configuration is evaluated.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2])
    .addGrid(gbt.maxIter, [10])
    .build()
)

In [29]:
# Score models by area under the ROC curve (the evaluator's default metric, spelled out here).
evaluator = BinaryClassificationEvaluator(labelCol='purchase', metricName='areaUnderROC')

In [30]:
# Assemble the feature vector and keep only what the estimator needs: label and features.
assembled_train = vectorAssembler.transform(train)
train = assembled_train.select("purchase", "features")

In [31]:
# 3-fold cross-validation over the parameter grid.
# A fixed seed pins the random fold assignment so the reported metric can be
# reproduced after a kernel restart; without it the split may differ between runs.
crossval = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

# Fit on the assembled training data; `model` wraps the best model refit on all of it.
model = crossval.fit(train)


In [32]:
# Evaluator metric averaged over the folds for the first (and here the only) parameter combination.
model.avgMetrics[0]

0.8835015406751885

In [45]:
# Attach the same train-derived item and user statistics to the test pairs.
# Left joins keep every test row even if its item or user has no statistics.
test_df = (
    test
    .join(item_purchase, on='item_id', how='left')
    .join(user_purchase, on='user_id', how='left')
    .cache()
)

In [46]:
# Mirror the training-side casts so the test frame has the same column types.
for numeric_col in ('purchase_item_dolya', 'purchase_user_dolya', 'purchase'):
    test_df = test_df.withColumn(numeric_col, f.col(numeric_col).cast(DoubleType()))

In [47]:
# Preview the test frame with the purchase statistics attached.
test_df.show()

+-------+-------+--------+---------------+--------------------+---------------+--------------------+
|user_id|item_id|purchase|item_total_buys| purchase_item_dolya|user_total_buys| purchase_user_dolya|
+-------+-------+--------+---------------+--------------------+---------------+--------------------+
| 867363| 100263|    null|            1.0|9.170946441672781E-5|            1.0|9.170946441672781E-5|
| 867363| 100735|    null|            1.0|9.170946441672781E-5|            1.0|9.170946441672781E-5|
| 867363|   1159|    null|            2.0|1.834189288334556...|            1.0|9.170946441672781E-5|
| 867363|  74605|    null|            0.0|                 0.0|            1.0|9.170946441672781E-5|
| 867363|  81824|    null|            3.0|2.751283932501834E-4|            1.0|9.170946441672781E-5|
| 867363|  88649|    null|            2.0|1.834189288334556...|            1.0|9.170946441672781E-5|
| 867363|  98725|    null|            1.0|9.170946441672781E-5|            1.0|9.1709464416

In [49]:
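# Superseded attempt, kept for reference only (the test features are assembled inline when predicting):
#test_df = (vectorAssembler.transform(test).select("purchase", "features"))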

In [61]:
# Users or items that never occur in the training data get null statistics from the
# left joins, and VectorAssembler raises on null inputs. Such pairs have no recorded
# purchases, so 0 is the matching value for the count and share features.
feature_cols = vectorAssembler.getInputCols()
test_features = vectorAssembler.transform(test_df.fillna(0.0, subset=feature_cols))

# Score every test pair with the cross-validated model, keeping the ids for the submission.
predictions = model.transform(test_features.select(['user_id', 'item_id', "features"]))

In [71]:
# Inspect the columns the model added to the scored test pairs.
predictions.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [86]:
# Both ids were read from CSV as strings. Cast both to int so the later sort on
# (user_id, item_id) is numeric; a string user_id would sort lexicographically
# (e.g. "100000" before "1654").
for id_col in ('user_id', 'item_id'):
    predictions = predictions.withColumn(id_col, f.col(id_col).cast('int'))

In [89]:
# Keep the ids and class probabilities only, ordered ascending by user and then by item.
predictions = (
    predictions
    .select('user_id', 'item_id', 'probability')
    .orderBy('user_id', 'item_id', ascending=True)
)

In [90]:
# Preview the sorted predictions with their probability vectors.
predictions.show()

+-------+-------+--------------------+
|user_id|item_id|         probability|
+-------+-------+--------------------+
|   1654|    336|[0.93374288829826...|
|   1654|    678|[0.93374288829826...|
|   1654|    691|[0.93374288829826...|
|   1654|    696|[0.93374288829826...|
|   1654|    763|[0.93374288829826...|
|   1654|    795|[0.93343052812923...|
|   1654|    861|[0.93374288829826...|
|   1654|   1137|[0.93374288829826...|
|   1654|   1159|[0.93374288829826...|
|   1654|   1428|[0.93374288829826...|
|   1654|   1685|[0.93374288829826...|
|   1654|   1686|[0.93374288829826...|
|   1654|   1704|[0.93374288829826...|
|   1654|   2093|[0.93374288829826...|
|   1654|   2343|[0.93374288829826...|
|   1654|   2451|[0.93374288829826...|
|   1654|   2469|[0.93280588912935...|
|   1654|   2603|[0.93374288829826...|
|   1654|   2609|[0.93374288829826...|
|   1654|   2621|[0.93374288829826...|
+-------+-------+--------------------+
only showing top 20 rows



In [94]:
# Element 1 of the probability vector is the probability of the positive class (purchase == 1).
# The UDF declares DoubleType explicitly: without a return type f.udf defaults to
# StringType and the score would be stored as text.
positive_probability = f.udf(lambda x: float(x[1]), DoubleType())
predictions = (
    predictions
    .withColumn('purchase', positive_probability('probability'))
    .select(['user_id', 'item_id', 'purchase'])
)

In [95]:
# Preview the final submission columns: user_id, item_id and the purchase probability.
predictions.show()

+-------+-------+-------------------+
|user_id|item_id|           purchase|
+-------+-------+-------------------+
|   1654|    336|0.06625711170173343|
|   1654|    678|0.06625711170173343|
|   1654|    691|0.06625711170173343|
|   1654|    696|0.06625711170173343|
|   1654|    763|0.06625711170173343|
|   1654|    795|0.06656947187076212|
|   1654|    861|0.06625711170173343|
|   1654|   1137|0.06625711170173343|
|   1654|   1159|0.06625711170173343|
|   1654|   1428|0.06625711170173343|
|   1654|   1685|0.06625711170173343|
|   1654|   1686|0.06625711170173343|
|   1654|   1704|0.06625711170173343|
|   1654|   2093|0.06625711170173343|
|   1654|   2343|0.06625711170173343|
|   1654|   2451|0.06625711170173343|
|   1654|   2469|0.06719411087064164|
|   1654|   2603|0.06625711170173343|
|   1654|   2609|0.06625711170173343|
|   1654|   2621|0.06625711170173343|
+-------+-------+-------------------+
only showing top 20 rows



In [100]:
# Bring the predictions to the driver as a pandas frame and write the submission file
# (path is relative to the notebook's working directory; the pandas index is not written).
submission_pdf = predictions.toPandas()
submission_pdf.to_csv('../../lab03.csv', index=False)

In [101]:
# Stop the Spark session now that the work is finished.
spark.stop()