In [1]:
import glob
import os
import sys

# Point PySpark at the executor interpreter, the Spark installation and the
# launch arguments. These must be set before `pyspark` is imported.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]
spark_python_dir = os.path.join(spark_home, 'python')

# The py4j archive name changes with every Spark release, so look it up instead
# of pinning one version. If nothing is found, fall back to the historically
# used 0.10.7 path so behaviour on the original cluster is unchanged.
py4j_candidates = sorted(
    glob.glob(os.path.join(spark_python_dir, 'lib', 'py4j-*-src.zip'))
)
if py4j_candidates:
    py4j_archive = py4j_candidates[-1]
else:
    py4j_archive = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')

# Same resulting order as before: py4j archive first, then Spark's python dir.
sys.path.insert(0, spark_python_dir)
sys.path.insert(0, py4j_archive)

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Single source of truth for the application name. Previously the SparkConf
# said "lab02" while the builder's appName said "lab03"; the builder value
# overrode the conf entry, so "lab03" is the name that was actually in effect.
APP_NAME = "Sabanov Denis Spark Dataframe test lab03"

conf = SparkConf()
conf.set("spark.app.name", APP_NAME)

# Reuse an existing session if one is already running in this kernel.
spark = SparkSession.builder.config(conf=conf).appName(APP_NAME).getOrCreate()

In [3]:
from pyspark.sql import types as T
import pyspark.sql.functions as F
import numpy as np
import json


In [4]:
# Explicit schema for the training CSV: three integer columns.
trainSchema = T.StructType([
    T.StructField(column, T.IntegerType())
    for column in ("user_id", "item_id", "purchase")
])

In [5]:
# Load the training interactions from CSV using the explicit schema
# (the first line of the file is a header).
dfTrain = (
    spark.read
    .format("csv")
    .schema(trainSchema)
    .option('header', 'True')
    .load("/labs/slaba03/laba03_train.csv")
)

In [6]:
# Explicit schema for the test CSV: the same id columns, no `purchase` label.
testSchema = T.StructType([
    T.StructField(column, T.IntegerType())
    for column in ("user_id", "item_id")
])

In [7]:
# Load the (user_id, item_id) pairs to score from CSV using the explicit
# schema (the first line of the file is a header).
dfTest = (
    spark.read
    .format("csv")
    .schema(testSchema)
    .option('header', 'True')
    .load("/labs/slaba03/laba03_test.csv")
)

In [9]:
# Per-user mean of `purchase` over the full training set.
userPurchase = dfTrain.groupBy('user_id').agg(
    F.mean('purchase').alias("user_purchase")
)

In [10]:
# Per-item mean of `purchase` over the full training set.
itemPurchase = dfTrain.groupBy('item_id').agg(
    F.mean('purchase').alias("item_purchase")
)

In [11]:
from pyspark.ml.feature import VectorAssembler

def createDataset(dfTrain):
    """Attach purchase-rate features and a `features` vector to a DataFrame.

    The input is left-joined with the module-level `userPurchase` (on
    `user_id`) and `itemPurchase` (on `item_id`) aggregates, nulls are
    replaced with 0, and a VectorAssembler packs `user_id`, `item_id`,
    `user_purchase` and `item_purchase` into a `features` column.

    :param dfTrain: DataFrame with `user_id` and `item_id` columns; despite
        the name it is also called with the test frame.
    :return: the joined DataFrame with the extra `features` column.
    """
    feature_columns = ['user_id', 'item_id', "user_purchase", 'item_purchase']

    enriched = dfTrain.join(userPurchase, on=['user_id'], how='left')
    enriched = enriched.join(itemPurchase, on=['item_id'], how='left')

    # Rows with no match in the aggregates come back null after the left
    # joins; fillna(0) replaces those (and any other null values) with 0.
    enriched = enriched.fillna(0)

    vectorizer = VectorAssembler(inputCols=feature_columns, outputCol="features")
    return vectorizer.transform(enriched)

In [12]:
# Build the modelling dataset from a seeded 10% sample without replacement,
# capped at 300000 rows.
data = createDataset(
    dfTrain
    .sample(withReplacement=False, fraction=0.1, seed=0)
    .limit(300000)
)

In [13]:
# Peek at one assembled row, printed vertically with long values truncated.
data.show(n=1, truncate=True, vertical=True)

-RECORD 0-----------------------------
 item_id       | 8389                 
 user_id       | 754230               
 purchase      | 0                    
 user_purchase | 0.027575641516660282 
 item_purchase | 0.005979073243647235 
 features      | [754230.0,8389.0,... 
only showing top 1 row



In [14]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled `features` vector, predicting `purchase`.
lr = LogisticRegression(
    maxIter=10,
    labelCol='purchase',
    featuresCol="features",
)

In [15]:
# Fraction of `data` that goes to the training split; the rest is held out.
share = 0.8

# Weights are given as [hold-out, train], so unpack in that same order.
split_weights = [1 - share, share]
test, train = data.randomSplit(split_weights, seed=12345)

In [16]:
# Fit on the training split only, so the hold-out split can be used for evaluation.
lrModel = lr.fit(dataset=train)

In [17]:
# Score the hold-out split and show one prediction in full (untruncated, vertical).
predictions = lrModel.transform(test)

preview_columns = ["user_id", "purchase", "prediction", "probability"]
predictions.select(*preview_columns).show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------
 user_id     | 751096                                     
 purchase    | 0                                          
 prediction  | 0.0                                        
 probability | [0.9956913577825055,0.0043086422174944215] 
only showing top 1 row



In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve on the hold-out predictions, scored from the
# `probability` column against the `purchase` label.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol="purchase",
    rawPredictionCol="probability",
)
evaluator.evaluate(predictions)

0.887069643673311

In [19]:
# Refit on the whole sampled dataset (train + hold-out) before scoring the test file.
# NOTE: this rebinds `lrModel`, replacing the model fitted on `train` above.
lrModel = lr.fit(dataset=data)

In [20]:
# Build the same feature set for the test pairs and preview the first four rows.
dataTest = createDataset(dfTest)

dataTest.show(n=4)

+-------+-------+--------------------+--------------------+--------------------+
|item_id|user_id|       user_purchase|       item_purchase|            features|
+-------+-------+--------------------+--------------------+--------------------+
|   8389| 886063|0.002698535080956...|0.005979073243647235|[886063.0,8389.0,...|
|   8389| 900335|0.004615384615384616|0.005979073243647235|[900335.0,8389.0,...|
|   8389| 936359|7.613247049866769E-4|0.005979073243647235|[936359.0,8389.0,...|
|   8389| 901323|3.846153846153846E-4|0.005979073243647235|[901323.0,8389.0,...|
+-------+-------+--------------------+--------------------+--------------------+
only showing top 4 rows



In [21]:
# Score the test pairs with the refitted model (rebinds `predictions`).
predictions = lrModel.transform(dataset=dataTest)

In [22]:
# Keep only the id columns and the class-probability vector, ordered by
# (user_id, item_id) ascending.
predictions = (
    predictions
    .select("user_id", "item_id", "probability")
    .orderBy("user_id", "item_id", ascending=True)
)

In [23]:
# Collect the scored pairs to the driver as a pandas DataFrame.
# The `probability` column is a Spark ML vector, which Arrow cannot convert,
# so Spark falls back to the non-Arrow path (see the warning in the output).
result = predictions.toPandas()
result.head(5)

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


Unnamed: 0,user_id,item_id,probability
0,1654,336,"[0.9974725809391505, 0.002527419060849455]"
1,1654,678,"[0.99747615076008, 0.0025238492399200594]"
2,1654,691,"[0.9974762863557611, 0.0025237136442387183]"
3,1654,696,"[0.9971443482415624, 0.002855651758437547]"
4,1654,763,"[0.997320215147373, 0.002679784852627047]"


In [24]:
# Keep element 1 of each probability vector as `purchase`, then drop the
# vector column in place.
# This cell is not re-runnable: `probability` no longer exists after the first run.
result['purchase'] = result["probability"].map(lambda class_probs: class_probs[1])
result.drop("probability", axis=1, inplace=True)

In [25]:
# Sort by (user_id, item_id) ascending on the pandas side.
sort_keys = ["user_id", "item_id"]
result = result.sort_values(by=sort_keys, ascending=True)
result.head(5)

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.002527
1,1654,678,0.002524
2,1654,691,0.002524
3,1654,696,0.002856
4,1654,763,0.00268


In [26]:
# Write the final table to CSV. pandas' default index=True applies, so the
# DataFrame index is written as the first, unnamed column.
result.to_csv(path_or_buf='/data/home/denis.sabanov/lab03.csv')

In [27]:
# Stop the Spark session now that the results are written to disk.
spark.stop()