In [42]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [43]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.executor.instances", "6")
conf.set("spark.executor.cores", "3")
conf.set("spark.executor.memory", "9g")
conf.set("spark.driver.cores", "3")
conf.set("spark.driver.memory", "6g")

spark = SparkSession\
    .builder\
    .config(conf = conf)\
    .appName("Lab 03 Model").getOrCreate()

In [None]:
spark.stop()

In [44]:
import pandas as pd
import numpy as np

In [45]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, RegexTokenizer, CountVectorizer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.ml.linalg import VectorUDT

from pyspark.sql.functions import udf

In [46]:
schema = StructType([StructField("user_id", IntegerType()),
                    StructField("item_id", IntegerType()),
                    StructField("purchase", DoubleType())])

df_train = spark.read.csv('/labs/slaba03/laba03_train.csv', header= True, schema=schema)
test = spark.read.csv('/labs/slaba03/laba03_test.csv', header= True, schema=schema)

df_views = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header= True)
df_items = spark.read.csv('/labs/slaba03/laba03_items.csv', header= True, sep= '\t')

In [47]:
spark.conf.set("spark.sql.shuffle.partitions", 200)

In [48]:
train = df_train.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=5757)

In [49]:
valid = df_train.join(train, on=["user_id", "item_id"], how="leftanti")

In [50]:
# test = spark.read.csv('/labs/laba03/lab10_test.csv', header= True, schema=schema)
test = spark.read.csv('/labs/slaba03/laba03_test.csv', header= True, schema=schema)

In [51]:
train_purchases = train\
    .groupBy('user_id')\
    .sum()\
    .select(col("sum(purchase)").alias("user_purchases"), col("user_id"))\
    .cache()

In [52]:
item_purchases = train\
    .groupBy('item_id')\
    .sum()\
    .select(col("sum(purchase)").alias("item_purchases"), col("item_id"))\
    .cache()

In [53]:
train = train\
    .join(train_purchases, on='user_id', how='left')\
    .join(item_purchases, on='item_id', how='left')

In [54]:
valid = valid\
    .join(train_purchases, on='user_id', how='left')\
    .join(item_purchases, on='item_id', how='left')

In [55]:
test = test\
    .join(train_purchases, on='user_id', how='left')\
    .join(item_purchases, on='item_id', how='left')

In [56]:
train_user_attempts = train\
    .groupBy('user_id')\
    .count()\
    .select(col("count").alias("user_attempts"), col("user_id"))\
    .cache()

In [57]:
train_item_attempts = train\
    .groupBy('item_id')\
    .count()\
    .select(col("count").alias("item_attempts"), col("item_id"))\
    .cache()

In [58]:
train = train\
    .join(train_user_attempts, on='user_id', how='left')\
    .join(train_item_attempts, on='item_id', how='left')\
    .withColumn('user_addict', col('user_purchases') / col('user_attempts'))\
    .withColumn('item_addict', col('item_purchases') / col('item_attempts'))\
    .na.fill(0)

In [59]:
valid = valid\
    .join(train_user_attempts, on='user_id', how='left')\
    .join(train_item_attempts, on='item_id', how='left')\
    .withColumn('user_addict', col('user_purchases') / col('user_attempts'))\
    .withColumn('item_addict', col('item_purchases') / col('item_attempts'))\
    .na.fill(0)    

In [60]:
test = test\
    .join(train_user_attempts, on='user_id', how='left')\
    .join(train_item_attempts, on='item_id', how='left')\
    .withColumn('user_addict', col('user_purchases') / col('user_attempts'))\
    .withColumn('item_addict', col('item_purchases') / col('item_attempts'))\
    .na.fill(0)    

In [61]:
train_purchases.unpersist()
item_purchases.unpersist()
train_user_attempts.unpersist()
train_item_attempts.unpersist()

DataFrame[item_attempts: bigint, item_id: int]

In [62]:
cols = ['item_purchases', 'user_purchases', 'user_addict', 'item_addict']
assembler = VectorAssembler(inputCols=cols, outputCol="features")

In [63]:
train_data = assembler.transform(train).cache()
valid_data = assembler.transform(valid)
test_data = assembler.transform(test)

In [64]:
gbt = GBTClassifier(labelCol="purchase")

pipeline = Pipeline(stages=[gbt])

In [65]:
evaluator = BinaryClassificationEvaluator(labelCol="purchase", metricName='areaUnderROC')

In [66]:
paramGrid = ParamGridBuilder().addGrid(gbt.maxDepth, [3, 4])\
                              .addGrid(gbt.minInstancesPerNode, [2, 3])\
                              .addGrid(gbt.maxBins, [50, 55])\
                              .build()

In [67]:
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=3, parallelism=3)

In [68]:
cv_model = crossval.fit(train_data)

In [70]:
predictions_valid = cv_model.transform(valid_data)

In [71]:
train_data.unpersist()

DataFrame[item_id: int, user_id: int, purchase: double, user_purchases: double, item_purchases: double, user_attempts: bigint, item_attempts: bigint, user_addict: double, item_addict: double, features: vector]

In [72]:
gbt = GBTClassifier(labelCol="purchase", maxDepth=4, minInstancesPerNode=3, maxBins=50)
gbt_model = gbt.fit(train_data)
predictions_valid = gbt_model.transform(valid_data)

In [97]:
predictions_pd = predictions_valid.select("user_id", "item_id", col("probability").alias("purchase")).toPandas()
predictions_pd = predictions_pd.sort_values(by=['user_id', 'item_id'])
predictions_pd['purchase'] = predictions_pd['purchase'].apply(lambda x: x[1])
predictions_pd.to_csv('/data/home/andrey.blednykh/lab03.csv', index=False)

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.
