In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkContext,SparkConf

conf = SparkConf()
conf.set('spark.app.cinema',"lab03.kachetov")

spark = SparkSession.builder.config(conf=conf).appName("Spark SQL").getOrCreate()
sc = spark.sparkContext

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import *
from pyspark.ml.feature import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import classification_report, precision_score, roc_auc_score
import pyspark.sql.functions as f

In [4]:
schema = StructType([
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

In [5]:
train = spark.read.csv("/labs/slaba03/laba03_train.csv", header = True, schema = schema)
test = spark.read.csv("/labs/slaba03/laba03_test.csv", header = True, schema = schema)

In [6]:
train = train.join(train[['user_id', 'purchase']].groupBy('user_id').sum().select(col("sum(purchase)").alias("user_sum"), col("user_id")), on = 'user_id', how = 'inner')
test = test.join(train[['user_id', 'purchase']].groupBy('user_id').sum().select(col("sum(purchase)").alias("user_sum"), col("user_id")), on = 'user_id', how = 'left')

In [7]:
train = train.join(train[['item_id', 'purchase']].groupBy('item_id').sum().select(col("sum(purchase)").alias("item_sum"), col("item_id")), on = 'item_id', how = 'inner')
test = test.join(train[['item_id', 'purchase']].groupBy('item_id').sum().select(col("sum(purchase)").alias("item_sum"), col("item_id")), on = 'item_id', how = 'left')

In [8]:
train = train.join(train[['user_id', 'purchase']].groupBy('user_id').count().select(col("count").alias("user_count"), col("user_id")), on = 'user_id', how = 'inner')
test = test.join(train[['user_id', 'purchase']].groupBy('user_id').count().select(col("count").alias("user_count"), col("user_id")), on = 'user_id', how = 'left')

In [9]:
train = train.join(train[['item_id', 'purchase']].groupBy('item_id').count().select(col("count").alias("item_count"), col("item_id")), on = 'item_id', how = 'inner')
test = test.join(train[['item_id', 'purchase']].groupBy('item_id').count().select(col("count").alias("item_count"), col("item_id")), on = 'item_id', how = 'left')

In [10]:
train = train.withColumn('user_addict', col('user_sum')/col('user_count'))
test = test.withColumn('user_addict', col('user_sum')/col('user_count'))

In [11]:
train = train.withColumn('item_addict', col('item_sum')/col('item_count'))
test = test.withColumn('item_addict', col('item_sum')/col('item_count'))

In [12]:
test = test.na.fill(0)
train = train.na.fill(0)

In [13]:
cols = ['item_sum', 'user_sum','item_addict','user_addict']
assembler = VectorAssembler(inputCols=cols, outputCol="features")

raw_data = assembler.transform(train)
raw_data_test = assembler.transform(test)

In [14]:
raw_data = raw_data.repartition(100).cache()
raw_data_test = raw_data_test.repartition(100).cache()

In [15]:
lr = GBTClassifier(labelCol="purchase", maxDepth=4, minInstancesPerNode=2, maxBins=55)

lr_model = lr.fit(raw_data)
predictions = lr_model.transform(raw_data_test)

In [16]:
lr_model.featureImportances

SparseVector(4, {0: 0.3519, 1: 0.3377, 2: 0.1922, 3: 0.1183})

In [17]:
predictions_pd = predictions.select("user_id", "item_id", f.col("probability").alias("purchase")).toPandas()
predictions_pd = predictions_pd.sort_values(by=['user_id', 'item_id'])
predictions_pd['purchase'] = predictions_pd['purchase'].apply(lambda x: x[1])
predictions_pd.to_csv('lab03.csv', index=False)

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [18]:
sc.stop()