In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("bea")
         .getOrCreate())

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import col, udf, lit, row_number, monotonically_increasing_id 
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql.window import Window
from pyspark.sql.functions import sum as _sum
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [4]:
@udf(returnType=StringType()) 
def firstword(v):
    if v is not None:
        return v.split(',')[0]
    else: return 'Пусто'

In [5]:
items = spark.read\
          .format("csv")\
          .option("header",True)\
          .option("delimiter","\t")\
          .load("/labs/slaba03/laba03_items.csv")
test = spark.read\
          .format("csv")\
          .option("header",True)\
          .load("/labs/slaba03/laba03_test.csv")
train = spark.read\
          .format("csv")\
          .option("header",True)\
          .load("/labs/slaba03/laba03_train.csv")
vp = spark.read\
          .format("csv")\
          .option("header",True)\
          .load("/labs/slaba03/laba03_views_programmes.csv")


In [27]:

train1 = train.withColumn("label", col("purchase").cast("int"))\
              .withColumn("user_sum", f.sum("label").over(Window.partitionBy("user_id")))\
              .withColumn("item_sum", f.sum("label").over(Window.partitionBy("item_id")))\
              .withColumn("user_avg", f.avg("label").over(Window.partitionBy("user_id")))\
              .withColumn("item_avg", f.avg("label").over(Window.partitionBy("item_id")))\
              .join(items, train.item_id ==  items.item_id, "left")\
              .select (train.user_id, train.item_id, "user_sum", "item_sum","user_avg", "item_avg", "label",\
                       col("content_type").cast("int"), col("year").cast("int"))\
              .cache()
                      

In [29]:
test1 = test.join(train1.groupBy("user_id").max("user_avg", "user_sum"), test.user_id ==  train1.user_id,"left")\
            .join(train1.groupBy("item_id").max("item_avg", "item_sum"), test.item_id ==  train1.item_id,"left")\
            .join(items, test.item_id ==  items.item_id, "left")\
            .select (test.user_id, test.item_id, col("max(user_avg)").alias("user_avg"), col("max(item_avg)").alias("item_avg"),\
                     col("max(user_sum)").alias("user_sum"), col("max(item_sum)").alias("item_sum"),\
                     test.purchase.cast("int"),\
                     col("content_type").cast("int"), col("year").cast("int"))\
            .cache()

In [30]:
vector_test = VectorAssembler(
    inputCols=[ "year", "content_type", "user_avg", "item_avg", "user_sum", "item_sum"],
    outputCol="features", handleInvalid="keep")

output_test = vector_test.transform(test1)
output_test.select("features", col("purchase").alias("label")).show(2, truncate=False)

+----------------------------------------------+-----+
|features                                      |label|
+----------------------------------------------+-----+
|[2014.0,1.0,0.0015308075009567547,0.0,4.0,0.0]|null |
|[2014.0,1.0,0.0015527950310559005,0.0,4.0,0.0]|null |
+----------------------------------------------+-----+
only showing top 2 rows



In [31]:
train1 = train1.dropna()


In [32]:
vector = VectorAssembler(
    inputCols=[ "year", "content_type", "item_avg", "user_sum", "item_sum"],
    outputCol="features", handleInvalid="keep")

output = vector.transform(train1)
output.select("features", "label").show(2, truncate=False)

+------------------------+-----+
|features                |label|
+------------------------+-----+
|[2014.0,1.0,0.0,1.0,0.0]|0    |
|[2014.0,1.0,0.0,1.0,0.0]|0    |
+------------------------+-----+
only showing top 2 rows



In [None]:
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(output)


In [None]:
predictions = gbtModel.transform(output_test)


In [None]:
firstelement=udf(lambda v:float(v[1]),FloatType())


In [None]:

final = predictions.orderBy(col("user_id").asc(), col("item_id").asc())\
          .withColumn("id", f.monotonically_increasing_id())\
           .select(col("id").alias(""), "user_id", "item_id", firstelement("probability").alias("purchase"))

           
                

DataFrame[: bigint, user_id: string, item_id: string, purchase: float]

In [None]:
final.toPandas().to_csv('lab03.csv')

In [None]:
final.coalesce(1).write.csv('lab032')

In [None]:
!hdfs dfs -copyToLocal lab031

In [None]:
gbtModel.featureImportances

In [None]:
spark.stop()