In [6]:
import os
import sys

# Point PySpark at the Python interpreter and the Spark distribution to use,
# and request three executors for the shell session.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Make the bundled pyspark and py4j sources importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's interactive-shell bootstrap in this namespace; the banner
# it prints reports the session as `spark`. The script is read through a
# context manager so the file handle is closed instead of being leaked.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [7]:
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover, VectorAssembler, VectorSizeHint
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [8]:
import re
import json

In [9]:
# Number of partitions Spark SQL uses when shuffling data.
PARTITIONS = 18
# Derive the setting from the constant so the two values cannot drift apart.
spark.conf.set("spark.sql.shuffle.partitions", str(PARTITIONS))

In [10]:
# Training interactions: both ids are read as strings, the purchase flag as an integer.
schema = (
    StructType()
    .add('user_id', StringType())
    .add('item_id', StringType())
    .add('purchase', IntegerType())
)
train = spark.read.csv(
    path='/labs/slaba03/laba03_train.csv',
    header=True,
    schema=schema,
)
train.show(n=5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [11]:
# Test pairs to score: the same id columns as the training file, without a label.
schema = (
    StructType()
    .add('user_id', StringType())
    .add('item_id', StringType())
)
test = spark.read.csv(
    path='/labs/slaba03/laba03_test.csv',
    header=True,
    schema=schema,
)
test.show(n=5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



In [12]:
# Item catalogue, read as a tab-separated file with an explicit schema.
schema = (
    StructType()
    .add('item_id', StringType())
    .add('channel_id', StringType())
    .add('datetime_availability_start', DateType())
    .add('datetime_availability_stop', DateType())
    .add('datetime_show_start', DateType())
    .add('datetime_show_stop', DateType())
    .add('content_type', IntegerType())
    .add('title', StringType())
    .add('year', FloatType())
    .add('genres', StringType())
    .add('region_id', IntegerType())
)
# NOTE(review): multiLine=True is presumably needed because field values such as
# `title` can contain embedded line breaks — confirm against the raw file.
items = spark.read.csv(
    path='/labs/slaba03/laba03_items.csv',
    schema=schema,
    sep='\t',
    header=True,
    multiLine=True,
)
items.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------
 item_id                     | 65667                                                                                  
 channel_id                  | null                                                                                   
 datetime_availability_start | 1970-01-01                                                                             
 datetime_availability_stop  | 2018-01-01                                                                             
 datetime_show_start         | null                                                                                   
 datetime_show_stop          | null                                                                                   
 content_type                | 1                                                                                      
 title                       | на пробах только 

In [13]:
# Viewing log: one row per (user, item) viewing interval.
# NOTE(review): ts_start / ts_end look like Unix epoch seconds in the sample
# output — confirm before doing any time arithmetic on them.
schema = (
    StructType()
    .add('user_id', StringType())
    .add('item_id', StringType())
    .add('ts_start', LongType())
    .add('ts_end', LongType())
    .add('item_type', StringType())
)
views = spark.read.csv(
    path='/labs/slaba03/laba03_views_programmes.csv',
    schema=schema,
    header=True,
)
views.show(n=5, truncate=False, vertical=True)

-RECORD 0---------------
 user_id   | 0          
 item_id   | 7101053    
 ts_start  | 1491409931 
 ts_end    | 1491411600 
 item_type | live       
-RECORD 1---------------
 user_id   | 0          
 item_id   | 7101054    
 ts_start  | 1491412481 
 ts_end    | 1491451571 
 item_type | live       
-RECORD 2---------------
 user_id   | 0          
 item_id   | 7101054    
 ts_start  | 1491411640 
 ts_end    | 1491412481 
 item_type | live       
-RECORD 3---------------
 user_id   | 0          
 item_id   | 6184414    
 ts_start  | 1486191290 
 ts_end    | 1486191640 
 item_type | live       
-RECORD 4---------------
 user_id   | 257        
 item_id   | 4436877    
 ts_start  | 1490628499 
 ts_end    | 1490630256 
 item_type | live       
only showing top 5 rows



In [14]:
# Partition count of each loaded frame, in the order train, test, views, items.
tuple(frame.rdd.getNumPartitions() for frame in (train, test, views, items))

(3, 3, 7, 1)

In [15]:
# Row counts per purchase label in the training set, paired with the test-set size.
(
    train.groupBy('purchase').count().collect(),
    test.count(),
)

([Row(purchase=1, count=10904), Row(purchase=0, count=5021720)], 2156840)

In [16]:
from pyspark.ml.recommendation import ALS

In [17]:
from pyspark.ml.regression import GBTRegressor

In [18]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler, CountVectorizer
from pyspark.ml import Pipeline

In [19]:
# Re-type the string id columns as longs; these are the columns later passed to
# ALS as userCol / itemCol.
train = train.select(
    'purchase',
    *[f.col(id_name).cast(LongType()) for id_name in ('user_id', 'item_id')]
)
test = test.select(
    *[f.col(id_name).cast(LongType()) for id_name in ('user_id', 'item_id')]
)

In [20]:
# ALS in implicit-feedback mode, using the binary `purchase` flag as the
# preference signal for each (user_id, item_id) pair.
als = ALS(
    userCol="user_id",
    itemCol="item_id",
    ratingCol='purchase',
    implicitPrefs=True,
    alpha=0.01,
    rank=10,
    maxIter=10,
    seed=42,
)

In [21]:
# Fit the ALS estimator on the training interactions.
model = als.fit(train)

In [22]:
# Score the test pairs with the fitted ALS model; the result carries a
# `prediction` column next to user_id / item_id.
predictions = model.transform(test)

In [23]:
# MinMaxScaler works on a vector column, so the scalar ALS `prediction` is first
# wrapped into a one-element vector and then rescaled.
assembler = VectorAssembler(
    inputCols=['prediction'],
    outputCol="prediction_vect",
)
scaler = MinMaxScaler(
    inputCol=assembler.getOutputCol(),
    outputCol="prediction_scaled",
)
pipeline = Pipeline(stages=[assembler, scaler])

In [24]:
def _first_component(vector):
    """Return the first element of ``vector`` as a Python float."""
    return float(list(vector)[0])


# UDF that unwraps a one-element vector column back into a double column.
unlist = f.udf(_first_component, DoubleType())

In [25]:
# Fit the assembler + scaler on the ALS scores, store the scaled value as
# `purchase`, and discard the intermediate vector columns.
predictions = (
    pipeline.fit(predictions)
    .transform(predictions)
    .withColumn("purchase", unlist("prediction_scaled"))
    .drop("prediction_scaled", "prediction_vect")
)

In [26]:
# Preview the scored test rows: raw ALS `prediction` next to the scaled `purchase`.
predictions.show()

+-------+-------+--------------+-------------------+
|user_id|item_id|    prediction|           purchase|
+-------+-------+--------------+-------------------+
| 632495|    546|   9.780827E-4|0.24061553887636625|
| 820048|    546| -1.9151293E-4|0.23944475576633972|
| 822223|    546|-1.6651055E-15| 0.2396364631415825|
| 829957|    546|           0.0|0.23963646314158416|
| 831698|    546|   6.148561E-4|0.24025194350646872|
| 841994|    546| -2.8825822E-5|0.23960760805208756|
| 848670|    546|  1.857072E-15|0.23963646314158601|
| 862829|    546| -1.5989634E-5|0.23962045727317854|
| 864002|    546| -2.1148409E-4|0.23942476432400228|
| 867939|    546|  1.3248062E-4|0.23976907827574892|
| 868176|    546|  -4.857222E-4| 0.2391502477614563|
| 872983|    546| -2.5424012E-4| 0.2393819648823526|
| 874709|    546|           0.0|0.23963646314158416|
| 886192|    546| 1.15136645E-4| 0.2397517166891163|
| 886854|    546|           0.0|0.23963646314158416|
| 891912|    546| -8.1380174E-5|0.239555000338

In [27]:
# Collect the ALS-only scores to the driver, ordered by (user_id, item_id), and
# write them to lab03.csv via pandas.
(
    predictions.select('user_id', 'item_id', 'purchase')
    .orderBy('user_id', 'item_id')
    .coalesce(1)
    .toPandas()
    .to_csv('lab03.csv')
)

In [28]:
# Total purchases per user and per item, computed from the training labels.
user_purchased = (
    train.groupBy('user_id')
    .agg(f.sum('purchase').alias('user_times_purchased'))
)
item_purchased = (
    train.groupBy('item_id')
    .agg(f.sum('purchase').alias('item_times_purchased'))
)

In [29]:
# Score the training pairs with the fitted ALS model so its `prediction` can be
# used as an input feature downstream.
train_f = model.transform(train)

In [30]:
# Attach the per-user and per-item purchase counts to both frames.
# train_f is rebuilt from model.transform(train) instead of from itself, so
# re-running this cell does not join the count columns a second time
# (transformations are lazy, so this costs nothing extra on the first run).
train_f = (
    model.transform(train)
    .join(user_purchased, ['user_id'], how='left')
    .join(item_purchased, ['item_id'], how='left')
)
# Left joins keep every test row; ids with no match get null counts.
test_f = (
    predictions.select('user_id', 'item_id', 'prediction')
    .join(user_purchased, ['user_id'], how='left')
    .join(item_purchased, ['item_id'], how='left')
)

In [31]:
# Inspect the column names and types of the joined training frame.
train_f.printSchema()

root
 |-- item_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- prediction: float (nullable = false)
 |-- user_times_purchased: long (nullable = true)
 |-- item_times_purchased: long (nullable = true)



In [32]:
# Inspect the column names and types of the joined test frame.
test_f.printSchema()

root
 |-- item_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- prediction: float (nullable = false)
 |-- user_times_purchased: long (nullable = true)
 |-- item_times_purchased: long (nullable = true)



In [33]:
# Pack the ALS score and the two purchase counts into a single `features` vector.
feat_assembler = VectorAssembler(
    inputCols=[
        'prediction',
        'user_times_purchased',
        'item_times_purchased',
    ],
    outputCol="features",
)

In [34]:
# Replace missing values with -1 and assemble the feature vector.
# The assembler's output column is dropped first so the cell can be re-run:
# without it a second run fails because `features` already exists.
# drop() is a no-op when the column is absent, so the first run is unchanged.
train_f = feat_assembler.transform(
    train_f.drop(feat_assembler.getOutputCol()).fillna(-1)
)
test_f = feat_assembler.transform(
    test_f.drop(feat_assembler.getOutputCol()).fillna(-1)
)

In [35]:
# Gradient-boosted regressor trained on the assembled features against the
# `purchase` label; its output is written to a `prob` column.
gbt = GBTRegressor(
    labelCol='purchase',
    featuresCol='features',
    predictionCol='prob',
    seed=42,
)
gbt_model = gbt.fit(train_f)
test_predictions = gbt_model.transform(test_f)

In [36]:
# Export the GBT scores as `purchase`, ordered by (user_id, item_id).
# This writes to the same lab03.csv path as the earlier ALS-only export.
(
    test_predictions.select('user_id', 'item_id', f.col('prob').alias('purchase'))
    .orderBy('user_id', 'item_id')
    .coalesce(1)
    .toPandas()
    .to_csv('lab03.csv')
)

In [40]:
# Shut down the Spark context to release cluster resources.
# NOTE(review): `sc` is not assigned anywhere in this notebook; presumably it is
# created by the exec'd pyspark shell.py bootstrap — confirm.
sc.stop()