In [1]:
import glob
import os
import sys

# Point PySpark at the cluster's Python interpreter and Spark 2 client.
# These must be set before `pyspark` is imported below.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]
spark_python = os.path.join(spark_home, 'python')

# Make the pyspark package shipped with the Spark client importable, together
# with its bundled py4j archive. The archive is located by pattern rather than
# by a pinned version so that a different py4j release under SPARK_HOME does
# not silently leave a non-existent entry on sys.path.
sys.path.insert(0, spark_python)
for py4j_zip in sorted(glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

# Create (or reuse) the session for this lab.
spark = SparkSession.builder.config(conf=conf).appName("dmitriy.sokolov lab03").getOrCreate()

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pyspark.ml.regression import GBTRegressor, RandomForestRegressor
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.types import *

# Directory Spark uses when a DataFrame/RDD is checkpointed.
checkpoint_dir = 'checkpoint/'
spark.sparkContext.setCheckpointDir(checkpoint_dir)

In [3]:
from pyspark.sql.functions import col, expr, when, lower
from urllib.parse import urlparse
# import pyspark.sql.functions as f

from pyspark.sql.functions import lit


In [4]:
from pyspark.sql.types import *

In [5]:
# Short alias for the session's SparkContext; used to inspect and later stop it.
sc = spark.sparkContext

In [6]:
# Bare expression: lets the notebook render the SparkContext summary.
sc

In [7]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [8]:
# List the lab's input files on HDFS.
! hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


## **Items**

In [9]:
# Peek at the first two item records (tab-separated, with a header row),
# printed vertically and untruncated so long titles stay readable.
items_preview = spark.read.csv("/labs/slaba03/laba03_items.csv", sep="\t", header=True)
items_preview.show(2, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------
 item_id                     | 65667                                                    
 channel_id                  | null                                                     
 datetime_availability_start | 1970-01-01T00:00:00Z                                     
 datetime_availability_stop  | 2018-01-01T00:00:00Z                                     
 datetime_show_start         | null                                                     
 datetime_show_stop          | null                                                     
 content_type                | 1                                                        
 title                       | на пробах только девушки (all girl auditions)            
 year                        | 2013.0                                                   
 genres                      | Эротика                                                  
 region_id           

In [10]:
# Load the item catalogue with inferred column types and keep it cached.
items = (
    spark.read
    .option("sep", "\t")
    .option("header", True)
    .option("inferSchema", "true")
    .csv("/labs/slaba03/laba03_items.csv")
    .cache()
)
items.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- channel_id: double (nullable = true)
 |-- datetime_availability_start: timestamp (nullable = true)
 |-- datetime_availability_stop: timestamp (nullable = true)
 |-- datetime_show_start: timestamp (nullable = true)
 |-- datetime_show_stop: timestamp (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: double (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: double (nullable = true)



## **Test**

In [11]:
# Raw look at the test file with default reader options: no header handling,
# so the header line is shown as an ordinary row.
spark.read.format("csv").load("/labs/slaba03/laba03_test.csv").show(2)

+-------+-------+--------+
|    _c0|    _c1|     _c2|
+-------+-------+--------+
|user_id|item_id|purchase|
|   1654|  94814|    null|
+-------+-------+--------+
only showing top 2 rows



In [12]:
# Load the (user_id, item_id) pairs to score; `purchase` is the column to predict.
test = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", "true")
    .csv("/labs/slaba03/laba03_test.csv")
    .cache()
)
test.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: string (nullable = true)



## **Train**

In [13]:
# Load the labelled (user_id, item_id, purchase) rows and cache the raw read.
train_raw = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", "true")
    .csv("/labs/slaba03/laba03_train.csv")
    .cache()
)
# Replace nulls in numeric columns with the sentinel -1.
train = train_raw.na.fill(-1)
train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



## **Views_programmes**

In [15]:
# Load the viewing log and derive the duration of every viewing event.
# NOTE(review): nulls are replaced by -1 *before* the subtraction, so a row
# with a missing ts_start or ts_end would get a distorted time_spend —
# confirm the data has no such rows.
views = (
    spark.read
    .csv("/labs/slaba03/laba03_views_programmes.csv", sep=",", inferSchema="true", header=True)
    .cache()
    .na.fill(-1)
    .withColumn("time_spend", col("ts_end") - col("ts_start"))
)

In [16]:
# Per-user viewing features: total time spent and number of viewing events.
# Both are computed in one aggregation pass instead of two separate groupBy
# jobs that are then joined back together on user_id. Output column names
# ("sum(time_spend)", "count") are kept because later cells rename/use them.
#
# NOTE: `views` is rebound from the event-level table to the per-user
# aggregate, so this cell cannot be re-run without re-running the load cell.
from pyspark.sql import functions as F

views = views.groupBy('user_id').agg(
    F.sum('time_spend').alias('sum(time_spend)'),
    F.count('*').alias('count'),
)
views.show(2)

+-------+---------------+-----+
|user_id|sum(time_spend)|count|
+-------+---------------+-----+
| 561425|          37699|    5|
| 612390|           4406|    1|
+-------+---------------+-----+
only showing top 2 rows



## **Modify train & test**

In [17]:
# Purchase totals per user and per item. Despite the `avg` in the names these
# are sums; the names and the "sum(purchase)" column are kept as-is because
# the test-feature cell joins against these same frames.
train_avg_purchase = train.groupBy('user_id').sum('purchase')
train_avg_item = train.groupBy('item_id').sum('purchase')

# NOTE(review): the totals are computed over the very rows they are joined
# back onto, so every training row's sum_user/sum_item includes its own label
# (target leakage). Consider leave-one-out or out-of-fold aggregates.
mod_train = (
    train
    .join(train_avg_purchase, on="user_id", how="left")
    .withColumnRenamed("sum(purchase)", "sum_user")
    .join(train_avg_item, on="item_id", how="left")
    .withColumnRenamed("sum(purchase)", "sum_item")
    .join(views, on="user_id", how="left")
    .withColumnRenamed("sum(time_spend)", "time_spend")
    # Users without viewing history get 0 for the missing features. The fill
    # value used to be written as 0.5, but na.fill casts the replacement to each
    # column's type and all numeric columns here are integer/long, so 0.5 was
    # silently truncated to 0. Writing 0 states what actually happens.
    .na.fill(0)
)

In [18]:
# Sanity-check the assembled training features.
mod_train.show()

+-------+-------+--------+--------+--------+----------+-----+
|user_id|item_id|purchase|sum_user|sum_item|time_spend|count|
+-------+-------+--------+--------+--------+----------+-----+
| 754230|   8389|       0|      72|       8|   2256455| 1164|
| 754230|   8638|       1|      72|       2|   2256455| 1164|
| 754230|  10817|       0|      72|       1|   2256455| 1164|
| 754230|  72820|       0|      72|       1|   2256455| 1164|
| 754230|  74757|       0|      72|       1|   2256455| 1164|
| 754230|  74820|       0|      72|       1|   2256455| 1164|
| 754230|  78113|       0|      72|       2|   2256455| 1164|
| 754230|  90019|       0|      72|       3|   2256455| 1164|
| 754230|  94851|       0|      72|       1|   2256455| 1164|
| 754230|  95080|       0|      72|       0|   2256455| 1164|
| 754230|  95940|       0|      72|       1|   2256455| 1164|
| 754230|  99817|       0|      72|       0|   2256455| 1164|
| 754230|   2027|       0|      72|       2|   2256455| 1164|
| 754230

In [19]:
# Attach to the test pairs the same per-user / per-item purchase totals
# (computed on train) and per-user viewing aggregates used for training.
mod_test = (
    test
    .join(train_avg_purchase, on="user_id", how="left")
    .withColumnRenamed("sum(purchase)", "sum_user")
    .join(train_avg_item, on="item_id", how="left")
    .withColumnRenamed("sum(purchase)", "sum_item")
    .join(views, on="user_id", how="left")
    .withColumnRenamed("sum(time_spend)", "time_spend")
    # Users/items unseen in train or in the viewing log get 0. The fill value
    # used to be written as 0.5, but na.fill casts the replacement to each
    # column's type and the numeric columns here are integer/long, so 0.5 was
    # silently truncated to 0. Writing 0 states what actually happens.
    .na.fill(0)
)

In [20]:
# Identifier and label columns are excluded; every remaining column becomes a feature.
ignore = ['purchase', 'item_id', 'user_id']
feature_cols = [name for name in mod_train.columns if name not in ignore]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Training keeps the label next to the feature vector; the test side keeps
# the ids needed to write the submission.
train_ = assembler.transform(mod_train).select("purchase", "features")
test_ = assembler.transform(mod_test).select('features', 'item_id', 'user_id')

In [21]:
# Check the label / feature-vector layout fed to the model.
train_.show(1)

+--------+--------------------+
|purchase|            features|
+--------+--------------------+
|       0|[72.0,1.0,2256455...|
+--------+--------------------+
only showing top 1 row



In [22]:
# Check the feature-vector / id layout of the rows to be scored.
test_.show(1)

+--------------------+-------+-------+
|            features|item_id|user_id|
+--------------------+-------+-------+
|[72.0,3.0,2256455...|  93486| 754230|
+--------------------+-------+-------+
only showing top 1 row



In [23]:
# Fit a random forest that regresses the 0/1 purchase label; its continuous
# output is used as the purchase score. The seed is fixed so that a
# Restart & Run All reproduces the same model and predictions.
rf_regressor = RandomForestRegressor(
    labelCol='purchase',
    numTrees=50,
    maxDepth=5,
    seed=42,
).fit(train_)

In [24]:
# Importance of each feature in the fitted forest, indexed by position in the 'features' vector.
rf_regressor.featureImportances

SparseVector(4, {0: 0.4286, 1: 0.484, 2: 0.041, 3: 0.0464})

In [25]:
# Score the test rows; the model appends a 'prediction' column.
rf_predictions=rf_regressor.transform(test_)


In [26]:
# Inspect the raw predictions.
rf_predictions.show()

+--------------------+-------+-------+--------------------+
|            features|item_id|user_id|          prediction|
+--------------------+-------+-------+--------------------+
|[72.0,3.0,2256455...|  93486| 754230|0.019882314562094945|
|[72.0,1.0,2256455...|  94819| 754230|0.009072317312185368|
|[72.0,1.0,2256455...|  73041| 754230|0.009072317312185368|
|[72.0,5.0,2256455...|  74440| 754230| 0.03676912006650483|
|[72.0,2.0,2256455...|  74452| 754230|0.013913501786130782|
|[72.0,1.0,2256455...|  93131| 754230|0.009072317312185368|
|[72.0,3.0,2256455...|  93633| 754230|0.019882314562094945|
|[72.0,2.0,2256455...|  95151| 754230|0.013913501786130782|
|[72.0,0.0,2256455...|  11025| 754230|0.007568492813458312|
|[72.0,0.0,2256455...|  72912| 754230|0.007568492813458312|
|[72.0,0.0,2256455...|  86406| 754230|0.007568492813458312|
|[72.0,6.0,2256455...|  88999| 754230| 0.03986912939661451|
|[72.0,4.0,2256455...|  93487| 754230| 0.02758758256486628|
|[72.0,0.0,2256455...|   9071| 754230|0.

In [27]:
# Rename the model output to the column name used in the submission file.
rf_predictions = rf_predictions.withColumnRenamed("prediction", "purchase")

In [28]:
# Replace any null numeric value with 0.5 (only numeric columns are touched).
# NOTE(review): the regressor is not expected to emit null predictions, so this
# is presumably a defensive no-op — confirm.
rf_predictions = rf_predictions.na.fill(0.5)

In [29]:
# Preview the three submission columns.
rf_predictions.select("user_id","item_id","purchase").show()

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
| 754230|  93486|0.019882314562094945|
| 754230|  94819|0.009072317312185368|
| 754230|  73041|0.009072317312185368|
| 754230|  74440| 0.03676912006650483|
| 754230|  74452|0.013913501786130782|
| 754230|  93131|0.009072317312185368|
| 754230|  93633|0.019882314562094945|
| 754230|  95151|0.013913501786130782|
| 754230|  11025|0.007568492813458312|
| 754230|  72912|0.007568492813458312|
| 754230|  86406|0.007568492813458312|
| 754230|  88999| 0.03986912939661451|
| 754230|  93487| 0.02758758256486628|
| 754230|   9071|0.007568492813458312|
| 754230|  72387|0.007568492813458312|
| 754230|  78269|0.007568492813458312|
| 754230|  93477|0.009072317312185368|
| 754230|  94726|0.013913501786130782|
| 754230|  10788| 0.03986912939661451|
| 754230|  11520|0.013913501786130782|
+-------+-------+--------------------+
only showing top 20 rows



In [30]:
# Sort the predictions by user, then by item, both ascending.
cols = ['user_id', 'item_id']
rf_predictions = rf_predictions.sort(*cols, ascending=True)

In [31]:
# Preview the submission columns after sorting.
rf_predictions.select("user_id","item_id","purchase").show()

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
|   1654|    336|6.819719739069844E-4|
|   1654|    678|6.819719739069844E-4|
|   1654|    691|6.819719739069844E-4|
|   1654|    696|8.241313955401766E-4|
|   1654|    763|6.819719739069844E-4|
|   1654|    795|0.003367037819940212|
|   1654|    861|6.819719739069844E-4|
|   1654|   1137|0.001044149494750...|
|   1654|   1159|8.241313955401766E-4|
|   1654|   1428|6.819719739069844E-4|
|   1654|   1685|8.241313955401766E-4|
|   1654|   1686|6.819719739069844E-4|
|   1654|   1704|8.957259043218441E-4|
|   1654|   2093|6.819719739069844E-4|
|   1654|   2343|6.819719739069844E-4|
|   1654|   2451|6.819719739069844E-4|
|   1654|   2469|0.004592723172531993|
|   1654|   2603|6.819719739069844E-4|
|   1654|   2609|6.819719739069844E-4|
|   1654|   2621|8.241313955401766E-4|
+-------+-------+--------------------+
only showing top 20 rows



In [32]:
# Collect the sorted predictions to the driver and write the submission file.
# NOTE(review): pandas also writes the row index as an unnamed first column
# by default — confirm that this is the layout the grader expects.
submission = rf_predictions.select("user_id", "item_id", "purchase").toPandas()
submission.to_csv("lab03.csv", header=True)

In [33]:
# Stop the SparkContext now that the submission file has been written.
sc.stop()