In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 6 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql.types import *


from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier


from pyspark.sql.functions import udf



conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())


In [3]:
schema_t = StructType([StructField('user_id',LongType(),True)
                       ,StructField('item_id',LongType(),True)
                       ,StructField('purchase',IntegerType(),True)               
                      ]) 

schema_i = StructType([StructField('item_id',LongType(),True)
                       ,StructField('channel_id',StringType(),True)
                       ,StructField('datetime_availability_start',StringType(),True)
                       ,StructField('datetime_availability_stop',StringType(),True)
                       ,StructField('datetime_show_start',StringType(),True) 
                       ,StructField('datetime_show_stop',StringType(),True) 
                       ,StructField('content_type',IntegerType(),True)
                       ,StructField('title',StringType(),True) 
                       ,StructField('year',StringType(),True)
                       ,StructField('genres',StringType(),True)
                       ,StructField('region_id',StringType(),True)
                      ]) 

schema_v = StructType([StructField('user_id',LongType(),True)
                       ,StructField('item_id',LongType(),True)
                       ,StructField('ts_start',StringType(),True)
                       ,StructField('ts_end',StringType(),True) 
                       ,StructField('item_type',StringType(),True) 
                      ]) 

In [4]:
train_df = spark.read.csv("/labs/slaba03/laba03_train.csv", header=True, schema=schema_t)#, sep = '\t'
test_df = spark.read.csv("/labs/slaba03/laba03_test.csv", header=True, schema=schema_t)#, sep = '\t'
items_df = spark.read.csv("/labs/slaba03/laba03_items.csv", header=True, sep = '\t', schema=schema_i)
views_df = spark.read.csv("/labs/slaba03/laba03_views_programmes.csv", header=True, schema=schema_v)#, sep = '\t')

In [5]:
train_df.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [6]:
train_df.groupBy('purchase').count().show()

+--------+-------+
|purchase|  count|
+--------+-------+
|       1|  10904|
|       0|5021720|
+--------+-------+



In [7]:
views_df.groupby('item_type').count().show()

+---------+--------+
|item_type|   count|
+---------+--------+
|     live|17704201|
|      pvr| 3141406|
+---------+--------+



In [8]:
test_df.select(F.countDistinct('user_id')).show()

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                   1941|
+-----------------------+



In [9]:
features_df = train_df.union(test_df)
features_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- purchase: integer (nullable = true)



In [10]:
features_df = (
    features_df
    .join(items_df,on='item_id',how='left')
    .select(features_df['user_id']
            ,features_df['item_id']
            ,'genres'
            ,'year'
            ,F.col('purchase').alias('target')
            ,F.col('datetime_availability_start').alias('avail_start_dt')
            ,F.col('datetime_availability_stop').alias('avail_stop_dt'))
)
features_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- genres: string (nullable = true)
 |-- year: string (nullable = true)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = true)
 |-- avail_stop_dt: string (nullable = true)



In [11]:
features_df = features_df.fillna( { 'genres':'no_genre'} )
features_df = features_df.fillna( { 'year':'no_year'} )
features_df = features_df.fillna( { 'avail_start_dt':'no_avail_start_dt'} )
features_df = features_df.fillna( { 'avail_stop_dt':'no_avail_stop_dt'} )


features_df = features_df.withColumn('genres',F.lower(F.col('genres')))
features_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- genres: string (nullable = false)
 |-- year: string (nullable = false)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = false)
 |-- avail_stop_dt: string (nullable = false)



In [12]:
# user
user_purchases = (
    features_df 
    .groupBy('user_id')
    .agg(F.sum('target').alias('user_pays'),
         F.count('*').alias('user_cnt')
        )
    .select('user_id','user_pays','user_cnt',(F.col('user_pays') / F.col('user_cnt')).alias('user_pay_to_all'))
)
user_purchases.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- user_pays: long (nullable = true)
 |-- user_cnt: long (nullable = false)
 |-- user_pay_to_all: double (nullable = true)



In [13]:
# item
item_purchases = (
    features_df 
    .groupBy('item_id')
    .agg(F.sum('target').alias('item_pays'),
         F.count('*').alias('item_cnt')
        )
    .select('item_id','item_pays','item_cnt',(F.col('item_pays') / F.col('item_cnt')).alias('item_pay_to_all'))
)  
item_purchases.printSchema()

root
 |-- item_id: long (nullable = true)
 |-- item_pays: long (nullable = true)
 |-- item_cnt: long (nullable = false)
 |-- item_pay_to_all: double (nullable = true)



In [14]:
features_df = features_df.join(user_purchases,on='user_id',how='left')
features_df = features_df.join(item_purchases,on='item_id',how='left')

In [15]:
features_df.printSchema()

root
 |-- item_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- genres: string (nullable = false)
 |-- year: string (nullable = false)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = false)
 |-- avail_stop_dt: string (nullable = false)
 |-- user_pays: long (nullable = true)
 |-- user_cnt: long (nullable = true)
 |-- user_pay_to_all: double (nullable = true)
 |-- item_pays: long (nullable = true)
 |-- item_cnt: long (nullable = true)
 |-- item_pay_to_all: double (nullable = true)



In [16]:
views_df = views_df.withColumn('time',views_df.ts_end-views_df.ts_start)
views_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- ts_start: string (nullable = true)
 |-- ts_end: string (nullable = true)
 |-- item_type: string (nullable = true)
 |-- time: double (nullable = true)



In [17]:
user_views = (
    views_df
    .groupBy('user_id')
    .agg(F.count('*').alias('user_num_views'),
         F.sum('time').alias('user_total_view_time')
        )
)
user_views.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- user_num_views: long (nullable = false)
 |-- user_total_view_time: double (nullable = true)



In [18]:
user_type_views = (
    views_df.groupBy('user_id','item_type')
    .agg(F.count('*').alias('user_type_view_cnt'),
         F.sum('time').alias('user_type_view_time')
    )
)
user_type_views.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_type: string (nullable = true)
 |-- user_type_view_cnt: long (nullable = false)
 |-- user_type_view_time: double (nullable = true)



In [19]:
features_df = (
    features_df
    .join(user_views,on='user_id',how='left')
)
features_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- genres: string (nullable = false)
 |-- year: string (nullable = false)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = false)
 |-- avail_stop_dt: string (nullable = false)
 |-- user_pays: long (nullable = true)
 |-- user_cnt: long (nullable = true)
 |-- user_pay_to_all: double (nullable = true)
 |-- item_pays: long (nullable = true)
 |-- item_cnt: long (nullable = true)
 |-- item_pay_to_all: double (nullable = true)
 |-- user_num_views: long (nullable = true)
 |-- user_total_view_time: double (nullable = true)



In [20]:
features_df = (
    features_df
    .join(user_type_views.where("item_type=='live'")
          .select('user_id'
                  ,col('user_type_view_cnt').alias('user_num_live_views')
                  ,col('user_type_view_time').alias('user_time_live_views')),on='user_id',how='left')
)
features_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- genres: string (nullable = false)
 |-- year: string (nullable = false)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = false)
 |-- avail_stop_dt: string (nullable = false)
 |-- user_pays: long (nullable = true)
 |-- user_cnt: long (nullable = true)
 |-- user_pay_to_all: double (nullable = true)
 |-- item_pays: long (nullable = true)
 |-- item_cnt: long (nullable = true)
 |-- item_pay_to_all: double (nullable = true)
 |-- user_num_views: long (nullable = true)
 |-- user_total_view_time: double (nullable = true)
 |-- user_num_live_views: long (nullable = true)
 |-- user_time_live_views: double (nullable = true)



In [21]:
features_df = (
    features_df
    .join(user_type_views.where("item_type=='pvr'")
          .select('user_id'
                  ,col('user_type_view_cnt').alias('user_num_pvr_views')
                  ,col('user_type_view_time').alias('user_time_pvr_views')),on='user_id',how='left')
)
features_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- genres: string (nullable = false)
 |-- year: string (nullable = false)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = false)
 |-- avail_stop_dt: string (nullable = false)
 |-- user_pays: long (nullable = true)
 |-- user_cnt: long (nullable = true)
 |-- user_pay_to_all: double (nullable = true)
 |-- item_pays: long (nullable = true)
 |-- item_cnt: long (nullable = true)
 |-- item_pay_to_all: double (nullable = true)
 |-- user_num_views: long (nullable = true)
 |-- user_total_view_time: double (nullable = true)
 |-- user_num_live_views: long (nullable = true)
 |-- user_time_live_views: double (nullable = true)
 |-- user_num_pvr_views: long (nullable = true)
 |-- user_time_pvr_views: double (nullable = true)



In [22]:
features_df2 = features_df.fillna( { 'user_num_views':0,
                                   'user_total_view_time':0,
                                   'user_num_live_views':0,
                                   'user_time_live_views':0,
                                   'user_num_pvr_views':0,
                                   'user_time_pvr_views':0} )
features_df2.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- genres: string (nullable = false)
 |-- year: string (nullable = false)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = false)
 |-- avail_stop_dt: string (nullable = false)
 |-- user_pays: long (nullable = true)
 |-- user_cnt: long (nullable = true)
 |-- user_pay_to_all: double (nullable = true)
 |-- item_pays: long (nullable = true)
 |-- item_cnt: long (nullable = true)
 |-- item_pay_to_all: double (nullable = true)
 |-- user_num_views: long (nullable = false)
 |-- user_total_view_time: double (nullable = false)
 |-- user_num_live_views: long (nullable = false)
 |-- user_time_live_views: double (nullable = false)
 |-- user_num_pvr_views: long (nullable = false)
 |-- user_time_pvr_views: double (nullable = false)



In [23]:
features_df = features_df2.select('user_id', 'item_id', 'year', 'target', 'avail_start_dt', 'avail_stop_dt', 'user_pays', 'user_cnt', 'user_pay_to_all', 'item_pays', 'item_cnt', 'item_pay_to_all', 'user_num_views', 'user_total_view_time', 'user_num_live_views', 'user_time_live_views', 'user_num_pvr_views', 'user_time_pvr_views')
features_df2.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- genres: string (nullable = false)
 |-- year: string (nullable = false)
 |-- target: integer (nullable = true)
 |-- avail_start_dt: string (nullable = false)
 |-- avail_stop_dt: string (nullable = false)
 |-- user_pays: long (nullable = true)
 |-- user_cnt: long (nullable = true)
 |-- user_pay_to_all: double (nullable = true)
 |-- item_pays: long (nullable = true)
 |-- item_cnt: long (nullable = true)
 |-- item_pay_to_all: double (nullable = true)
 |-- user_num_views: long (nullable = false)
 |-- user_total_view_time: double (nullable = false)
 |-- user_num_live_views: long (nullable = false)
 |-- user_time_live_views: double (nullable = false)
 |-- user_num_pvr_views: long (nullable = false)
 |-- user_time_pvr_views: double (nullable = false)



In [24]:
year_indexer = StringIndexer(inputCol="year", outputCol="yearIndex")
avail_start_indexer = StringIndexer(inputCol="avail_start_dt", outputCol="avail_start_dtIndex")
avail_stop_indexer = StringIndexer(inputCol="avail_stop_dt", outputCol="avail_stop_dtIndex")


OHE_year_vector = OneHotEncoder(inputCol="yearIndex", outputCol="year_vec")
OHE_avail_start_dt_vector = OneHotEncoder(inputCol="avail_start_dtIndex", outputCol="avail_start_dt_vec")
OHE_avail_stop_dt_vector = OneHotEncoder(inputCol="avail_stop_dtIndex", outputCol="avail_stop_dt_vec")


features = [a for a in features_df.columns if a not in ['item_id', 'user_id', 'target','year','avail_start_dt','avail_stop_dt']]
features = features + ['year_vec','avail_start_dt_vec','avail_stop_dt_vec']
assembler = VectorAssembler(inputCols=features, outputCol='features')

In [25]:
pipeline = Pipeline(stages=[year_indexer,
                            avail_start_indexer,
                            avail_stop_indexer,
                            OHE_year_vector,
                            OHE_avail_start_dt_vector,
                            OHE_avail_stop_dt_vector,
                            assembler
                    ])

In [26]:
transf_feats = pipeline.fit(features_df).transform(features_df)

In [27]:
transf_feats.rdd.getNumPartitions()

200

In [28]:
transf_feats = transf_feats.coalesce(50)

In [29]:
transf_feats = transf_feats.select('user_id','item_id','features','target')

In [30]:
train = transf_feats.where('target is not null')
test = transf_feats.where('target is null')

In [31]:
lr = LogisticRegression(featuresCol='features'
                        , labelCol="target"
                        , maxIter=30
                        ,regParam=0.01)

In [32]:
lr_model = lr.fit(train)

In [33]:
sparse_values = udf(lambda v: v.values.tolist(), ArrayType(DoubleType()))

In [34]:
predictions_prom_lr = lr_model.transform(test)

In [35]:
predictions_fin_lr = predictions_prom_lr.select('user_id','item_id','probability').withColumn("proba", sparse_values("probability"))

In [36]:
predictions_fin_lr.rdd.getNumPartitions()

50

In [37]:
res = predictions_fin_lr.select('user_id','item_id',F.col('proba').getItem(1).alias('purchase')).orderBy(F.col('user_id').asc(),F.col('item_id').asc())
res.count()

2156840

In [38]:
res.show(5)

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
|   1654|    336|0.001504998055953...|
|   1654|    678|0.001482745401259...|
|   1654|    691|0.001453873659942103|
|   1654|    696|0.001700678351169...|
|   1654|    763|0.001584054110551...|
+-------+-------+--------------------+
only showing top 5 rows



In [39]:
res.toPandas().to_csv('lab03.csv')

In [45]:
spark.stop()