In [1]:
#%%

import os
import sys
# Point PySpark at the interpreter and Spark installation used by this notebook,
# and ask for six executors when the shell session is launched.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 6 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute Spark's interactive-shell bootstrap in this namespace (the captured
# output reports "SparkSession available as 'spark'").
# The file is opened in a `with` block so the handle is closed deterministically,
# and compiled with its real path so tracebacks point at shell.py, not "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))


#%%

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql.types import *


from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier


from pyspark.sql.functions import udf



# Empty configuration object: no settings are overridden here.
conf = SparkConf()

# Assemble the session builder step by step, then obtain the session.
# NOTE(review): the shell bootstrap above already created a session, so
# getOrCreate() presumably returns that one - confirm if appName matters.
session_builder = SparkSession.builder
session_builder = session_builder.config(conf=conf)
session_builder = session_builder.appName("test")
spark = session_builder.getOrCreate()


def _nullable_schema(fields):
    """Build a StructType from (name, data type) pairs, every field nullable."""
    return StructType([StructField(name, dtype, True) for name, dtype in fields])


# Schema applied to both the train and the test interaction files.
schema_t = _nullable_schema([
    ('user_id', LongType()),
    ('item_id', LongType()),
    ('purchase', IntegerType()),
])

# Schema of the item catalogue; dates and year are read as plain strings.
schema_i = _nullable_schema([
    ('item_id', LongType()),
    ('channel_id', StringType()),
    ('datetime_availability_start', StringType()),
    ('datetime_availability_stop', StringType()),
    ('datetime_show_start', StringType()),
    ('datetime_show_stop', StringType()),
    ('content_type', IntegerType()),
    ('title', StringType()),
    ('year', StringType()),
    ('genres', StringType()),
    ('region_id', StringType()),
])

# Schema of the viewing log; both timestamps are read as strings.
schema_v = _nullable_schema([
    ('user_id', LongType()),
    ('item_id', LongType()),
    ('ts_start', StringType()),
    ('ts_end', StringType()),
    ('item_type', StringType()),
])


# Option shared by all four reads: the first line of each file is a header.
csv_opts = dict(header=True)

# Train and test interactions use the same schema.
train_df = spark.read.csv("/labs/slaba03/laba03_train.csv", schema=schema_t, **csv_opts)
test_df = spark.read.csv("/labs/slaba03/laba03_test.csv", schema=schema_t, **csv_opts)
# The item catalogue is the only input read with a tab separator.
items_df = spark.read.csv("/labs/slaba03/laba03_items.csv", sep='\t', schema=schema_i, **csv_opts)
views_df = spark.read.csv("/labs/slaba03/laba03_views_programmes.csv", schema=schema_v, **csv_opts)


# Peek at the first training rows.
train_df.show(5)

# Row counts per label value and per view type.
purchase_counts = train_df.groupBy('purchase').count()
purchase_counts.show()
item_type_counts = views_df.groupby('item_type').count()
item_type_counts.show()

# Number of distinct users present in the test file.
test_user_count = test_df.select(F.countDistinct('user_id'))
test_user_count.show()

# Stack train and test rows (both read with schema_t) so that every feature
# below is computed once for both sets.
features_df = train_df.union(test_df)
features_df.printSchema()
# Attach item attributes to every (user, item) row; rows whose item is missing
# from the catalogue are kept by the left join.
with_items = features_df.join(items_df, on='item_id', how='left')
features_df = with_items.select(
    features_df['user_id'],
    features_df['item_id'],
    'genres',
    'year',
    F.col('purchase').alias('target'),
    F.col('datetime_availability_start').alias('avail_start_dt'),
    F.col('datetime_availability_stop').alias('avail_stop_dt'),
)
features_df.printSchema()

# Replace missing item attributes with explicit placeholder categories,
# all columns in a single fillna call.
features_df = features_df.fillna({
    'genres': 'no_genre',
    'year': 'no_year',
    'avail_start_dt': 'no_avail_start_dt',
    'avail_stop_dt': 'no_avail_stop_dt',
})

# Normalise genre strings to lower case.
lowered_genres = F.lower(F.col('genres'))
features_df = features_df.withColumn('genres', lowered_genres)
features_df.printSchema()

def _purchase_stats(df, key, pays_col, cnt_col, rate_col):
    """Aggregate `df` per `key`: sum of `target`, row count, and their ratio.

    Returns a frame with the columns (key, pays_col, cnt_col, rate_col).
    """
    totals = df.groupBy(key).agg(
        F.sum('target').alias(pays_col),
        F.count('*').alias(cnt_col),
    )
    rate = (F.col(pays_col) / F.col(cnt_col)).alias(rate_col)
    return totals.select(key, pays_col, cnt_col, rate)


# NOTE(review): these statistics are computed from `target` over the same rows
# that are later used for training, so each row's own label contributes to its
# features - possible label leakage; confirm this is intended. The row count
# also appears to include the unlabeled test rows added by the union.
user_purchases = _purchase_stats(
    features_df, 'user_id', 'user_pays', 'user_cnt', 'user_pay_to_all')
user_purchases.printSchema()

item_purchases = _purchase_stats(
    features_df, 'item_id', 'item_pays', 'item_cnt', 'item_pay_to_all')
item_purchases.printSchema()

# Attach the per-user and then the per-item statistics to every row.
features_df = features_df.join(user_purchases, on='user_id', how='left')
features_df = features_df.join(item_purchases, on='item_id', how='left')
features_df.printSchema()
# Duration of each view. Both timestamp columns are declared as strings in
# schema_v, so this subtraction relies on Spark's implicit numeric coercion.
view_duration = F.col('ts_end') - F.col('ts_start')
views_df = views_df.withColumn('time', view_duration)
views_df.printSchema()

# Per-user totals over all views.
views_by_user = views_df.groupBy('user_id')
user_views = views_by_user.agg(
    F.count('*').alias('user_num_views'),
    F.sum('time').alias('user_total_view_time'),
)
user_views.printSchema()

# The same totals, split by view type.
views_by_user_and_type = views_df.groupBy('user_id', 'item_type')
user_type_views = views_by_user_and_type.agg(
    F.count('*').alias('user_type_view_cnt'),
    F.sum('time').alias('user_type_view_time'),
)
user_type_views.printSchema()
# Overall viewing activity per user.
features_df = features_df.join(user_views, on='user_id', how='left')
features_df.printSchema()

# Per-type viewing activity: one pair of columns for each view type, joined
# in the order live, then pvr.
for type_filter, cnt_name, time_name in (
    ("item_type=='live'", 'user_num_live_views', 'user_time_live_views'),
    ("item_type=='pvr'", 'user_num_pvr_views', 'user_time_pvr_views'),
):
    per_type_views = user_type_views.where(type_filter).select(
        'user_id',
        col('user_type_view_cnt').alias(cnt_name),
        col('user_type_view_time').alias(time_name),
    )
    features_df = features_df.join(per_type_views, on='user_id', how='left')
    features_df.printSchema()
# Users without any matching views get nulls from the left joins above;
# treat them as zero activity.
features_df2 = features_df.fillna({
    'user_num_views': 0,
    'user_total_view_time': 0,
    'user_num_live_views': 0,
    'user_time_live_views': 0,
    'user_num_pvr_views': 0,
    'user_time_pvr_views': 0,
})
features_df2.printSchema()

# Keep only the columns used downstream; 'genres' is dropped at this point.
features_df = features_df2.select(
    'user_id', 'item_id', 'year', 'target', 'avail_start_dt', 'avail_stop_dt',
    'user_pays', 'user_cnt', 'user_pay_to_all',
    'item_pays', 'item_cnt', 'item_pay_to_all',
    'user_num_views', 'user_total_view_time',
    'user_num_live_views', 'user_time_live_views',
    'user_num_pvr_views', 'user_time_pvr_views',
)
# Print the schema of the frame that was just selected. The original printed
# features_df2 a second time, so the post-selection schema was never shown.
features_df.printSchema()
# (raw column, indexed column, one-hot vector column) for each categorical input.
categorical_specs = [
    ("year", "yearIndex", "year_vec"),
    ("avail_start_dt", "avail_start_dtIndex", "avail_start_dt_vec"),
    ("avail_stop_dt", "avail_stop_dtIndex", "avail_stop_dt_vec"),
]

# Stage 1: map each category string to a numeric index.
indexers = [
    StringIndexer(inputCol=raw_col, outputCol=index_col)
    for raw_col, index_col, vec_col in categorical_specs
]
# Stage 2: one-hot encode each index column.
encoders = [
    OneHotEncoder(inputCol=index_col, outputCol=vec_col)
    for raw_col, index_col, vec_col in categorical_specs
]

# Model inputs: every remaining column except identifiers, the label and the
# raw categorical strings, followed by the encoded vectors.
non_feature_cols = {'item_id', 'user_id', 'target', 'year', 'avail_start_dt', 'avail_stop_dt'}
features = [name for name in features_df.columns if name not in non_feature_cols]
features += ['year_vec', 'avail_start_dt_vec', 'avail_stop_dt_vec']

assembler = VectorAssembler(inputCols=features, outputCol='features')

# All indexers run first, then all encoders, then the assembler.
pipeline = Pipeline(stages=indexers + encoders + [assembler])
# Fit the feature pipeline on the combined train+test frame and apply it.
feature_model = pipeline.fit(features_df)
transf_feats = feature_model.transform(features_df)
# Mid-cell expression: its value is computed but not displayed.
transf_feats.rdd.getNumPartitions()

# Reduce to 50 partitions and keep only what training and scoring need.
transf_feats = (
    transf_feats
    .coalesce(50)
    .select('user_id', 'item_id', 'features', 'target')
)

# Rows with a label form the training set; rows without one are scored later.
train = transf_feats.filter('target is not null')
test = transf_feats.filter('target is null')

# Despite the `lr` name, the estimator is a gradient-boosted tree classifier.
lr = GBTClassifier(labelCol="target", featuresCol='features', maxIter=30)
lr_model = lr.fit(train)
# Convert an ML vector to a plain list of floats. toArray() expands both dense
# and sparse vectors to their full length; the previous `.values` holds only
# the stored entries of a sparse vector, which would make getItem(1) below
# pick the wrong element (or null).
sparse_values = udf(lambda v: v.toArray().tolist(), ArrayType(DoubleType()))

# Score the unlabeled rows and unpack the probability vector into an array.
predictions_prom_lr = lr_model.transform(test)
predictions_fin_lr = (
    predictions_prom_lr
    .select('user_id', 'item_id', 'probability')
    .withColumn("proba", sparse_values("probability"))
)
predictions_fin_lr.rdd.getNumPartitions()

# Element 1 of the probability array is reported as `purchase`. The result is
# cached because it is materialised several times (count, show, toPandas and
# the later inspection cells); without the cache each action re-runs the
# scoring and the global sort.
res = (
    predictions_fin_lr
    .select('user_id', 'item_id', F.col('proba').getItem(1).alias('purchase'))
    .orderBy(F.col('user_id').asc(), F.col('item_id').asc())
    .cache()
)
res.count()
res.show(5)
# NOTE(review): to_csv writes the pandas index as an extra leading column by
# default; pass index=False if the consumer expects only the three columns.
res.toPandas().to_csv('lab03_.csv')

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.
+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows

+--------+-------+
|purchase|  count|
+--------+-------+
|       1|  10904|
|       0|5021720|
+--------+-------+

+---------+--------+
|item_type|   count|
+---------+--------+
|     live|17704201|
|      pvr| 3141406|
+---------+--------+

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                   1941|
+-----------------------+

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- purchase: integer (nullable = 

In [2]:
# Display the first 20 scored rows (the defaults of show(), spelled out).
res.show(n=20, truncate=True)

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
|   1654|    336|0.032905070014759286|
|   1654|    678|0.032905070014759286|
|   1654|    691|0.032905070014759286|
|   1654|    696|  0.0329250952293082|
|   1654|    763|0.032905070014759286|
|   1654|    795| 0.03489766832813934|
|   1654|    861|0.032905070014759286|
|   1654|   1137|0.033118672439478525|
|   1654|   1159|  0.0329250952293082|
|   1654|   1428|0.032905070014759286|
|   1654|   1685|  0.0329250952293082|
|   1654|   1686|0.032905070014759286|
|   1654|   1704| 0.03299673230867528|
|   1654|   2093|0.032905070014759286|
|   1654|   2343|0.032905070014759286|
|   1654|   2451|0.032905070014759286|
|   1654|   2469| 0.03669501095494343|
|   1654|   2603|0.032905070014759286|
|   1654|   2609|0.032905070014759286|
|   1654|   2621|  0.0329250952293082|
+-------+-------+--------------------+
only showing top 20 rows



In [3]:
# Number of scored rows whose predicted purchase probability exceeds 0.5.
res.where(F.col("purchase") > 0.5).count()

27

In [4]:
# Number of scored rows whose predicted purchase probability is at most 0.5.
res.where(F.col("purchase") <= 0.5).count()

2156813

In [5]:
# Inspect the raw model output on the unlabeled rows (default 20 rows, truncated).
predictions_prom_lr.show(n=20, truncate=True)

+-------+-------+--------------------+------+--------------------+--------------------+----------+
|user_id|item_id|            features|target|       rawPrediction|         probability|prediction|
+-------+-------+--------------------+------+--------------------+--------------------+----------+
| 728960|  79884|(96,[0,1,2,3,4,5,...|  null|[1.69485989093793...|[0.96738170375237...|       0.0|
| 728960|  88979|(96,[0,1,2,3,4,5,...|  null|[1.69517443945017...|[0.96740154866004...|       0.0|
| 728960|  98225|(96,[0,1,2,4,6,7,...|  null|[1.69517443945017...|[0.96740154866004...|       0.0|
| 728960|   6162|(96,[0,1,2,3,4,5,...|  null|[1.69517443945017...|[0.96740154866004...|       0.0|
| 728960|  10176|(96,[0,1,2,3,4,5,...|  null|[1.69517443945017...|[0.96740154866004...|       0.0|
| 728960|  66180|(96,[0,1,2,4,6,7,...|  null|[1.69517443945017...|[0.96740154866004...|       0.0|
| 728960|  68160|(96,[0,1,2,4,6,7,...|  null|[1.69517443945017...|[0.96740154866004...|       0.0|
| 728960| 

In [6]:
# Score the labeled training rows with the fitted model; the next cell
# evaluates these predictions.
predictions_prom_lr_train = lr_model.transform(train)

In [7]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Cast the label and the predicted class to double before evaluation.
label_as_double = col("target").cast("double")
prediction_as_double = col("prediction").cast("double")
tp_double = (
    predictions_prom_lr_train
    .withColumn("target", label_as_double)
    .withColumn("prediction", prediction_as_double)
)

# Area under the ROC curve, computed from the raw prediction column.
# NOTE(review): tp_double comes from `train`, the same rows the model was
# fitted on, so this value is not a held-out estimate.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="target",
    metricName="areaUnderROC",
)
evaluator.evaluate(tp_double)

0.936081370651349

In [None]:
# spark.stop()  # uncomment and run when finished to stop the Spark session