In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "ZK ML app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
from pyspark.sql.types import *

In [4]:
! hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


In [5]:
schema_train = StructType([
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

train_df = spark.read.csv("/labs/slaba03/laba03_train.csv", schema=schema_train, header=True)

In [6]:
train_df.show(2, vertical=True)

-RECORD 0---------
 user_id  | 1654  
 item_id  | 74107 
 purchase | 0     
-RECORD 1---------
 user_id  | 1654  
 item_id  | 89249 
 purchase | 0     
only showing top 2 rows



In [25]:
train_df.rdd.getNumPartitions()

6

In [7]:
train_df = train_df.repartition(6)
train_df.rdd.getNumPartitions()

6

In [8]:
schema_items = StructType(fields=[StructField('item_id', IntegerType()), 
                                       StructField('channel_id', IntegerType()),
                                       StructField('datetime_availability_start', StringType()),
                                       StructField('datetime_availability_stop', StringType()),
                                       StructField('datetime_show_start', StringType()),
                                       StructField('datetime_show_stop', StringType()),
                                       StructField('content_type', IntegerType()),
                                       StructField('title', StringType(), nullable=True),
                                       StructField('year', FloatType(), nullable=True),
                                       StructField('genres', StringType()),
                                       StructField('region_id', IntegerType()),
                                      ])

In [9]:
items_df = spark.read \
.format('csv')\
.schema(schema_items)\
.option("header", "true")\
.option("delimiter", "\t")\
.load('/labs/slaba03/laba03_items.csv')

In [10]:
items_df.show(2,False,True)

-RECORD 0-------------------------------------------------------------------------------
 item_id                     | 65667                                                    
 channel_id                  | null                                                     
 datetime_availability_start | 1970-01-01T00:00:00Z                                     
 datetime_availability_stop  | 2018-01-01T00:00:00Z                                     
 datetime_show_start         | null                                                     
 datetime_show_stop          | null                                                     
 content_type                | 1                                                        
 title                       | на пробах только девушки (all girl auditions)            
 year                        | 2013.0                                                   
 genres                      | Эротика                                                  
 region_id           

In [11]:
from pyspark.sql.functions import col,when,count,sum
items_df_filtered = items_df.select(col("item_id"), col("content_type"), col("title"), col("year"), col("genres"))\
.na.fill({'year': -1, 'genres': 'na'})

items_df_filtered.show(2,False,True)

-RECORD 0----------------------------------------------------------------
 item_id      | 65667                                                    
 content_type | 1                                                        
 title        | на пробах только девушки (all girl auditions)            
 year         | 2013.0                                                   
 genres       | Эротика                                                  
-RECORD 1----------------------------------------------------------------
 item_id      | 65669                                                    
 content_type | 1                                                        
 title        | скуби ду: эротическая пародия (scooby doo: a xxx parody) 
 year         | 2011.0                                                   
 genres       | Эротика                                                  
only showing top 2 rows



In [12]:
schema_test = StructType([
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

test_df = spark.read.csv("/labs/slaba03/laba03_test.csv", schema=schema_test, header=True)

In [13]:
test_df.show(2,False,True)

-RECORD 0---------
 user_id  | 1654  
 item_id  | 94814 
 purchase | null  
-RECORD 1---------
 user_id  | 1654  
 item_id  | 93629 
 purchase | null  
only showing top 2 rows



In [14]:
import pyspark.sql.functions as f

user_mean = train_df.select(['user_id', 'purchase'])\
        .groupBy('user_id')\
        .agg(f.mean('purchase').alias('user_purchase_mean'))

In [15]:
user_mean.show(2,False,True)

-RECORD 0----------------------------------
 user_id            | 754230               
 user_purchase_mean | 0.027575641516660282 
-RECORD 1----------------------------------
 user_id            | 833685               
 user_purchase_mean | 0.007500986971969996 
only showing top 2 rows



In [16]:
item_mean = train_df.select(['item_id', 'purchase'])\
        .groupBy('item_id')\
        .agg(f.mean('purchase').alias('item_purchase_mean'))
item_mean.show(2,False,True)

-RECORD 0-----------------------------------
 item_id            | 93486                 
 item_purchase_mean | 0.0021413276231263384 
-RECORD 1-----------------------------------
 item_id            | 90019                 
 item_purchase_mean | 0.0022813688212927757 
only showing top 2 rows



In [17]:
train_df_pre = train_df.join(user_mean, train_df.user_id == user_mean.user_id)\
.select(train_df.user_id, train_df.item_id, train_df.purchase, user_mean.user_purchase_mean)
train_df_mean = train_df_pre.join(item_mean, train_df_pre.item_id == item_mean.item_id)\
.select(train_df_pre.user_id, train_df_pre.item_id, train_df_pre.purchase, train_df_pre.user_purchase_mean, \
        item_mean.item_purchase_mean)
train_df_mean.show(1,False,True)

-RECORD 0-----------------------------------
 user_id            | 920599                
 item_id            | 8389                  
 purchase           | 0                     
 user_purchase_mean | 0.0015564202334630351 
 item_purchase_mean | 0.005979073243647235  
only showing top 1 row



In [18]:
test_df_pre = test_df.join(user_mean, test_df.user_id == user_mean.user_id)\
.select(test_df.user_id, test_df.item_id, test_df.purchase, user_mean.user_purchase_mean)
test_df_mean = test_df_pre.join(item_mean, test_df_pre.item_id == item_mean.item_id)\
.select(test_df_pre.user_id, test_df_pre.item_id, test_df_pre.purchase, test_df_pre.user_purchase_mean, item_mean.item_purchase_mean)
test_df_mean.show(1,False,True)

-RECORD 0----------------------------------
 user_id            | 761341               
 item_id            | 8389                 
 purchase           | null                 
 user_purchase_mean | 3.875968992248062E-4 
 item_purchase_mean | 0.005979073243647235 
only showing top 1 row



In [19]:
train_df_items = train_df_mean.join(items_df_filtered, train_df_mean.item_id == items_df_filtered.item_id, how="left")\
.select(train_df_mean.user_id, train_df_mean.item_id, train_df_mean.purchase, train_df_mean.user_purchase_mean, \
        train_df_mean.item_purchase_mean \
       , items_df_filtered.content_type, items_df_filtered.title, items_df_filtered.year, items_df_filtered.genres)
train_df_items.show(1,False,True)

-RECORD 0-----------------------------------------------------
 user_id            | 754230                                  
 item_id            | 8389                                    
 purchase           | 0                                       
 user_purchase_mean | 0.027575641516660282                    
 item_purchase_mean | 0.005979073243647235                    
 content_type       | 1                                       
 title              | пес в сапогах (сурдоперевод)            
 year               | 1981.0                                  
 genres             | Мультфильмы,Детские,Союзмультфильм,Наши 
only showing top 1 row



In [62]:
train_df_items.count()

5032624

In [20]:
test_df_items = test_df_mean.join(items_df_filtered, test_df_mean.item_id == items_df_filtered.item_id, how="left")\
.select(test_df_mean.user_id, test_df_mean.item_id, test_df_mean.purchase, test_df_mean.user_purchase_mean \
        , test_df_mean.item_purchase_mean \
       , items_df_filtered.content_type, items_df_filtered.title, items_df_filtered.year, items_df_filtered.genres)
test_df_items.show(1,False,True)

-RECORD 0-----------------------------------------------------
 user_id            | 814235                                  
 item_id            | 8389                                    
 purchase           | null                                    
 user_purchase_mean | 7.78816199376947E-4                     
 item_purchase_mean | 0.005979073243647235                    
 content_type       | 1                                       
 title              | пес в сапогах (сурдоперевод)            
 year               | 1981.0                                  
 genres             | Мультфильмы,Детские,Союзмультфильм,Наши 
only showing top 1 row



In [19]:
from pyspark.ml import Pipeline, Transformer, Estimator

cv = CountVectorizer(inputCol="genres_array", outputCol="genres_vector")

train_model = cv.fit(train_df_items_v)
train_result = train_model.transform(train_df_items_v)
train_result.show(1,False,True)

regression

In [21]:
from pyspark.ml.feature import Tokenizer, VectorAssembler, CountVectorizer

In [22]:
tokenizer = Tokenizer(inputCol="genres", outputCol="genres_words")

In [23]:
Count_Vectorizer = CountVectorizer(inputCol="genres_words", outputCol="features")

In [24]:
assembler = VectorAssembler(inputCols=["features", "user_purchase_mean", "item_purchase_mean", "content_type"], \
                            outputCol="features_fin")

In [25]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol=assembler.getOutputCol(), labelCol="purchase", maxIter=15)

In [26]:
from pyspark.ml import Pipeline

pipeline2 = Pipeline(stages=[
    tokenizer,
    Count_Vectorizer,
    assembler,
    lr
])

In [27]:
pipeline_model = pipeline2.fit(train_df_items)

In [28]:
pipeline_model

PipelineModel_fd695dcb68a8

In [29]:
predictions2 = pipeline_model.transform(test_df_items)

In [30]:
predictions2.show(1,False,True)

-RECORD 0--------------------------------------------------------------------------------------------
 user_id            | 822709                                                                         
 item_id            | 8389                                                                           
 purchase           | null                                                                           
 user_purchase_mean | 3.7893141341417203E-4                                                          
 item_purchase_mean | 0.005979073243647235                                                           
 content_type       | 1                                                                              
 title              | пес в сапогах (сурдоперевод)                                                   
 year               | 1981.0                                                                         
 genres             | Мультфильмы,Детские,Союзмультфильм,Наши                     

In [31]:
from pyspark.sql.functions import udf
split1_udf = udf(lambda value: value[0].item(), DoubleType())
split2_udf = udf(lambda value: value[1].item(), DoubleType())
output2 = predictions2.select("user_id","item_id", "prediction", "probability", split1_udf('probability').alias('c1'), split2_udf('probability').alias('c2'))
output2.show(1,False,True)

-RECORD 0------------------------------------------------
 user_id     | 846231                                    
 item_id     | 8389                                      
 prediction  | 0.0                                       
 probability | [0.9983060094178553,0.001693990582144607] 
 c1          | 0.9983060094178553                        
 c2          | 0.001693990582144607                      
only showing top 1 row



In [32]:
lab_df = output2.select("user_id","item_id",col("c2").alias('purchase'))\
.orderBy('user_id','item_id')
lab_df.show(10,truncate=False)

+-------+-------+---------------------+
|user_id|item_id|purchase             |
+-------+-------+---------------------+
|1654   |336    |2.6411577989123003E-4|
|1654   |678    |5.756595342452835E-4 |
|1654   |691    |0.0010737500270940703|
|1654   |696    |0.0011576214649155882|
|1654   |763    |0.0014251116907521974|
|1654   |795    |0.0054233830496116945|
|1654   |861    |7.372708443897341E-4 |
|1654   |1137   |0.0018439466283109627|
|1654   |1159   |0.0015648764454790005|
|1654   |1428   |0.0010289588725163864|
+-------+-------+---------------------+
only showing top 10 rows



In [35]:
lab_df.toPandas().to_csv('lab03.csv')

In [33]:
spark.stop()