In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 5g --driver-memory 2g pyspark-shell'
 
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
    
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "GP Lab03") 
conf.set("spark.sql.crossJoin.enabled", "True") 

spark = SparkSession.builder.config(conf=conf).appName("GP Lab03").getOrCreate()

In [3]:
spark

In [4]:
sc = spark.sparkContext

In [5]:
import re
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.types import ArrayType, FloatType, StringType
from pyspark.ml.feature import HashingTF, IDF
import pyspark.sql.functions as f

In [6]:
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [7]:
# Читаем данные
# items = spark.read.csv('/labs/slaba03/laba03_items.csv', header=True)
# test = spark.read.csv('/labs/slaba03/laba03_test.csv')
# train = spark.read.csv('/labs/slaba03/laba03_train.csv')
# views_programmes = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv')

In [8]:
items_schema = StructType(fields = [
    StructField('item_id', IntegerType()),
    StructField('channel_id', IntegerType()),
    StructField('datetime_availability_start', StringType()),                                    
    StructField('datetime_availability_stop', StringType()), 
    StructField('datetime_show_start', StringType()),
    StructField('datetime_show_stop', StringType()), 
    StructField('content_type', IntegerType()),                                   
    StructField('title', StringType(), nullable=True),  
    StructField('year', FloatType(), nullable=True), 
    StructField('genres', StringType()), 
    StructField('region_id', IntegerType(), nullable=True),                    
]) 

# items = spark.read.csv('/labs/slaba03/laba03_items.csv', header=True, schema=items_schema) #

df_items = spark.read.format("csv") \
    .option("header", True) \
    .option("sep", "\t") \
    .schema(items_schema) \
    .load("/labs/slaba03/laba03_items.csv")

In [9]:
df_items = df_items.filter(df_items.genres.isNotNull())

In [10]:
df_items.show(1, vertical=True)

-RECORD 0-------------------------------------------
 item_id                     | 65667                
 channel_id                  | null                 
 datetime_availability_start | 1970-01-01T00:00:00Z 
 datetime_availability_stop  | 2018-01-01T00:00:00Z 
 datetime_show_start         | null                 
 datetime_show_stop          | null                 
 content_type                | 1                    
 title                       | на пробах только ... 
 year                        | 2013.0               
 genres                      | Эротика              
 region_id                   | null                 
only showing top 1 row



In [11]:
schema = StructType() \
    .add("user_id", IntegerType(), True) \
    .add("item_id", IntegerType(), True) \
    .add("purchase", IntegerType(), True) \

df_user_trian = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema) \
    .load("/labs/slaba03/laba03_train.csv")

In [12]:
schema_test = StructType() \
    .add("user_id", IntegerType(), True) \
    .add("item_id", IntegerType(), True) \

df_user_test = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema_test) \
    .load("/labs/slaba03/laba03_test.csv")

In [13]:
views_schema = StructType(fields = [StructField('user_id', IntegerType()),
StructField('item_id', IntegerType()),
StructField('ts_start', IntegerType()),
StructField('ts_end', IntegerType()),
StructField('item_type', StringType()),
])

df_views_programmes = spark.read.format("csv") \
    .option("header", True) \
    .schema(views_schema) \
    .load("/labs/slaba03/laba03_views_programmes.csv")

In [14]:
df_user_trian.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [15]:
df_user_trian.createOrReplaceTempView("df_user_trian")

In [16]:
querry = """ 
select 
a.purchase, count(*) as cnt
from df_user_trian a 
group by a.purchase
"""

In [17]:
spark.sql(querry).show()

+--------+-------+
|purchase|    cnt|
+--------+-------+
|       1|  10904|
|       0|5021720|
+--------+-------+



In [18]:
df_user_trian.count()

5032624

In [14]:
train_w_feats = df_user_trian.join(df_items, on="item_id", how="left")

In [15]:
train_w_feats = train_w_feats.filter("content_type == 1")

In [21]:
train_w_feats.createOrReplaceTempView("train_w_feats")

In [22]:
querry = """ 
select 
a.purchase, a.content_type, count(*) as cnt
from train_w_feats a 
group by a.purchase, a.content_type
"""

In [23]:
spark.sql(querry).show()

+--------+------------+-------+
|purchase|content_type|    cnt|
+--------+------------+-------+
|       1|           1|  10576|
|       0|           1|4977299|
+--------+------------+-------+



In [24]:
# train_w_feats[['genres']].map(lambda x: x.split(','))

In [25]:
train_w_feats.select(f.split("genres", ",").alias("genres_list")).count()

4987875

In [16]:
train_w_feats = train_w_feats.withColumn("genres_list", f.split("genres", ",").alias("genres_list"))

In [17]:
item_purchase_df = train_w_feats.select("title", 'purchase').groupBy("title").agg(
    f.sum("purchase").alias("sum_item_purchase")).orderBy("sum_item_purchase", ascending=False)


user_purchase_df = train_w_feats.select("user_id", 'purchase').groupBy("user_id").agg(
    f.sum("purchase").alias("sum_user_purchase")).orderBy("sum_user_purchase", ascending=False)

In [18]:
train_w_feats = train_w_feats.join(item_purchase_df, how='left', on="title")
train_w_feats = train_w_feats.join(user_purchase_df, how='left', on="user_id")

In [19]:
favor_genres_by_user = train_w_feats.select("user_id", "purchase", "genres_list")
favor_genres_by_user = favor_genres_by_user.filter("purchase = 1")

In [20]:
favor_genres_by_user = favor_genres_by_user.select(
    "user_id", f.explode("genres_list").alias("genres_of_purchase")).groupBy(
    "user_id").pivot("genres_of_purchase").agg(f.count('*')).fillna(0)

In [21]:
from pyspark.ml.feature import VectorAssembler

In [22]:
from pyspark.sql.window import Window
from pyspark.ml.linalg import DenseVector, SparseVector
w = Window.partitionBy('user_id')

In [23]:
favor_genres_by_user = train_w_feats.filter("purchase = 1").select("user_id", "genres_list").select(
    "user_id", f.explode("genres_list").alias("genres_of_purchase")).groupBy(
    'user_id').agg(f.collect_list('genres_of_purchase').alias('all_genres_of_purchase'))

In [24]:
from pyspark.ml.feature import CountVectorizer

In [25]:
cv = CountVectorizer(inputCol="all_genres_of_purchase", outputCol="favoriete_genres_vector", vocabSize=80, minDF=1.0)

In [26]:
model = cv.fit(favor_genres_by_user)

In [27]:
result = model.transform(favor_genres_by_user)
# result.show(truncate=False)

In [38]:
# import pyspark.sql.types as T

# to_array = f.udf(lambda v: v.toArray().tolist(), T.ArrayType(T.FloatType()))
# df = df.withColumn('features', to_array('features'))

In [28]:
items_fpr_vector = df_items.filter("content_type = 1").select(
    "item_id", f.split("genres", ",").alias("all_genres_of_purchase"))

In [29]:
item_result = model.transform(items_fpr_vector)

item_result = item_result.withColumnRenamed("all_genres_of_purchase","film_genre")\
.withColumnRenamed("favoriete_genres_vector","film_genre_vector")

# item_result.show(truncate=False)

In [30]:
train_w_feats_s1 = train_w_feats.join(result, how="left", on='user_id')
train_w_feats_s2 = train_w_feats_s1.join(item_result, how="left", on='item_id')

In [31]:
# функция косинусного расстояния
@f.udf(returnType=DoubleType())
def vector_dot_val(v1,v2):
    try:
        v1_sparse = DenseVector(v1)
        v2_sparse = DenseVector(v2)
        return float(v1_sparse.dot(v2_sparse))
    except:
        return 0

In [32]:
train_w_feats_s2 = train_w_feats_s2.withColumn('vector_dot_val', vector_dot_val(train_w_feats_s2["favoriete_genres_vector"], 
                                                                          train_w_feats_s2["film_genre_vector"]))

In [33]:
train_w_feats_s2.count()

4987875

In [45]:
train_w_feats_s2.createOrReplaceTempView("train_w_feats_s2")

In [49]:
train_w_feats_s2.columns

['item_id',
 'user_id',
 'title',
 'purchase',
 'channel_id',
 'datetime_availability_start',
 'datetime_availability_stop',
 'datetime_show_start',
 'datetime_show_stop',
 'content_type',
 'year',
 'genres',
 'region_id',
 'genres_list',
 'sum_item_purchase',
 'sum_user_purchase',
 'all_genres_of_purchase',
 'favoriete_genres_vector',
 'film_genre',
 'film_genre_vector',
 'vector_dot_val']

In [50]:
querry = """ 
select 
a.user_id, a.item_id,
a.sum_item_purchase, a.sum_user_purchase, a.vector_dot_val
from train_w_feats_s2 a
limit 15
"""

In [55]:
# spark.sql(querry).show()

In [34]:
train_w_feats_s3 = train_w_feats_s2[['user_id', 'item_id', 'sum_item_purchase', 'sum_user_purchase', 'vector_dot_val', 
                                    'purchase']]

In [35]:
assembler = VectorAssembler(inputCols=["sum_item_purchase", "sum_user_purchase", "vector_dot_val"],
                            outputCol="features", handleInvalid='skip')

In [36]:
train_w_feats_s4 = assembler.transform(train_w_feats_s3)

In [37]:
train_w_feats_s4

DataFrame[user_id: int, item_id: int, sum_item_purchase: bigint, sum_user_purchase: bigint, vector_dot_val: double, purchase: int, features: vector]

In [38]:
from pyspark.ml.classification import LogisticRegression

In [39]:
# assembler = VectorAssembler(inputCols=["sum_item_purchase", "sum_user_purchase", "vector_dot_val"],
#                             outputCol="features")

# train_w_feats_s3 = assembler.transform(train_w_feats_s2)

In [40]:
lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=15, regParam=0.01)

In [41]:
lr_model = lr.fit(train_w_feats_s4)

In [77]:
predictions = lr_model.transform(train_w_feats_s4)

In [78]:
predictions.select("item_id", "user_id", "purchase", "prediction", "probability", "rawPrediction").show(5, False, True)

-RECORD 0---------------------------------------------------
 item_id       | 100402                                     
 user_id       | 754230                                     
 purchase      | 0                                          
 prediction    | 0.0                                        
 probability   | [0.9924295884413139,0.0075704115586860184] 
 rawPrediction | [4.875908633371725,-4.875908633371725]     
-RECORD 1---------------------------------------------------
 item_id       | 93961                                      
 user_id       | 754230                                     
 purchase      | 0                                          
 prediction    | 0.0                                        
 probability   | [0.9939779621991357,0.006022037800864319]  
 rawPrediction | [5.10628932841722,-5.10628932841722]       
-RECORD 2---------------------------------------------------
 item_id       | 90009                                      
 user_id       | 754230 

In [79]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [82]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="purchase", metricName='areaUnderROC')

In [83]:
evaluator.evaluate(predictions)

0.9166672945571924

# Скоринг тестовой выборки 

In [42]:
test_w_feats = df_user_test.join(df_items, on="item_id", how="left")

In [43]:
test_w_feats = test_w_feats.filter("content_type == 1")

In [44]:
test_w_feats.count()

2137536

In [45]:
test_w_feats = test_w_feats.withColumn("genres_list", f.split("genres", ",").alias("genres_list"))

In [46]:
test_w_feats = test_w_feats.join(item_purchase_df, how='left', on="title")
test_w_feats = test_w_feats.join(user_purchase_df, how='left', on="user_id")

In [47]:
test_w_feats_s1 = test_w_feats.join(result, how="left", on='user_id')
test_w_feats_s2 = test_w_feats_s1.join(item_result, how="left", on='item_id')

In [48]:
test_w_feats_s2.count()

2137536

In [98]:
# from pyspark.sql.functions import isnan, when, count, col
# test_w_feats_s2.select([count(when(isnan(c), c)).alias(c) for c in test_w_feats_s2.columns]).show()

In [100]:
# test_w_feats_s2.show(5, vertical=True)

In [49]:
test_w_feats_s2 = test_w_feats_s2.withColumn('vector_dot_val', vector_dot_val(train_w_feats_s2["favoriete_genres_vector"], 
                                                                          train_w_feats_s2["film_genre_vector"]))

In [50]:
test_w_feats_s3 = test_w_feats_s2[['user_id', 'item_id', 'sum_item_purchase', 'sum_user_purchase', 'vector_dot_val']].cache()

In [51]:
test_w_feats_s4 = assembler.transform(test_w_feats_s3)

In [52]:
test_predictions = lr_model.transform(test_w_feats_s4)

In [53]:
firstelement = f.udf(lambda v:float(v[1]),FloatType())

In [54]:
res = test_predictions.select("user_id", "item_id", firstelement("probability").alias("purchase"))

In [55]:
res1 = res.orderBy("user_id", "item_id")

In [56]:
res1.show(5)

+-------+-------+------------+
|user_id|item_id|    purchase|
+-------+-------+------------+
|   1654|    336|0.0019944052|
|   1654|    678|0.0019944052|
|   1654|    691|0.0019848566|
|   1654|    696|0.0020420174|
|   1654|    763|0.0020612953|
+-------+-------+------------+
only showing top 5 rows



In [125]:
res1.repartition(1).write.option("header",True).csv('/user/georgiy.krupenchenkov/lab03.csv', mode="overwrite")

In [112]:
!pwd

/data/home/georgiy.krupenchenkov


In [126]:
!hdfs dfs -get /user/georgiy.krupenchenkov/lab03.csv

In [128]:
!mv lab03.csv/part-00000-6bac2eed-c058-40e7-93c9-1f86efb25d8d-c000.csv lab03_fin.csv

In [115]:
import gc
gc.collect()

1836

In [150]:
spark.stop()