In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 2g --executor-cores 1 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.feature import HashingTF, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.param.shared import HasOutputCol, HasInputCol
from pyspark import keyword_only

from pyspark.sql.types import LongType, StringType, StructType, StructField, IntegerType, FloatType, ArrayType, LongType
import pyspark.sql.functions as f

conf = SparkConf()
conf.set("spark.app.name", "lab4") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
train_path = "/labs/slaba03/laba03_train.csv"
test_path = "/labs/slaba03/laba03_test.csv"

schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

In [4]:
items_path = "/labs/slaba03/laba03_items.csv"

items_schema = StructType(fields=[
    StructField("item_id", IntegerType()),
    StructField("channel_id", FloatType()),
    StructField("datetime_availability_start", StringType()),
    StructField("datetime_availability_stop", StringType()),    
    StructField("datetime_show_start", StringType()),
    StructField("datetime_show_stop", StringType()),
    StructField("content_type", IntegerType()),
    StructField("title", StringType()),
    StructField("year", FloatType()),
    StructField("genres", StringType()),   
    StructField("region_id", FloatType())
])

In [5]:
views_path = "/labs/slaba03/laba03_views_programmes.csv"

views_schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("ts_start", IntegerType()),
    StructField("ts_end", IntegerType()),
    StructField("item_type", StringType())
])

In [6]:
df_train = spark.read.csv(train_path, schema=schema, header=True)
df_test = spark.read.csv(test_path, schema=schema, header=True)
df_items = spark.read.csv(items_path, schema=items_schema, header=True, sep="\t")
df_views = spark.read.csv(views_path, schema=views_schema, header=True)

In [7]:
df_items = df_items.withColumn("duration", f.col("datetime_show_stop") - f.col("datetime_show_start"))

In [9]:
df_train = df_train.join(df_items, on="item_id", how="left")

In [10]:
films_info = df_train.groupby("item_id").agg(f.sum("purchase").alias("buy_cnt"),
                               f.count("purchase").alias("total")) \
            .withColumn("conversion", f.col("buy_cnt") / f.col("total"))

In [12]:
df_train = df_train.withColumn("genres_array", f.when(f.col("genres").isNotNull(), f.split("genres", ",")).otherwise(f.array([]))) \
    .withColumn("firsts_genres", f.array([f.col("genres_array")[0], f.col("genres_array")[1], f.col("genres_array")[2]])) 

In [13]:
## users

In [14]:
user_info = df_views \
    .withColumn("is_live", f.when(f.col("item_type") == "live", 1).otherwise(0)) \
    .withColumn("watch_time", f.col("ts_end") - f.col("ts_start")) \
    .groupby("user_id").agg(f.count("user_id").alias("total_views"),
                            f.max("watch_time").alias("max_watch_time"),
                            f.min("watch_time").alias("min_watch_time"),
                            f.mean("watch_time").alias("mean_time"),
                            f.sum("is_live").alias("cnt_is_live")) \
    .withColumn("live_ratio", f.col("cnt_is_live") / f.col("total_views"))

In [15]:
user_buy_rate = df_train.select("user_id", "purchase").groupby("user_id").agg(f.sum("purchase").alias("total_purchases"),
                                                              f.count("user_id").alias("total_offers")) \
        .withColumn("buy_ratio", f.col("total_purchases") / f.col("total_offers"))

In [16]:
df_train = df_train.join(user_buy_rate, on="user_id", how="left")

In [17]:
df_train = df_train.join(user_info, on="user_id", how="left")

In [18]:
df_train = df_train.join(films_info, on="item_id", how="left")

In [19]:
df_train = df_train.fillna({"year": 2005.0,
                            "region_id": 0,
                            "content_type": 0,
                            "total_views": 0,
                            "max_watch_time": 0,
                            "min_watch_time": 0,
                            "mean_time": 0,
                            "cnt_is_live": 0,
                            "live_ratio": 0})

In [20]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="firsts_genres", outputCol="cv_genres")
model = cv.fit(df_train)


In [23]:
df_train = model.transform(df_train)

In [21]:
# df_train = df_train.fillna({"primary_genre": "Не известно",
#                             "secondary_genre": "Не известно",
#                             "duration": 0, 
#                             "year": 0,
#                             "region_id": 0})

In [22]:
# primary = StringIndexer(inputCol="primary_genre", outputCol="primary_genre_enc", handleInvalid="skip")
# secondary = StringIndexer(inputCol="secondary_genre", outputCol="secondary_genre_enc", handleInvalid="skip")

In [23]:
# df_train = primary.fit(df_train).transform(df_train)
# df_train = secondary.fit(df_train).transform(df_train), "cv_genres" , handleInvalid="skip"

In [24]:
# assembler = VectorAssembler(inputCols=["buy_ratio", "total_purchases", "total_offers", "content_type",
#                                        "total_views", "max_watch_time", "min_watch_time", "mean_time", "cnt_is_live", "live_ratio", 
#                                        "year", "region_id", "conversion",
#                                        "total", "buy_cnt", "cv_genres"], outputCol="features")

In [26]:
assembler = VectorAssembler(inputCols=["buy_ratio", "total_purchases", "total_offers", "content_type",
                                       "total_views", "max_watch_time", "min_watch_time", "mean_time", "cnt_is_live", "live_ratio",
                                       ], outputCol="features")

In [27]:
# df_train = assembler.transform(df_train)
df_train1 = assembler.transform(df_train)

In [28]:
# train = df_train.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=5757)
# test = df_train.join(train, on=["user_id", "item_id"], how="leftanti")

In [29]:
train = df_train1.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=5757)
test = df_train1.join(train, on=["user_id", "item_id"], how="leftanti")

In [30]:
#

In [31]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName='areaUnderROC')

gbt = GBTClassifier(featuresCol="features", labelCol="purchase")

In [32]:
pipeline = Pipeline(stages=[
    gbt
])

In [33]:
pipeline_model = pipeline.fit(train)

In [34]:
predictions = pipeline_model.transform(test)

In [35]:
evaluator.evaluate(predictions)

0.8511177007880439

In [36]:
v_udf = f.udf(lambda x: float(x[1]), FloatType())

In [37]:
predictions.select("user_id", "item_id", v_udf("probability").alias("purchase")).orderBy("user_id", "item_id").show(5)

+-------+-------+-----------+
|user_id|item_id|   purchase|
+-------+-------+-----------+
|   1654|    540|0.044613488|
|   1654|    546|0.044613488|
|   1654|   1125|0.044613488|
|   1654|   1131|0.044613488|
|   1654|   1320|0.044613488|
+-------+-------+-----------+
only showing top 5 rows



In [38]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# gbparamGrid = ParamGridBuilder() \
#              .addGrid(gbt.maxDepth, [5, 10]) \
#              .build()

# crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=gbparamGrid,
#                               evaluator=evaluator, numFolds=3, parallelism=5)

# cv_model = crossval.fit(train)

# cv_model.bestModel


# predictions = cv_model.transform(test)
# evaluator.evaluate(predictions)

In [39]:
df_test = spark.read.csv(test_path, schema=schema, header=True)

In [40]:
df_test.count()

2156840

In [41]:
df_test = df_test.join(df_items, on="item_id", how="left")

In [42]:
df_test = df_test.join(user_buy_rate, on="user_id", how="left")

In [43]:
df_test = df_test.join(user_info, on="user_id", how="left")

In [44]:
df_test = df_test.join(films_info, on="item_id", how="left")

In [45]:
df_test = df_test.withColumn("genres_array", f.when(f.col("genres").isNotNull(), f.split("genres", ",")).otherwise(f.array([]))) \
    .withColumn("firsts_genres", f.array([f.col("genres_array")[0], f.col("genres_array")[1], f.col("genres_array")[2]])) 

In [46]:
df_test = df_test.fillna({"year": 2005.0,
                            "region_id": 0,
                            "content_type": 0,
                            "total_views": 0,
                            "max_watch_time": 0,
                            "min_watch_time": 0,
                            "mean_time": 0,
                            "cnt_is_live": 0,
                            "live_ratio": 0})

In [47]:
df_test = model.transform(df_test)

In [49]:
df_test = assembler.transform(df_test)

In [52]:
result = pipeline_model.transform(df_test)

In [53]:
r = result.select("user_id", "item_id", v_udf("probability").alias("purchase")).orderBy("user_id", "item_id")

In [55]:
r.count()

2156840

In [83]:
r.toPandas().to_csv("lab03.csv", index=False)

In [84]:
!hdfs dfs -rm lab03.csv /user/alexander.zhukov/

22/10/31 14:06:22 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/alexander.zhukov/lab03.csv' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/alexander.zhukov/.Trash/Current/user/alexander.zhukov/lab03.csv1667214382534
rm: `/user/alexander.zhukov': Is a directory


In [85]:
!hdfs dfs -put lab03.csv /user/alexander.zhukov/

In [86]:
spark.stop()