In [None]:
import os
import sys

# Cluster-specific settings, exported before the PySpark shell script below
# is executed.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 3g --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute pyspark/shell.py in this namespace. The file is read inside a
# `with` block so the handle is closed deterministically, and the source is
# compiled with its real path so tracebacks point at shell.py, not "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Application name is set both on the conf object and on the builder.
conf = SparkConf().set("spark.app.name", "Bulatov Nikolai ML lab3")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Bulatov Nikolai ML lab3")
    .getOrCreate()
)

In [None]:
# Display the active SparkSession object.
spark

In [None]:
# List the contents of the lab directory on HDFS.
! hdfs dfs -ls /labs/slaba03/

In [None]:
# Print the first five lines of the views file.
! hdfs dfs -cat /labs/slaba03/laba03_views_programmes.csv | head -n 5

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, FloatType, ArrayType

# Explicit column layout of the item catalogue, so no schema inference is needed.
item_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("item_id", IntegerType()),
        ("channel_id", IntegerType()),
        ("datetime_availability_start", TimestampType()),
        ("datetime_availability_stop", TimestampType()),
        ("datetime_show_start", TimestampType()),
        ("datetime_show_stop", TimestampType()),
        ("content_type", IntegerType()),
        ("title", StringType()),
        ("year", FloatType()),
        ("genres", StringType()),
        ("region_id", IntegerType()),
    )
])

# The catalogue is a tab-separated file with a header row.
item_df = (
    spark.read
    .format("csv")
    .schema(item_schema)
    .options(header="true", sep="\t")
    .load("/labs/slaba03/laba03_items.csv")
)

In [None]:
# Preview the first ten catalogue rows.
item_df.show(n=10)

In [None]:
# Ten most frequent genre strings among items with content_type = 1.
(
    item_df
    .where("content_type = 1")
    .groupBy("genres")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

In [None]:
# Look up a single catalogue entry by its id.
item_df.where("item_id = 7101053").show(n=10)

In [None]:
# Column layout of the programme-views log.
views_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("user_id", IntegerType()),
        ("item_id", IntegerType()),
        ("ts_start", IntegerType()),
        ("ts_end", IntegerType()),
        ("item_type", StringType()),
    )
])

# The views log is a comma-separated file with a header row.
views_df = (
    spark.read
    .format("csv")
    .schema(views_schema)
    .options(header="true", sep=",")
    .load("/labs/slaba03/laba03_views_programmes.csv")
)

In [None]:
# Preview the first five view records.
views_df.show(n=5)

In [None]:
# Number of view records per item_type, most frequent first.
(
    views_df
    .groupBy("item_type")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

In [None]:

import pyspark.sql.functions as f

In [None]:
# Attach catalogue attributes to every view record that has a matching item.
view_agg_df = views_df.join(item_df, on="item_id", how="inner")

In [None]:
# Preview the joined views/items frame.
view_agg_df.show(n=10)

In [None]:
# Layout of the (user, item, purchase) files; this schema is used here to read
# the training file.
test_schema = StructType([
    StructField(field_name, IntegerType())
    for field_name in ("user_id", "item_id", "purchase")
])

train_df = (
    spark.read
    .format("csv")
    .schema(test_schema)
    .options(header="true", sep=",")
    .load("/labs/slaba03/laba03_train.csv")
)

train_df.show(n=10)

In [None]:
from pyspark.sql.window import Window
# Per-item feature table: cleaned title, space-separated genre tokens, a
# release-year bucket and purchase statistics taken from the training set.
item_agg_df = (
    item_df.select(
        "item_id",
        "year",
        # Keep only letters, digits and whitespace in the title.
        f.regexp_replace("title", r"[^\pL0-9\p{Space}]", "").alias("title"),
        # Strip punctuation (commas are kept), glue the words of a genre with
        # "_" and then turn the comma separators into spaces, so that
        # splitting on spaces yields one token per genre.
        f.regexp_replace(
            f.regexp_replace(
                f.regexp_replace(f.trim(f.col("genres")), r"[^\pL0-9,\p{Space}]", ""),
                " ", "_"),
            ",", " ").alias("genres"),
    )
    # Bucket the release year; years from 2015 on keep their own value.
    .withColumn(
        "item_year_group",
        f.when(f.col("year") < f.lit(1970), f.lit("1960"))
         .when(f.col("year") < f.lit(1980), f.lit("1970"))
         .when(f.col("year") < f.lit(1990), f.lit("1980"))
         .when(f.col("year") < f.lit(2000), f.lit("1990"))
         .when(f.col("year") < f.lit(2005), f.lit("2000"))
         .when(f.col("year") < f.lit(2010), f.lit("2005"))
         .when(f.col("year") < f.lit(2015), f.lit("2010"))
         .otherwise(f.col("year").cast("int").cast("string")),
    )
    .join(train_df.filter("purchase = 1"), "item_id", "left")
    .groupBy("item_id", "year", "title", "genres", "item_year_group")
    # Count only rows that matched a purchase. After the LEFT join an item
    # without purchases still contributes one row (with NULL `purchase`), so
    # counting a literal would report 1 for it instead of 0.
    .agg(f.count("purchase").alias("item_pay_cnt"))
    # The window spans the whole aggregated frame, so the denominator is the
    # number of rows in it (one row per group).
    .withColumn(
        "item_pay_rate",
        f.col("item_pay_cnt")
        / f.count(f.lit(1)).over(
            Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
        ),
    )
    .fillna( { "genres":"n/a",
               "item_year_group":0,
               "year":0,
               "item_pay_cnt":0,
               "item_pay_rate":0.0} )
    .cache()
)
item_agg_df.show(10)

In [None]:
# Inspect the computed per-item pay rate.
item_agg_df.select("item_pay_rate").show(n=10)

In [None]:
# Purchases per buying user, plus that count divided by the number of rows in
# the aggregated frame (the window spans the whole frame).
user_agg_df = (
    train_df
    .where("purchase = 1")
    .groupBy("user_id")
    .agg(f.sum("purchase").alias("user_buy_cnt"))
    .withColumn(
        "user_buy_rate",
        f.col("user_buy_cnt")
        / f.count(f.lit(1)).over(Window.rowsBetween(-sys.maxsize, sys.maxsize)),
    )
)

user_agg_df.show(n=10)

In [None]:
from pyspark.sql.window import Window

# Purchases per (user, year bucket), then a per-user ranking of the buckets by
# that count.
year_window = Window.partitionBy("user_id", "item_year_group")
year_rnk_window = Window.partitionBy("user_id").orderBy(f.desc("year_cnt"))

# For every buying user collect the year buckets ranked 1-3 (dense_rank, so
# ties share a rank).
user_year_agg_df = (
    train_df
    .where("purchase = 1")
    .join(item_agg_df, on="item_id", how="inner")
    .withColumn("year_cnt", f.count(f.lit(1)).over(year_window))
    .withColumn("year_rank", f.dense_rank().over(year_rnk_window))
    .where("year_rank <= 3")
    .select("user_id", "item_year_group")
    .distinct()
    .groupBy("user_id")
    .agg(f.collect_list("item_year_group").alias("top_3_year_group"))
    .cache()
)

user_year_agg_df.show(n=10)

In [None]:
from pyspark.sql.window import Window

# Per-user ranking of genres by the summed `purchase` flag.
genre_rnk_window = Window.partitionBy("user_id").orderBy(f.desc("count"))

# For every user collect the genres ranked 1-3 (dense_rank, so ties share a rank).
# NOTE(review): unlike the year ranking, this one starts from all train rows,
# not only from rows with purchase = 1 - confirm that this is intended.
user_genre_agg_df = (
    train_df
    .join(item_agg_df, on="item_id", how="inner")
    .select(
        "user_id",
        "purchase",
        f.explode(f.split("genres"," ")).alias("genre_item"),
    )
    .groupBy("user_id", "genre_item")
    .agg(f.sum("purchase").alias("count"))
    .withColumn("genre_rank", f.dense_rank().over(genre_rnk_window))
    .where("genre_rank <= 3")
    .select("user_id", "genre_item")
    .distinct()
    .groupBy("user_id")
    .agg(f.collect_list("genre_item").alias("top_3_genre_group"))
    .cache()
)

user_genre_agg_df.show(n=10)

In [None]:
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, StringIndexer, OneHotEncoder, HashingTF, IDF, VectorAssembler
from pyspark.ml.param.shared import HasOutputCol, HasInputCol
from pyspark import keyword_only

In [None]:
class GetItemInfoTransformer(Transformer):
    """Pipeline stage that attaches the columns of ``item_agg_df`` to a dataset.

    ``inputCol`` is accepted by the constructor but is not used.
    """

    @keyword_only
    def __init__(self, inputCol=None):
        super(GetItemInfoTransformer, self).__init__()

    def _transform(self, dataset):
        """Left-join the module-level ``item_agg_df`` onto ``dataset`` by ``item_id``."""
        enriched = dataset.join(item_agg_df, on="item_id", how="left")
        return enriched

In [None]:
# Join in the per-item columns first; every later stage reads from them.
item_transformer = GetItemInfoTransformer()

# Year bucket -> index -> one-hot vector.
year_group_indexer = StringIndexer(inputCol="item_year_group", outputCol="yearGIndex")
year_group_vector = OneHotEncoder(inputCol="yearGIndex", outputCol="year_group_vec")

# Genres -> tokens -> binary bag-of-genres vector.
genre_tokenizer = Tokenizer(inputCol="genres", outputCol="genres_tok")
genre_converter = CountVectorizer(inputCol="genres_tok", outputCol="genres_vec", binary=True)

# Title -> tokens -> stop-word removal (Russian + English) -> hashed TF -> IDF.
title_tokenizer = Tokenizer(inputCol="title", outputCol="title_tok")
stop_words = [
    word
    for language in ("russian", "english")
    for word in StopWordsRemover.loadDefaultStopWords(language)
]
title_swr = StopWordsRemover(
    inputCol=title_tokenizer.getOutputCol(),
    outputCol="title_swr",
    stopWords=stop_words,
)
title_htf = HashingTF(inputCol="title_swr", outputCol="title_tf", numFeatures=1000)
title_idf = IDF(inputCol="title_tf", outputCol="title_idf")

item_transformer_list = [
    item_transformer,
    year_group_indexer,
    year_group_vector,
    genre_tokenizer,
    genre_converter,
    title_tokenizer,
    title_swr,
    title_htf,
    title_idf,
]

In [None]:
# Fit the item-feature stages on the training set and preview their output.
transform_item = Pipeline().setStages(item_transformer_list)
transform_item_model = transform_item.fit(train_df)
transform_item_model.transform(train_df).show(n=10)

In [None]:
class GetUserInfoTransformer(Transformer):
    """Pipeline stage that attaches the per-user aggregate frames to a dataset.

    ``inputCol`` is accepted by the constructor but is not used.
    """

    @keyword_only
    def __init__(self, inputCol=None):
        super(GetUserInfoTransformer, self).__init__()

    def _transform(self, dataset):
        """Left-join the user aggregates by ``user_id`` and replace missing values.

        Missing top-3 lists become empty string arrays (written to new columns
        with a trailing underscore); missing buy count / rate become 0 / 0.0.
        """
        empty_string_array = f.array().cast("array<string>")

        joined = dataset
        for user_features in (user_agg_df, user_year_agg_df, user_genre_agg_df):
            joined = joined.join(user_features, on="user_id", how="left")

        return (
            joined
            .withColumn("top_3_genre_group_", f.coalesce("top_3_genre_group", empty_string_array))
            .withColumn("top_3_year_group_", f.coalesce("top_3_year_group", empty_string_array))
            .fillna({"user_buy_cnt": 0, "user_buy_rate": 0.0})
        )
    

In [None]:
# Join in the per-user columns, then encode both top-3 lists as binary vectors.
user_transformer = GetUserInfoTransformer()
genre_top_converter = CountVectorizer(
    inputCol="top_3_genre_group_",
    outputCol="top_genres_vec",
    binary=True,
)
year_top_converter = CountVectorizer(
    inputCol="top_3_year_group_",
    outputCol="top_year_vec",
    binary=True,
)

user_transformer_list = [user_transformer, genre_top_converter, year_top_converter]

In [None]:
# Fit the user-feature stages on the training set and preview their output.
transform_user = Pipeline(stages=user_transformer_list)
transform_user_model = transform_user.fit(train_df)
# The preview frame is not bound to a name and is never reused, so it is not
# cached: a cached anonymous frame could not be unpersisted later and would
# only occupy executor storage.
transform_user_model.transform(train_df).show(10)

In [None]:
# Columns concatenated, in this order, into the model's feature vector.
feature_list = [
    "year_group_vec", "item_pay_rate", "title_idf", "genres_vec",
    "user_buy_rate", "top_genres_vec", "top_year_vec",
]

assembler = VectorAssembler(outputCol="features", inputCols=feature_list)

In [None]:
# Layout of the (user, item, purchase) test file.
test_schema = StructType([
    StructField(field_name, IntegerType())
    for field_name in ("user_id", "item_id", "purchase")
])

test_df = (
    spark.read
    .format("csv")
    .schema(test_schema)
    .options(header="true", sep=",")
    .load("/labs/slaba03/laba03_test.csv")
)

test_df.show(n=10)

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Logistic regression on the assembled feature vector.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="purchase",
    maxIter=200,
    regParam=0.1,
)

# Full pipeline: item stages, user stages, vector assembly, classifier.
lr_pipeline_stages = list(item_transformer_list)
lr_pipeline_stages += user_transformer_list
lr_pipeline_stages += [assembler, lr]

estimator = Pipeline(stages=lr_pipeline_stages)
lr_model = estimator.fit(train_df)

# Score the training set itself; the result is cached because it is reused.
train_res = lr_model.transform(train_df).cache()

train_res.show(n=10)


In [None]:
# Area under the ROC curve on the training predictions, scored from the
# `probability` column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="purchase",
    rawPredictionCol="probability",
)

roc = evaluator.evaluate(train_res)
roc

In [None]:


# Score the test set with the fitted pipeline.
predictions = lr_model.transform(test_df)
predictions.cache()  # marks this frame as cached in place

predictions.show(n=10)

In [None]:
# Peek at the raw probability vectors.
predictions.select(predictions["probability"]).take(10)

In [None]:
@f.udf(ArrayType(FloatType()))
def to_list(dense_vector):
    """Convert a vector value into a plain Python list of its elements."""
    values = dense_vector.toArray()
    return values.tolist()

In [None]:
# Keep the element at index 1 of the probability vector as `purchase`,
# ordered by user and item.
lab03_csv = (
    predictions
    .select(
        "user_id",
        "item_id",
        to_list("probability")[1].alias("purchase"),
    )
    .orderBy("user_id", "item_id")
)

In [None]:
# Collect the result to the driver and write it to a local CSV file.
lab03_csv.toPandas().to_csv(path_or_buf='lab03.csv')

In [None]:
# Stop the Spark session now that the notebook is finished.
spark.stop()