# 1. Библиотеки и настройки сессии

In [1]:
DATA_PATH = "/labs/slaba03/"
TRAIN_FILE = DATA_PATH + "laba03_train.csv"
TEST_FILE = DATA_PATH + "laba03_test.csv"
ITEMS_FILE = DATA_PATH + "laba03_items.csv"
VIEWS_FILE = DATA_PATH + "laba03_views_programmes.csv"

In [2]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "ivan.strazov - lab03")

spark = SparkSession.builder.config(conf=conf).appName("ivan.strazov - lab03").getOrCreate()

In [4]:
import pyspark.sql.functions as f
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, DoubleType
from pyspark.sql.window import Window

In [5]:
from pyspark.ml.feature import HashingTF, IDF, Normalizer, OneHotEncoder, StringIndexer, Tokenizer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, GBTClassifier

In [25]:
def transform2onehot(data, column):
    """
    Transform categorical column to binary vectors.
    
    Parameters:
        data (pyspark.DataFrame) - base dataframe.
        column (str) - name of column.
    
    Return:
        new_data () - transformed dataframe.
    """
    
    indexer = StringIndexer(inputCol=column, outputCol="index").setHandleInvalid("keep")
    encoder = OneHotEncoder(dropLast=True, inputCol="index", outputCol=column+"_onehot")
    
    pipe = Pipeline(stages = [indexer, encoder])
    
    new_data = pipe\
                .fit(data)\
                .transform(data) \
                .drop(column, "index")
    
    return new_data

In [36]:
def transform2tfidf(data, column):
    """
    Transform text column to normalized TF-IDF.
    
    Parameters:
        data (pyspark.DataFrame) - base dataframe.
        column (str) - name of column.
    
    Return:
        new_data () - transformed dataframe.
    """
    
    tokenizer = Tokenizer(inputCol=column, outputCol="token")
    tokenized = tokenizer.transform(data)
    
    hashingTF = HashingTF(inputCol="token",
                          outputCol="tf",
                          numFeatures=10000)
    tf = hashingTF.transform(tokenized)

    idf = IDF(inputCol="tf", outputCol="tfidf").fit(tf)
    tfidf = idf.transform(tf)

    normalizer = Normalizer(inputCol="tfidf",
                            outputCol=column+"_tfidf",
                            p=2)
    new_data = normalizer \
                    .transform(tfidf) \
                    .drop(column, "token", "tf", "tfidf")                
    
    return new_data

# 2. Загрузка данных

### Train

В файле содержатся факты покупки (колонка purchase) пользователями (колонка user_id) телепередач (колонка item_id). Такой формат файла вам уже знаком.

In [6]:
schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

dataset = spark.read \
            .schema(schema) \
            .format("csv") \
            .load(TRAIN_FILE, header="true")
dataset.printSchema()
dataset.show(5)

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [7]:
buys4users = dataset \
                .groupBy("user_id") \
                .agg(f.sum("purchase").alias("users_buys"),
                     (f.sum("purchase") / f.count("item_id")).alias("users_per_buys"))
buys4users.cache()
buys4users.show(2)

+-------+----------+--------------------+
|user_id|users_buys|      users_per_buys|
+-------+----------+--------------------+
| 867850|         1|3.829950210647261...|
| 870928|         2|7.674597083653108E-4|
+-------+----------+--------------------+
only showing top 2 rows



In [8]:
buys4items = dataset \
                .groupBy("item_id") \
                .agg(f.sum("purchase").alias("items_buys"),
                     (f.sum("purchase") / f.count("user_id")).alias("items_per_buys"))
buys4items.cache()
buys4items.show(2)

+-------+----------+--------------------+
|item_id|items_buys|      items_per_buys|
+-------+----------+--------------------+
|   8638|         2|0.001450326323422...|
|  95940|         1|   7.097232079489E-4|
+-------+----------+--------------------+
only showing top 2 rows



### Test

Тестовый датасет без указанного целевого признака purchase, который вам и предстоит предсказать.

In [9]:
schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

result = spark.read \
            .schema(schema) \
            .format("csv") \
            .load(TEST_FILE, header="true")
result.printSchema()
result.show(5)

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
|   1654|  95099|    null|
|   1654|  11265|    null|
+-------+-------+--------+
only showing top 5 rows



### Items

Дополнительные данные по items. В данном файле много лишней или ненужной информации, так что задача её фильтрации и отбора ложится на вас. Поля в файле, на которых хотелось бы остановиться:
- item_id — primary key. Соответствует item_id в предыдущем файле.
- content_type — тип телепередачи (1 — платная, 0 — бесплатная). Вас интересуют платные передачи.
- title — название передачи, текстовое поле.
- year — год выпуска передачи, число.
- genres — поле с жанрами передачи, разделёнными через запятую.

In [13]:
schema = StructType(fields=[
    StructField("item_id", IntegerType()),
    StructField("channel_id", StringType()),
    StructField("datetime_availability_start", StringType()),
    StructField("datetime_availability_stop", StringType()),
    StructField("datetime_show_start", StringType()),
    StructField("datetime_show_stop", StringType()),
    StructField("content_type", IntegerType()),
    StructField("title", StringType()),
    StructField("year", DoubleType()),
    StructField("genres", StringType()),
    StructField("region_id", StringType()),
])

items = spark.read \
            .schema(schema) \
            .format("csv") \
            .load(ITEMS_FILE, sep="\t", header="true")
items.printSchema()
items.limit(5).toPandas()

root
 |-- item_id: integer (nullable = true)
 |-- channel_id: string (nullable = true)
 |-- datetime_availability_start: string (nullable = true)
 |-- datetime_availability_stop: string (nullable = true)
 |-- datetime_show_start: string (nullable = true)
 |-- datetime_show_stop: string (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: double (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: string (nullable = true)



Unnamed: 0,item_id,channel_id,datetime_availability_start,datetime_availability_stop,datetime_show_start,datetime_show_stop,content_type,title,year,genres,region_id
0,65667,,1970-01-01T00:00:00Z,2018-01-01T00:00:00Z,,,1,на пробах только девушки (all girl auditions),2013.0,Эротика,
1,65669,,1970-01-01T00:00:00Z,2018-01-01T00:00:00Z,,,1,скуби ду: эротическая пародия (scooby doo: a x...,2011.0,Эротика,
2,65668,,1970-01-01T00:00:00Z,2018-01-01T00:00:00Z,,,1,горячие девочки для горячих девочек (hot babes...,2011.0,Эротика,
3,65671,,1970-01-01T00:00:00Z,2018-01-01T00:00:00Z,,,1,соблазнительницы женатых мужчин (top heavy hom...,2011.0,Эротика,
4,65670,,1970-01-01T00:00:00Z,2018-01-01T00:00:00Z,,,1,секретные секс-материалы ii: темная секс парод...,2010.0,Эротика,


In [14]:
items = items \
            .select("item_id", "content_type", "title", "year", "genres")
items.cache()

DataFrame[item_id: int, content_type: int, title: string, year: double, genres: string]

In [15]:
indexer = StringIndexer(inputCol="genres", outputCol="index").setHandleInvalid("keep")
encoder = OneHotEncoder(dropLast=True, inputCol="index", outputCol="genres"+"_onehot")

pipe = Pipeline(stages = [indexer, encoder])

items = pipe\
            .fit(items)\
            .transform(items) \
            .drop("genres", "index")
items.cache()

DataFrame[item_id: int, content_type: int, title: string, year: double, genres_onehot: vector]

In [16]:
tokenizer = Tokenizer(inputCol="title", outputCol="token")
tokenized = tokenizer.transform(items)

hashingTF = HashingTF(inputCol="token",
                      outputCol="tf",
                      numFeatures=10000)
tf = hashingTF.transform(tokenized)

idf = IDF(inputCol="tf", outputCol="tfidf").fit(tf)
tfidf = idf.transform(tf)

normalizer = Normalizer(inputCol="tfidf",
                        outputCol="title"+"_tfidf",
                        p=2)
items = normalizer \
                .transform(tfidf) \
                .drop("title", "token", "tf", "tfidf")
items.cache()

DataFrame[item_id: int, content_type: int, year: double, genres_onehot: vector, title_tfidf: vector]

In [17]:
items.show(5)

+-------+------------+------+----------------+--------------------+
|item_id|content_type|  year|   genres_onehot|         title_tfidf|
+-------+------------+------+----------------+--------------------+
|  65667|           1|2013.0|(1076,[6],[1.0])|(10000,[665,1029,...|
|  65669|           1|2011.0|(1076,[6],[1.0])|(10000,[1147,1299...|
|  65668|           1|2011.0|(1076,[6],[1.0])|(10000,[966,2231,...|
|  65671|           1|2011.0|(1076,[6],[1.0])|(10000,[164,266,3...|
|  65670|           1|2010.0|(1076,[6],[1.0])|(10000,[697,1147,...|
+-------+------------+------+----------------+--------------------+
only showing top 5 rows



# Views

Дополнительный файл по просмотрам передач с полями:
- ts_start — время начала просмотра.
- ts_end — время окончания просмотра.
- item_type — тип просматриваемого контента:
    - live — просмотр "вживую", в момент показа контента в эфире.
    - pvr — просмотр в записи, после показа контента в эфире.

In [10]:
schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("ts_start", StringType()),
    StructField("ts_end", StringType()),
    StructField("item_type", StringType()),
])

views = spark.read \
            .schema(schema) \
            .format("csv") \
            .load(VIEWS_FILE, header="true")
views.printSchema()
views.show(5)

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- ts_start: string (nullable = true)
 |-- ts_end: string (nullable = true)
 |-- item_type: string (nullable = true)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
+-------+-------+----------+----------+---------+
only showing top 5 rows



In [11]:
@f.udf(IntegerType())
def item_type_live(x):
    if x == "live":
        return 1
    return 0

@f.udf(IntegerType())
def item_type_pvr(x):
    if x == "pvr":
        return 1
    return 0

In [12]:
views = views \
            .select("user_id",
                    "item_id",
                    (col("ts_end") - col("ts_start")).alias("ts_delta"),
                    item_type_live("item_type").alias("item_type_live"),
                    item_type_pvr("item_type").alias("item_type_pvr"))
views.show(5)

+-------+-------+--------+--------------+-------------+
|user_id|item_id|ts_delta|item_type_live|item_type_pvr|
+-------+-------+--------+--------------+-------------+
|      0|7101053|  1669.0|             1|            0|
|      0|7101054| 39090.0|             1|            0|
|      0|7101054|   841.0|             1|            0|
|      0|6184414|   350.0|             1|            0|
|    257|4436877|  1757.0|             1|            0|
+-------+-------+--------+--------------+-------------+
only showing top 5 rows



In [13]:
views4users = views \
            .groupBy("user_id") \
            .agg(f.mean("ts_delta").alias("ts_delta_mean"),
                 f.stddev("ts_delta").alias("ts_delta_std"),
                 f.count("item_id").alias("film_counts"),
                 (f.sum("item_type_live") / f.count("item_id")).alias("item_type_live"),
                 (f.sum("item_type_pvr") / f.count("item_id")).alias("item_type_pvr")) \
            .cache()
views4users.show(5)

+-------+-----------------+------------------+-----------+------------------+--------------------+
|user_id|    ts_delta_mean|      ts_delta_std|film_counts|    item_type_live|       item_type_pvr|
+-------+-----------------+------------------+-----------+------------------+--------------------+
| 819569|584.3846153846154|52.239095291965754|         13|               1.0|                 0.0|
| 820786|3700.068508287293|3875.9279351057703|        905|0.5756906077348066|  0.4243093922651934|
| 820904|       5029.84375| 6562.616136416318|        224|0.9419642857142857| 0.05803571428571429|
| 821554|      4846.078125| 4378.617966999981|        128|               1.0|                 0.0|
| 821806|4411.917594654788| 4821.358965756061|        449|0.9955456570155902|0.004454342984409799|
+-------+-----------------+------------------+-----------+------------------+--------------------+
only showing top 5 rows



In [14]:
views4items = views \
                .groupBy("item_id") \
                .agg(f.count("user_id").alias("view_count"),
                     f.min("item_type_live").alias("item_type"),
                     f.mean("ts_delta").alias("mean_time_view"))\
                .cache()
views4items.show(5)

+-------+----------+---------+------------------+
|item_id|view_count|item_type|    mean_time_view|
+-------+----------+---------+------------------+
|6622608|        98|        0| 3180.030612244898|
|6467228|       186|        0|4231.8494623655915|
|7371253|        74|        0|1454.1486486486488|
|6922406|      1558|        0|3287.0346598202823|
|6826302|      3773|        0| 4285.416114497747|
+-------+----------+---------+------------------+
only showing top 5 rows



# 3. Сборка датасета

In [15]:
dataset = dataset.join(buys4users, on="user_id", how="left")
dataset = dataset.join(buys4items, on="item_id", how="left")
#dataset = dataset.join(items.drop("tfidf"), on="item_id", how="left")
dataset = dataset.join(views4users, on="user_id", how="left")
dataset = dataset.join(views4items, on="item_id", how="left")
dataset = dataset.na.fill(0)
dataset.cache()
dataset.show(1, vertical=True)

-RECORD 0------------------------------
 item_id        | 74107                
 user_id        | 1654                 
 purchase       | 0                    
 users_buys     | 5                    
 users_per_buys | 0.001947040498442... 
 items_buys     | 1                    
 items_per_buys | 7.575757575757576E-4 
 ts_delta_mean  | 1844.3989898989898   
 ts_delta_std   | 2691.822888743697    
 film_counts    | 198                  
 item_type_live | 0.6313131313131313   
 item_type_pvr  | 0.3686868686868687   
 view_count     | 0                    
 item_type      | 0                    
 mean_time_view | 0.0                  
only showing top 1 row



# 4. Train&Test Split

In [16]:
train, test = dataset.randomSplit([0.7, 0.3], seed=42)
train = train.drop("user_id", "item_id").cache()
test = train.drop("user_id", "item_id").cache()
train.printSchema()

root
 |-- purchase: integer (nullable = true)
 |-- users_buys: long (nullable = true)
 |-- users_per_buys: double (nullable = false)
 |-- items_buys: long (nullable = true)
 |-- items_per_buys: double (nullable = false)
 |-- ts_delta_mean: double (nullable = false)
 |-- ts_delta_std: double (nullable = false)
 |-- film_counts: long (nullable = true)
 |-- item_type_live: double (nullable = false)
 |-- item_type_pvr: double (nullable = false)
 |-- view_count: long (nullable = true)
 |-- item_type: integer (nullable = true)
 |-- mean_time_view: double (nullable = false)



In [17]:
feat_cols = [col for col in train.columns if col not in ("purchase")]
print(*feat_cols, sep="\n")

users_buys
users_per_buys
items_buys
items_per_buys
ts_delta_mean
ts_delta_std
film_counts
item_type_live
item_type_pvr
view_count
item_type
mean_time_view


In [18]:
assembler = VectorAssembler(inputCols=feat_cols, outputCol="features").setHandleInvalid("keep")

# 5. Модель

### 5.1 Логистическая регрессия

In [19]:
lr = LogisticRegression(labelCol="purchase", maxIter=15)

pipeline = Pipeline(stages=[
    assembler,
    lr
])

In [20]:
pipeline_model = pipeline.fit(train)

In [40]:
predictions = pipeline_model.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.767638722841348

In [65]:
predictions = pipeline_model.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.7677539915790177

In [21]:
predictions = pipeline_model.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.8977932531321091

### 5.2 Градиентный бустинг

In [74]:
gb = GBTClassifier(labelCol="purchase", maxIter=15, maxDepth=10)

pipeline_gb = Pipeline(stages=[
    assembler,
    gb
])

In [75]:
pipeline_gb_model = pipeline_gb.fit(train)

KeyboardInterrupt: 

Ну как-то слишком долго.......

In [None]:
predictions = pipeline_gb_model.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName="areaUnderROC")
evaluator.evaluate(predictions)

# N. Сохранение результата

In [22]:
result = result.join(buys4users, on="user_id", how="left")
result = result.join(buys4items, on="item_id", how="left")
#result = result.join(items.drop("tfidf"), on="item_id", how="left")
result = result.join(views4users, on="user_id", how="left")
result = result.join(views4items, on="item_id", how="left")
result = result.na.fill(0)
result.cache()
result.show(1, vertical=True)

-RECORD 0------------------------------
 item_id        | 94814                
 user_id        | 1654                 
 purchase       | 0                    
 users_buys     | 5                    
 users_per_buys | 0.001947040498442... 
 items_buys     | 1                    
 items_per_buys | 7.246376811594203E-4 
 ts_delta_mean  | 1844.3989898989898   
 ts_delta_std   | 2691.822888743697    
 film_counts    | 198                  
 item_type_live | 0.6313131313131313   
 item_type_pvr  | 0.3686868686868687   
 view_count     | 0                    
 item_type      | 0                    
 mean_time_view | 0.0                  
only showing top 1 row



In [23]:
predict = pipeline_model \
            .transform(result) \
            .select("user_id",
                    "item_id",
                    col("probability").alias("purchase")) \
            .orderBy(["user_id", "item_id"])
predict.cache()
predict.show(1, vertical=True)

-RECORD 0------------------------
 user_id  | 1654                 
 item_id  | 336                  
 purchase | [0.99862074167450... 
only showing top 1 row



In [24]:
predict_df = predict.toPandas()
predict_df.head()

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


Unnamed: 0,user_id,item_id,purchase
0,1654,336,"[0.9986207416745089, 0.0013792583254909833]"
1,1654,678,"[0.9986207416745089, 0.0013792583254909833]"
2,1654,691,"[0.9986207416745089, 0.0013792583254909833]"
3,1654,696,"[0.9984350433045214, 0.0015649566954786577]"
4,1654,763,"[0.9985319878300942, 0.0014680121699059228]"


In [25]:
predict_df["purchase"] = predict_df["purchase"].apply(lambda x: x[1])

In [26]:
predict_df.head()

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.001379
1,1654,678,0.001379
2,1654,691,0.001379
3,1654,696,0.001565
4,1654,763,0.001468


In [27]:
predict_df.dtypes

user_id       int32
item_id       int32
purchase    float64
dtype: object

In [28]:
predict_df.sort_values(["user_id", "item_id"]).to_csv("lab03.csv")

In [27]:
spark.stop()