In [1]:
import os
import sys

os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --executor-cores 2 --driver-memory 2g pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("Morozov_Nikita")
         .getOrCreate())

In [3]:
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


### Импорт библиотек

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql import functions as f

from tqdm import tqdm
import numpy as np
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors

In [5]:
from pyspark.ml.feature import Tokenizer, CountVectorizerModel, CountVectorizer, HashingTF, VectorAssembler, OneHotEncoder

In [6]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.sql import functions as F
from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

### Создание схемы для каждого датасета

In [7]:
schema_train_and_test = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", FloatType())
])

schema_views_programmes = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("ts_start", IntegerType()),
    StructField("ts_end", IntegerType()),
    StructField("item_type", StringType())
])

schema_items = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("channel_id", IntegerType()),
    StructField("datetime_availability_start", StringType()),
    StructField("datetime_availability_stop", StringType()),
    StructField("datetime_show_start", StringType()),
    StructField("datetime_show_stop", StringType()),
    StructField("content_type", IntegerType()),
    StructField("title", StringType()),
    StructField("year", FloatType()),
    StructField("genres", StringType()),
   StructField("region_id", FloatType())
])

### Считывание данных

In [8]:
train =  spark.read.options(header = True).csv("/labs/slaba03/laba03_train.csv", schema_train_and_test)
test =  spark.read.options(header = True).csv("/labs/slaba03/laba03_test.csv", schema_train_and_test)
items =  spark.read.options(delimiter = '\t', header = True).csv("/labs/slaba03/laba03_items.csv")
views_programmes = spark.read.options(header = True).csv("/labs/slaba03/laba03_views_programmes.csv", schema_views_programmes)

In [9]:
items = items.withColumn("content_type", 
                                  items["content_type"]
                                  .cast('int'))

# 0 FILLNA

In [10]:
items = items.fillna(0, subset=['channel_id'])
items = items.fillna("None", subset=['genres'])

# 1. EDA

### 1.1 Объединим train и test для удобства создания признаков

In [11]:
full_data = train.union(test)

### 1.2 Найдем пустые значения

In [12]:
items_feach = items.select([c for c in items.columns if c in ['item_id','channel_id', 'datetime_show_start', 'datetime_show_stop',
                                                             'content_type', 'title', 'year', 'genres', 'region_id']])

In [13]:
items_feach.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in items_feach.columns]).show()

+-------+----------+-------------------+------------------+------------+-----+------+------+---------+
|item_id|channel_id|datetime_show_start|datetime_show_stop|content_type|title|  year|genres|region_id|
+-------+----------+-------------------+------------------+------------+-----+------+------+---------+
|      0|      3704|               3704|              3704|           0|    0|631868|     0|   362264|
+-------+----------+-------------------+------------------+------------+-----+------+------+---------+



In [14]:
views_programmes.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in views_programmes.columns]).show()

+-------+-------+--------+------+---------+
|user_id|item_id|ts_start|ts_end|item_type|
+-------+-------+--------+------+---------+
|      0|      0|       0|     0|        0|
+-------+-------+--------+------+---------+



# 2. Feature Engeneering

### 2.1 Агрегации по пользователю и фильмам

In [15]:
sum_purchase_user = full_data.groupBy("user_id").sum().\
                             select('user_id', col('sum(purchase)').alias('sum_purchase_user'))

#Среднее количество фильмов у пользователя
mean_purchase_user = full_data.groupBy("user_id").mean().\
                             select('user_id', col('avg(purchase)').alias('mean_purchased_user'))

# Количество пользователей, купившых данный фильм
sum_purchase_item = full_data.\
                             groupBy("item_id").sum().\
                             select('item_id', col('sum(purchase)').alias('sum_purchase_item'))

#Среднее количество купленных фильмов
mean_purchased_item = full_data.groupBy("item_id").mean().\
                             select('item_id', col('avg(purchase)').alias('mean_purchased_item'))

In [16]:
sum_purchase_user.rdd.getNumPartitions()

200

### 2.2 Фичи, связанные со временем начала и окончания программы(не взлетело)

In [16]:
# items_duration = items_feach.withColumn('start_timestamp',to_timestamp(col('datetime_show_start')))\
#   .withColumn('end_timestamp',
#               to_timestamp(col("datetime_show_stop")))\
#   .withColumn('diff_in_seconds',          #Длительность программы
#               col("end_timestamp").cast("long") - col('start_timestamp').cast("long"))\
#   .withColumn('hour_begin',               #Час
#               f.hour('start_timestamp'))\
#   .withColumn('day_of_week',              #День недели
#               ((f.dayofweek('start_timestamp')+5)%7)+1)\
#   .withColumn('day_of_month',             #День месяца
#               f.dayofmonth('start_timestamp'))\
#   .withColumn('day_of_year',              #День года
#               f.dayofyear('start_timestamp'))\
#   .withColumn('week_of_year',              #Неделя года
#               f.weekofyear('start_timestamp'))\
#   .withColumn('month',                     #Месяц
#               f.month('start_timestamp'))\
#   .withColumn("is_weekend",
#               col("day_of_week").isin([6,7]).cast("int")).select(col("item_id")
#                                                 ,col("diff_in_seconds")
#                                                 ,col("hour_begin")
#                                                 ,col("day_of_week")
#                                                 ,col("day_of_month")
#                                                 ,col("day_of_year")
#                                                 ,col("week_of_year")
#                                                 ,col("month")
#                                                 ,col("is_weekend"))

### 2.3 Жанры через TF-IDF

In [17]:
items_genres = items_feach.select(col("item_id"), col("genres"))

In [18]:
def removing_the_comma(data, column):
    data=data.withColumn(column, lower(col('genres')))
    data = data.withColumn(column, F.regexp_replace('desc', ',', ' '))
    return data

In [19]:
items_genres = removing_the_comma(items_genres, 'desc')

In [20]:
list_text = items_genres.select('desc').collect()
set_lang = set()
for i in list_text:
    for j in i[0].split():
        set_lang.add(j)
print(len(set_lang))

94


In [21]:
stop_words_ru = StopWordsRemover.loadDefaultStopWords("russian")

tokenizer = Tokenizer(inputCol="desc", outputCol="words")

swr_en = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered_en", stopWords=stop_words_ru)

hasher = HashingTF(numFeatures=94, binary=False, inputCol=swr_en.getOutputCol(), outputCol="word_vector")

idf = IDF(inputCol="word_vector", outputCol="tf_idf_genre")

pipeline_genre = Pipeline(stages=[
    tokenizer,
    swr_en,
    hasher,
    idf
])

In [22]:
pipeline_model = pipeline_genre.fit(items_genres)
tf_idf_genre = pipeline_model.transform(items_genres)
tf_idf_genre = tf_idf_genre.select(col("item_id"), col("tf_idf_genre"))

In [23]:
items_genres = items_genres.withColumn('wordCount', f.size(f.split(f.col('desc'), ' ')))

### 2.4 Купленные жанры пользователя

In [24]:
@f.pandas_udf(StringType(), f.PandasUDFType.GROUPED_AGG)
def collect_list(name):    
    s = ','.join(name)
    lst = s.split(",")
    lst = set(lst)
    return ' '.join(lst)

user_purchased_genres = train\
     .filter(col('purchase') == 1)\
     .join(items, on='item_id', how='inner')\
     .select(['user_id', 'item_id', 'genres'])\
     .groupBy('user_id')\
     .agg(collect_list(col('genres')).alias('purchased_genres'))\
     .fillna('', subset=['purchased_genres'])

In [25]:
user_purchased_genres = user_purchased_genres.withColumn('purchased_genres_count',
                                                         f.size(f.split(f.col('purchased_genres'), ' ')))

### 2.4 Объединим все признаки, связанные с items

In [26]:
feature_items = items_genres.join(tf_idf_genre,
                items_genres.item_id == tf_idf_genre.item_id,
                'inner').drop(items_genres.item_id)

### 2.5 Создание признаков о просмотре 

In [27]:
views_programmes = views_programmes.withColumn('diff_time', col("ts_end") - col("ts_start"))

In [28]:
### Статистики по Users

count_live_program = views_programmes.filter(col("item_type") == "live")\
                                     .select(col('user_id'), col("item_type"))\
                                     .groupBy("user_id").count().select("user_id", col("count").alias("count_live_program"))

count_prv_program = views_programmes.filter(col("item_type") == "pvr")\
                                     .select(col('user_id'), col("item_type"))\
                                     .groupBy("user_id").count().select("user_id", col("count").alias("count_prv_program"))

sum_total_viewed_user = views_programmes.select(col("user_id"), col("diff_time"), col("item_type"))\
                .groupBy("user_id").sum("diff_time").select("user_id", col("sum(diff_time)").alias("sum_total_viewed_user"))

mean_total_viewed_user = views_programmes.select(col("user_id"), col("diff_time"), col("item_type"))\
                .groupBy("user_id").mean("diff_time").select("user_id", col("avg(diff_time)").alias("mean_total_viewed_user"))

sum_live_viewed_user = views_programmes.select(col("user_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "live")\
                .groupBy("user_id").sum("diff_time").select("user_id", col("sum(diff_time)").alias("sum_live_viewed_user"))

sum_pvr_viewed_user = views_programmes.select(col("user_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "pvr")\
                .groupBy("user_id").sum("diff_time").select("user_id", col("sum(diff_time)").alias("sum_pvr_viewed_user"))

mean_live_viewed_user = views_programmes.select(col("user_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "live")\
                .groupBy("user_id").mean("diff_time").select("user_id", col("avg(diff_time)").alias("mean_live_viewed_user"))

mean_pvr_viewed_user = views_programmes.select(col("user_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "pvr")\
                .groupBy("user_id").mean("diff_time").select("user_id", col("avg(diff_time)").alias("mean_pvr_viewed_user"))

### Статистики по Items
sum_total_viewed_item = views_programmes.select(col("item_id"), col("diff_time"), col("item_type"))\
                .groupBy("item_id").sum("diff_time").select("item_id", col("sum(diff_time)").alias("sum_total_viewed_item"))

mean_total_viewed_item = views_programmes.select(col("item_id"), col("diff_time"), col("item_type"))\
                .groupBy("item_id").mean("diff_time").select("item_id", col("avg(diff_time)").alias("mean_total_viewed_item"))

sum_live_viewed_item = views_programmes.select(col("item_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "live")\
                .groupBy("item_id").sum("diff_time").select("item_id", col("sum(diff_time)").alias("sum_live_viewed_item"))

sum_pvr_viewed_item = views_programmes.select(col("item_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "pvr")\
                .groupBy("item_id").sum("diff_time").select("item_id", col("sum(diff_time)").alias("sum_pvr_viewed_item"))

mean_live_viewed_item = views_programmes.select(col("item_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "live")\
                .groupBy("item_id").mean("diff_time").select("item_id", col("avg(diff_time)").alias("mean_live_viewed_item"))

mean_pvr_viewed_item = views_programmes.select(col("item_id"), col("diff_time"), col("item_type"))\
                .filter(col("item_type") == "pvr")\
                .groupBy("item_id").mean("diff_time").select("item_id", col("avg(diff_time)").alias("mean_pvr_viewed_item"))

# 3. Final preprocessing

### 3.1 Объединим все признаки в full_data

In [29]:
feature_data = full_data.join(sum_purchase_user,
               full_data.user_id == sum_purchase_user.user_id,
               'left').\
            select(full_data.user_id, full_data.item_id, sum_purchase_user.sum_purchase_user)

feature_data = feature_data.join(feature_items,
               feature_data.item_id == feature_items.item_id,
               'left').drop(feature_items.item_id)

feature_data = feature_data.join(mean_purchase_user,
               feature_data.user_id == mean_purchase_user.user_id,
                'left').drop(feature_data.user_id)

feature_data = feature_data.join(sum_purchase_item,
               feature_data.item_id == sum_purchase_item.item_id,
               'left').drop(feature_data.item_id)

feature_data = feature_data.join(mean_purchased_item,
               feature_data.item_id == mean_purchased_item.item_id,
               'left').drop(feature_data.item_id)

feature_data = feature_data.join(count_live_program,
               feature_data.user_id == count_live_program.user_id,
               'left').drop(count_live_program.user_id)

feature_data = feature_data.join(count_prv_program,
               feature_data.user_id == count_prv_program.user_id,
               'left').drop(count_prv_program.user_id)

feature_data = feature_data.join(sum_total_viewed_user,
               feature_data.user_id == sum_total_viewed_user.user_id,
               'left').drop(sum_total_viewed_user.user_id)

feature_data = feature_data.join(mean_total_viewed_user,
               feature_data.user_id == mean_total_viewed_user.user_id,
               'left').drop(mean_total_viewed_user.user_id)

feature_data = feature_data.join(sum_live_viewed_user,
               feature_data.user_id == sum_live_viewed_user.user_id,
               'left').drop(sum_live_viewed_user.user_id)

feature_data = feature_data.join(sum_pvr_viewed_user,
               feature_data.user_id == sum_pvr_viewed_user.user_id,
               'left').drop(sum_pvr_viewed_user.user_id)

feature_data = feature_data.join(mean_live_viewed_user,
               feature_data.user_id == mean_live_viewed_user.user_id,
               'left').drop(mean_live_viewed_user.user_id)

feature_data = feature_data.join(mean_pvr_viewed_user,
               feature_data.user_id == mean_pvr_viewed_user.user_id,
               'left').drop(mean_pvr_viewed_user.user_id)


feature_data = feature_data.join(items.select(col("item_id"), col("content_type")),
               feature_data.item_id == items.item_id,
               'left').drop(items.item_id)

feature_data = feature_data.join(user_purchased_genres,
               feature_data.user_id == user_purchased_genres.user_id,
               'left').drop(user_purchased_genres.user_id)

feature_data = feature_data.join(items.select(col("item_id"), col("year")),
               feature_data.item_id == items.item_id,
               'left').drop(items.item_id)

In [30]:
feature_data = feature_data.fillna(0, subset=["wordCount", "sum_purchase_user", "mean_purchased_user",
                                               "sum_purchase_item", "mean_purchased_item", "count_live_program",
                                              "count_prv_program", "sum_total_viewed_user", "mean_total_viewed_user",
                                              "sum_live_viewed_user", "sum_pvr_viewed_user", "mean_live_viewed_user",
                                              "mean_pvr_viewed_user", "content_type", "purchased_genres_count"])

In [31]:
feature_data

DataFrame[sum_purchase_user: double, genres: string, desc: string, wordCount: int, tf_idf_genre: vector, user_id: int, mean_purchased_user: double, sum_purchase_item: double, item_id: int, mean_purchased_item: double, count_live_program: bigint, count_prv_program: bigint, sum_total_viewed_user: bigint, mean_total_viewed_user: double, sum_live_viewed_user: bigint, sum_pvr_viewed_user: bigint, mean_live_viewed_user: double, mean_pvr_viewed_user: double, content_type: int, purchased_genres: string, purchased_genres_count: int, year: string]

In [32]:
feature_data.count()

7189464

### 3.2 Удаление пустых значенний

In [33]:
df = feature_data.drop(feature_data.tf_idf_genre)
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-----------------+------+----+---------+-------+-------------------+-----------------+-------+-------------------+------------------+-----------------+---------------------+----------------------+--------------------+-------------------+---------------------+--------------------+------------+----------------+----------------------+----+
|sum_purchase_user|genres|desc|wordCount|user_id|mean_purchased_user|sum_purchase_item|item_id|mean_purchased_item|count_live_program|count_prv_program|sum_total_viewed_user|mean_total_viewed_user|sum_live_viewed_user|sum_pvr_viewed_user|mean_live_viewed_user|mean_pvr_viewed_user|content_type|purchased_genres|purchased_genres_count|year|
+-----------------+------+----+---------+-------+-------------------+-----------------+-------+-------------------+------------------+-----------------+---------------------+----------------------+--------------------+-------------------+---------------------+--------------------+------------+----------------+---------

### 3.2 Разделим на train и test, как было изначально

In [34]:
train_feature_data = train.join(feature_data, on=['user_id', 'item_id'], how='left')
test_feature_data = test.join(feature_data, on=['user_id', 'item_id'], how='left')

### 3.3 Явно укажем, какие столбцы являются признаками

In [35]:
ass = VectorAssembler(inputCols=['sum_purchase_user', "mean_purchased_user",
                                 "sum_purchase_item", "mean_purchased_item", 
                                 "wordCount", "tf_idf_genre",
                                 "count_live_program", "count_prv_program",
                                 "sum_live_viewed_user", "mean_live_viewed_user",
                                 "sum_pvr_viewed_user", "mean_pvr_viewed_user",
                                 "content_type", "purchased_genres_count"],
                      outputCol='features')

In [36]:
vector_feature_train = ass.transform(train_feature_data)
vector_feature_test = ass.transform(test_feature_data)

# 4. Обучение модели

In [37]:
model_bdt = GBTClassifier(featuresCol='features', labelCol='purchase', maxDepth = 3, maxIter = 19)

In [38]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability",
                                          labelCol="purchase",
                                          metricName='areaUnderROC')

In [39]:
# paramGrid = ParamGridBuilder().addGrid(model_bdt.maxDepth, [3])\
#                                .build()

# crossval = CrossValidator(estimator=model_lg,
#                            evaluator=evaluator,
#                            numFolds=3,
#                            parallelism=3,
#                            estimatorParamMaps=paramGrid
#                           )

# cv = crossval.fit(vector_feature_train)

# cv.avgMetrics

# cv.bestModel

In [40]:
gbt = model_bdt.fit(vector_feature_train)

In [41]:
predictions = gbt.transform(vector_feature_train)

In [42]:
res = evaluator.evaluate(predictions)

In [43]:
res

0.9183934307325691

# 5. Predict and Submit

In [44]:
predictions_test = gbt.transform(vector_feature_test)

In [45]:
%%time
ress = predictions_test.select('user_id', 'item_id', col('probability').alias('purchase')).toPandas()

CPU times: user 40 s, sys: 1.21 s, total: 41.3 s
Wall time: 1min 44s


In [46]:
ress_pd = ress.sort_values(['user_id', 'item_id']).reset_index(drop=True)
ress_pd['purchase'] = ress_pd['purchase'].apply(lambda x: x[1])

In [47]:
ress_pd.to_csv('~/lab03.csv', index=False)

In [57]:
spark.stop()