In [1]:
import os
import sys

# Configure the PySpark driver before the shell script is executed:
# submit arguments (executor/driver sizing), the Python interpreter to use,
# and the Spark installation directory.
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 2g --executor-cores 2 --driver-memory 2g pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

# Defensive guard; SPARK_HOME was assigned just above, so this only fires
# if that assignment is removed or emptied.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the bundled pyspark and py4j packages importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run the interactive-shell bootstrap in this namespace. The file is opened
# in a `with` block so the handle is closed deterministically instead of
# being left to the garbage collector.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import Row
import json

# SparkConf with no explicit overrides.
conf = SparkConf()

# Apply the conf first and the application name second, so the name set here
# is the last value written to the builder.
# NOTE(review): the bootstrap cell's output already reports a session bound to
# `spark`, so getOrCreate() presumably returns that one - confirm.
session_builder = SparkSession.builder.config(conf=conf).appName("Morozov_Nikita")
spark = session_builder.getOrCreate()

In [3]:
# List the lab's input files on HDFS (shell escape; output shown below the cell).
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


### Импорт библиотек

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql import functions as f

from tqdm import tqdm
import numpy as np
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors

In [5]:
from pyspark.ml.feature import Tokenizer, CountVectorizerModel, CountVectorizer, HashingTF, VectorAssembler

### Создание схемы для каждого датасета

In [6]:
# Shared column layout for the train and test CSV files:
# integer user/item keys and a float purchase label.
schema_train_and_test = (
    StructType()
    .add("user_id", IntegerType())
    .add("item_id", IntegerType())
    .add("purchase", FloatType())
)

# Column layout for the programme-views CSV file, declared as (name, type) pairs
# and expanded into StructFields in the same order.
schema_views_programmes = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("user_id", IntegerType),
        ("item_id", IntegerType),
        ("ts_start", IntegerType),
        ("ts_end", IntegerType),
        ("item_type", StringType),
    ]
])

# Column layout for the items file.
# The first column is the item key: later cells select `item_id` from the items
# frame, so the field is named `item_id` (it was mislabelled `user_id`).
# NOTE(review): the items read in this notebook does not pass this schema and
# takes column names from the file header instead - confirm whether the schema
# is meant to be applied there.
schema_items = StructType(fields=[
    StructField("item_id", IntegerType()),
    StructField("channel_id", FloatType()),
    StructField("datetime_availability_start", StringType()),
    StructField("datetime_availability_stop", StringType()),
    StructField("datetime_show_start", StringType()),
    StructField("datetime_show_stop", StringType()),
    StructField("content_type", FloatType()),
    StructField("title", StringType()),
    StructField("year", FloatType()),
    StructField("genres", StringType()),
    StructField("region_id", FloatType())
])

### Считывание данных

In [7]:
# Each read starts from a fresh `spark.read`, so options and schemas set for
# one file are not carried over to the next.

# Train / test: CSV with a header row and the shared explicit schema.
train = (
    spark.read
    .option("header", True)
    .schema(schema_train_and_test)
    .csv("/labs/slaba03/laba03_train.csv")
)
test = (
    spark.read
    .option("header", True)
    .schema(schema_train_and_test)
    .csv("/labs/slaba03/laba03_test.csv")
)

# Items: tab-delimited with a header row; no schema is supplied, so column
# names come from the header.
items = (
    spark.read
    .option("delimiter", '\t')
    .option("header", True)
    .csv("/labs/slaba03/laba03_items.csv")
)

# Programme views: CSV with a header row and its own explicit schema.
views_programmes = (
    spark.read
    .option("header", True)
    .schema(schema_views_programmes)
    .csv("/labs/slaba03/laba03_views_programmes.csv")
)

# 1. EDA (анализ датасета)

### 1.1 Объединим train и test для удобства создания признаков

In [8]:
# Stack the train and test rows into one frame so features can be built once
# for both; the two frames were read with the same schema.
full_data = train.union(test)

### 1.2 Найдем пустые значения (можно убрать)

In [9]:
# Item columns kept for inspection. Names missing from `items` are skipped
# silently, and the resulting column order follows `items.columns`.
item_columns_to_keep = (
    'item_id', 'channel_id', 'datetime_show_start', 'datetime_show_stop',
    'content_type', 'title', 'year', 'genres', 'region_id',
)
items_feach = items.select(
    [column_name for column_name in items.columns if column_name in item_columns_to_keep]
)

In [10]:
# One aggregate per column: count the rows whose value is NaN or NULL.
missing_per_item_column = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in items_feach.columns
]
items_feach.select(missing_per_item_column).show()

+-------+----------+-------------------+------------------+------------+-----+------+------+---------+
|item_id|channel_id|datetime_show_start|datetime_show_stop|content_type|title|  year|genres|region_id|
+-------+----------+-------------------+------------------+------------+-----+------+------+---------+
|      0|      3704|               3704|              3704|           0|    0|631868|    33|   362264|
+-------+----------+-------------------+------------------+------------+-----+------+------+---------+



In [11]:
# One aggregate per column: count the rows whose value is NaN or NULL.
missing_per_view_column = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in views_programmes.columns
]
views_programmes.select(missing_per_view_column).show()

+-------+-------+--------+------+---------+
|user_id|item_id|ts_start|ts_end|item_type|
+-------+-------+--------+------+---------+
|      0|      0|       0|     0|        0|
+-------+-------+--------+------+---------+



# 2. Feature Engineering

### 2.1 Найдем количество купленных фильмов у каждого пользователя

In [13]:
# Per-user number of rows with purchase == 1, exposed as `n_purchased`.
# Users with no such rows are absent from this frame.
# NOTE(review): full_data contains the training rows, so for a training row
# this count includes that row's own label - confirm this is intended.
count_purchase_1 = (
    full_data
    .where(col("purchase") == 1)
    .groupBy("user_id")
    .count()
    .withColumnRenamed('count', 'n_purchased')
)

# 3. Final preprocessing

### 3.1 Объединим все признаки в full_data

In [14]:
# Attach the per-user purchase count to every (user_id, item_id) row.
# count_purchase_1 is derived from full_data, so a condition written as
# `full_data.user_id == count_purchase_1.user_id` compares a column with its
# own descendant and depends on Spark rewriting the self-join for us.
# Joining on the column name is unambiguous and yields a single user_id column.
# The left join keeps users without purchases; their n_purchased is NULL.
feature_data = (
    full_data
    .join(count_purchase_1, on='user_id', how='left')
    .select('user_id', 'item_id', 'n_purchased')
)

### 3.2 Разделим на train и test, как было изначально

In [15]:
# Bring the engineered features back onto the original train and test rows,
# matching on the (user_id, item_id) pair; the left join keeps every original row.
feature_join_keys = ['user_id', 'item_id']
train_feature_data = train.join(feature_data, on=feature_join_keys, how='left')
test_feature_data = test.join(feature_data, on=feature_join_keys, how='left')

### 3.3 Удаление пустых значений

In [16]:
# Rows left without a match by the left joins have NULL n_purchased;
# replace those NULLs with 0 in both frames.
train_feature_data = train_feature_data.na.fill(0, subset=['n_purchased'])
test_feature_data = test_feature_data.na.fill(0, subset=['n_purchased'])

### 3.4 Явно укажем, какие столбцы являются признаками (в будущем реализовать pipeline преобразования)

In [17]:
# Assemble the listed feature columns into the single vector column
# `features` that the classifier reads.
feature_columns = ['n_purchased']
ass = VectorAssembler(outputCol='features', inputCols=feature_columns)

In [18]:
# Apply the same assembler to both splits, train first and test second.
vector_feature_train, vector_feature_test = (
    ass.transform(split_frame)
    for split_frame in (train_feature_data, test_feature_data)
)

# 4. Обучение модели

In [19]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [20]:
# Unfitted logistic-regression estimator: reads the assembled `features`
# vector and uses `purchase` as the label.
model_lg = LogisticRegression(labelCol='purchase', featuresCol='features')

In [21]:
# Display the estimator's repr.
model_lg

LogisticRegression_08af463ce97d

In [23]:
# ROC-AUC evaluator that scores the `probability` column against the
# `purchase` label.
evaluator_params = {
    "rawPredictionCol": "probability",
    "labelCol": "purchase",
    "metricName": 'areaUnderROC',
}
evaluator = BinaryClassificationEvaluator(**evaluator_params)

In [24]:
# Fit the estimator on the assembled training frame; `lg` is the fitted model.
lg = model_lg.fit(vector_feature_train)

In [25]:
# Score the same frame the model was fitted on, so any metric computed from
# `predictions` is a training-set figure, not a hold-out estimate.
predictions = lg.transform(vector_feature_train)

In [26]:
# Evaluate the training-set predictions with the configured evaluator.
res = evaluator.evaluate(predictions)

In [27]:
# Display the metric value.
res

0.8531401909620384

# 5. Predict and Submit

In [28]:
# Score the assembled test frame with the fitted model.
predictions_test = lg.transform(vector_feature_test)

In [29]:
# Peek at one scored row: the probability vector and the predicted class.
predictions_test.select('probability', 'prediction').show(1)

+--------------------+----------+
|         probability|prediction|
+--------------------+----------+
|[0.99803564934271...|       0.0|
+--------------------+----------+
only showing top 1 row



In [31]:
%%time
ress = predictions_test.select('user_id', 'item_id', col('probability').alias('purchase')).toPandas()

CPU times: user 42.5 s, sys: 1.53 s, total: 44 s
Wall time: 1min 15s


In [32]:
# Order rows by (user_id, item_id), renumber the index from zero, then replace
# each probability vector with its element at index 1.
ress_pd = (
    ress
    .sort_values(['user_id', 'item_id'])
    .reset_index(drop=True)
    .assign(purchase=lambda frame: frame['purchase'].apply(lambda vector: vector[1]))
)

In [33]:
# Write the result to the user's home directory, without the pandas index column.
ress_pd.to_csv('~/lab03.csv', index=False)

In [34]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()