In [1]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark 2 client
# installation; this has to happen before anything is imported from pyspark.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Make the pyspark package and its bundled py4j importable. Each entry is
# pushed to the front of sys.path, so the py4j archive ends up first.
for rel in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Session configuration: application name plus the requested executor count.
# NOTE(review): PYSPARK_SUBMIT_ARGS in the first cell asks for 3 executors while
# this asks for 16 - confirm which one is intended.
conf = SparkConf().setAll([
    ("spark.app.name", "Denis.Ivashchenko_lab03"),
    ('spark.executor.instances', '16'),
])

# Reuse an existing session if one is already running, otherwise create it.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
# List the lab's input directory on HDFS (shell escape, runs the hdfs CLI).
! hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, DoubleType, FloatType
import pyspark.sql.functions as f

In [5]:
# Column layout of the items file. The availability/show timestamps are read
# as plain strings and parsed later in the items-processing cell.
schema_it = (
    StructType()
    .add("item_id", IntegerType())
    .add("channel_id", DoubleType())
    .add("avail_start", StringType())
    .add("avail_stop", StringType())
    .add("show_start", StringType())
    .add("show_stop", StringType())
    .add("type", StringType())
    .add("title", StringType())
    .add("year", StringType())
    .add("genres", StringType())
    .add("region_id", IntegerType())
)

# Column layout of the viewing log: who watched what, with integer start/stop
# timestamps and the type of the watched item.
schema_view = (
    StructType()
    .add("user_id", IntegerType())
    .add("item_id", IntegerType())
    .add("ts_start", IntegerType())
    .add("ts_stop", IntegerType())
    .add("item_type", StringType())
)

# Column layout of the labelled (user, item, purchase) set used for training.
schema_set = (
    StructType()
    .add("user_id", IntegerType())
    .add("item_id", IntegerType())
    .add("purchase", IntegerType())
)

## Обработка  items

In [6]:
import re
# Tokeniser for the genres string: every match is a maximal run of word
# characters and spaces. Used by the genres UDF in the items-processing cell.
regex = re.compile(r'[\w ]+')

In [7]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import CountVectorizer,ElementwiseProduct
from pyspark.ml.feature import OneHotEncoder,StringIndexer, RegexTokenizer, HashingTF

In [8]:
@f.udf(VectorUDT())
def vec_to_arr(a):
    """Return a dense ML vector holding the same values as ``a``.

    Args:
        a: an ML vector value (sparse or dense) from a DataFrame column,
            or None for a null cell.

    Returns:
        A dense vector with the same entries, or None when ``a`` is None.
    """
    if a is None:
        # Propagate nulls instead of failing inside the executor.
        return None
    # toArray() is the vector's own conversion to a numpy array; it avoids
    # relying on element-by-element iteration over a sparse vector.
    return Vectors.dense(a.toArray())


In [9]:
# Load the raw catalogue (tab-separated) and the viewing log (no separator
# given, so the reader default applies). Both files carry a header row and
# are read with the explicit schemas declared above.
items = (
    spark.read
    .schema(schema_it)
    .options(sep='\t', header=True)
    .csv("/labs/slaba03/laba03_items.csv")
)
views = (
    spark.read
    .schema(schema_view)
    .options(header=True)
    .csv("/labs/slaba03/laba03_views_programmes.csv")
)

In [10]:
# Every separator in the show timestamps (-, Z, T, :) is rewritten to '-',
# so the normalised string is parsed with this single pattern.
ts_pattern = 'yyyy-MM-dd-HH-mm-ss-'
# Lower-cases the genres string and splits it with the module-level `regex`.
split_genres = f.udf(lambda x: regex.findall(x.lower()), ArrayType(StringType()))

items = (
    items
    # Columns that are not used by any later step.
    .drop('year', 'title', 'region', 'avail_stop', 'avail_start', 'region_id')
    .withColumn('stop', f.regexp_replace(f.col('show_stop'), r'[-ZT:]', '-'))
    .withColumn('start', f.regexp_replace(f.col('show_start'), r'[-ZT:]', '-'))
    .withColumn('stop_ux', f.unix_timestamp(f.col('stop'), format=ts_pattern))
    .withColumn('start_ux', f.unix_timestamp(f.col('start'), format=ts_pattern))
    # Defaults for missing values; must run before `duration` and the genres
    # UDF, which would otherwise see nulls.
    .na.fill({'genres': 'General', 'channel_id': '0', 'start_ux': '1485624600', 'stop_ux': '1485627935'})
    .withColumn('duration', f.col('stop_ux') - f.col('start_ux'))
    .withColumn('genres', split_genres('genres'))
    # Helper and raw columns are no longer needed once the derived ones exist.
    .drop('show_start', 'channel_id', 'show_stop', 'start', 'stop')
)
                    

In [11]:
# Encode the genre token lists as count vectors. The vocabulary is fitted on
# all items, i.e. before the `type` filter below is applied.
cv = CountVectorizer().setInputCol("genres").setOutputCol("gen_vec")
model = cv.fit(items)

# Keep only rows whose `type` equals 1; the token column is dropped once the
# vector column exists.
items = model.transform(items).where(f.col('type') == 1).drop('genres')

In [12]:
# Add `gen_arr`: the genre count vector passed through the vec_to_arr UDF
# (a dense copy of `gen_vec`).
items = items.withColumn('gen_arr', vec_to_arr(f.col('gen_vec')))

## Обработка views

In [13]:
from pyspark.sql import Window
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT, Vectors

In [14]:
# Per-user window, used to find each user's first start and last stop.
win_user = Window.partitionBy('user_id')
first_start = f.min(f.col('ts_start')).over(win_user)
last_stop = f.max(f.col('ts_stop')).over(win_user)

# Both derived columns are expressed in days (seconds / 60 / 60 / 24):
#   duration        - length of the individual viewing record
#   days_on_service - span between the user's first start and last stop
views = (
    views
    .withColumn('duration', (f.col('ts_stop') - f.col('ts_start'))/60/60/24)
    .withColumn('days_on_service', (last_stop - first_start)/60/60/24)
)

In [15]:
# One row per user: total viewing time and the user's days-on-service value
# (the latter is constant within a user, so the average just collapses it).
views_user_data = (
    views
    .groupBy('user_id')
    .agg(
        f.sum('duration').alias('total_time'),
        f.avg('days_on_service').alias('days_on_service'),
    )
)

## Обработка train

In [16]:
# Labelled (user_id, item_id, purchase) pairs, read with the explicit schema.
train = (
    spark.read
    .schema(schema_set)
    .options(header=True)
    .csv("/labs/slaba03/laba03_train.csv")
)

In [17]:
# Per-item counts from the training pairs: `views` is the number of rows with
# a purchase value, `buys` the sum of the purchase flags. The key is renamed
# to `item_id1` so it does not clash with `item_id` in later joins.
item_stats = (
    train
    .withColumnRenamed(existing='item_id', new='item_id1')
    .groupBy('item_id1')
    .agg(
        f.count('purchase').alias('views'),
        f.sum('purchase').alias('buys'),
    )
    .orderBy(f.col('buys').desc())
)

In [18]:
# Keep only items that also exist in the cleaned catalogue and bring in their
# catalogue columns; the duplicate key from `items` is dropped afterwards.
same_item = item_stats.item_id1 == items.item_id
item_stats = item_stats.join(items, on=same_item, how='inner').drop('item_id')

In [19]:
# Totals per genre vector: items sharing the exact same `gen_vec` form one
# category. The key is renamed to `gen_vec1` for the join back onto item_stats.
item_cat = (
    item_stats
    .withColumnRenamed(existing='gen_vec', new='gen_vec1')
    .groupBy('gen_vec1')
    .agg(
        f.sum('buys').alias('buys_c'),
        f.sum('views').alias('views_c'),
    )
    .orderBy(f.col('buys_c').desc())
)

In [20]:
# Attach the category totals to every item and drop the columns that are not
# used from here on.
same_category = item_stats.gen_vec == item_cat.gen_vec1
item_stats = (
    item_stats
    .join(item_cat, on=same_category, how='inner')
    .drop('stop_ux', 'start_ux', 'type', 'duration', 'gen_vec1')
)

In [21]:
# Item-level ratio features:
#   ibr - buys / views of the item
#   bic - item's buys relative to its category's buys (+1 in the denominator)
#   vic - item's views relative to its category's views
item_stats = (
    item_stats
    .withColumn('ibr', f.col('buys') / f.col('views'))
    .withColumn('bic', f.col('buys') / (f.col('buys_c')+1))
    .withColumn('vic', f.col('views') / f.col('views_c'))
)

In [22]:
# Per-user counts from the training pairs, mirroring the per-item statistics:
# `views` counts rows with a purchase value, `buys` sums the purchase flags.
user_stats = (
    train
    .withColumnRenamed(existing='user_id', new='user_id1')
    .groupBy('user_id1')
    .agg(
        f.count('purchase').alias('views'),
        f.sum('purchase').alias('buys'),
    )
    .orderBy(f.col('buys').desc())
)

In [23]:
# Attach each user's viewing profile (total_time, days_on_service). The inner
# join keeps only users that appear in the viewing log. `user_stats` is already
# keyed by `user_id1`, so no rename is needed before the join; the duplicate
# key coming from views_user_data is dropped afterwards.
user_stats = (
    user_stats
    .join(views_user_data, views_user_data.user_id == user_stats.user_id1, 'inner')
    .drop('user_id')
)

In [24]:
# User-level ratio features:
#   ubr - buys / views
#   vpd - views per day on the service
#   bpd - buys per day on the service
#   tpv - total viewing time per view
#   bpt - buys per unit of total viewing time
#   tpd - total viewing time per day on the service
# The "+ 1" keeps the per-day ratios finite when days_on_service is 0.
# The column is referenced by its exact name `days_on_service` so the cell
# does not depend on Spark's case-insensitive column resolution.
# NOTE(review): `bpt` divides by total_time without a guard - confirm that a
# null result for users with zero viewing time is acceptable.
user_stats = (
    user_stats
    .withColumn('ubr', f.col('buys') / (f.col('views')))
    .withColumn('vpd', f.col('views') / (f.col('days_on_service') + 1))
    .withColumn('bpd', f.col('buys') / (f.col('days_on_service') + 1))
    .withColumn('tpv', f.col('total_time') / f.col('views'))
    .withColumn('bpt', f.col('buys') / f.col('total_time'))
    .withColumn('tpd', f.col('total_time') / (f.col('days_on_service') + 1))
    .drop('total_time', 'days_on_service')
)

In [25]:
# Enrich every training pair with the item-level ratios; pairs whose item has
# no statistics are dropped by the inner join.
train = (
    train
    .join(item_stats, on=item_stats.item_id1 == train.item_id, how='inner')
    .select(
        train['user_id'],
        train['item_id'],
        train['purchase'],
        item_stats['ibr'],
        item_stats['bic'],
        item_stats['vic'],
    )
)

In [26]:
# Final training frame: item-level and user-level ratios side by side. Pairs
# whose user has no statistics are dropped by the inner join.
train_res = (
    train
    .join(user_stats, on=user_stats.user_id1 == train.user_id, how='inner')
    .select(
        train['user_id'],
        train['item_id'],
        train['purchase'],
        train['ibr'],
        train['bic'],
        train['vic'],
        user_stats['ubr'],
        user_stats['vpd'],
        user_stats['bpd'],
        user_stats['tpv'],
        user_stats['bpt'],
        user_stats['tpd'],
    )
)

## LogisticRegression  Model 

In [27]:
# Stratified sample for fitting: roughly 80% of each purchase class, with a
# fixed seed. The remaining rows become the hold-out set in the next cell.
keep_fraction = {0: 0.8, 1: 0.8}
train_m = train_res.sampleBy("purchase", fractions=keep_fraction, seed=123)

In [28]:
# Hold-out set: every (user_id, item_id) pair of train_res that was not
# sampled into train_m.
# NOTE(review): train_m is not cached, so the sample may be recomputed here -
# confirm it matches the rows used for fitting.
test_m = train_res.join(train_m, ["user_id",'item_id'], "leftanti")

In [29]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

In [30]:
# Only the item and user buy rates go into the model. The wider feature set
# is left below as a commented-out alternative.
feature_cols = ['ibr', 'ubr']
vecAssembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
#vecAssembler = VectorAssembler(inputCols=['ibr','bic','vic','ubr','vpd','bpd','tpv','bpt','tpd'], outputCol="features")


In [31]:
# Add the assembled `features` vector column to the training split.
train_m = vecAssembler.transform(train_m)

In [32]:
# Elastic-net regularised logistic regression from `features` to `purchase`.
# Rows are weighted by their `ibr` value.
# NOTE(review): `ibr` is both a feature and the sample weight - confirm intended.
# 0.9269945352919823 - presumably the AUC of an earlier run with these
# settings (TODO confirm).
lr = LogisticRegression(
    featuresCol='features',
    labelCol="purchase",
    weightCol='ibr',
    maxIter=15,
    tol=1e-5,
    regParam=0.025,
    elasticNetParam=0.035,
)

In [34]:
# Fit the logistic regression on the training split.
lr_model = lr.fit(train_m)

In [35]:
# Display the fitted model's repr (uid, number of classes and features).
lr_model

LogisticRegressionModel: uid = LogisticRegression_89f73a17efb4, numClasses = 2, numFeatures = 2

In [36]:
# Add the assembled `features` vector column to the hold-out split.
test_m = vecAssembler.transform(test_m)

In [37]:
# Score the hold-out split with the fitted model.
predictions = lr_model.transform(test_m)

In [38]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator 

In [39]:
# Area under the ROC curve, computed from the model's `probability` column
# against the `purchase` label.
evaluator = BinaryClassificationEvaluator(
    labelCol="purchase",
    rawPredictionCol="probability",
    metricName='areaUnderROC',
)

In [40]:
# AUC on the hold-out split.
# NOTE(review): ibr/ubr are aggregated from the full training set before the
# split, so labels of hold-out rows contribute to the features - this
# estimate is likely optimistic; confirm.
evaluator.evaluate(predictions)

0.9269536789885084

In [265]:
# Optional: persist the fitted model next to the notebook (left disabled).
#lr_model.write().overwrite().save(os.path.curdir + '/lr1')

## Делаем предсказание 

In [41]:
# Pairs to score. No schema is passed here, so every column is read as a
# string; the joins below compare these strings with integer keys and the
# ids are cast to int only when the output frame is built.
# NOTE(review): if the file has the same layout as the train file,
# schema=schema_set would give typed columns - confirm before changing.
test = spark.read.csv("/labs/slaba03/laba03_test.csv", header=True)

In [42]:
# Attach the item-level ratios to every test pair. A left join is used so that
# pairs whose item has no statistics are kept (with null ratios) instead of
# silently disappearing from the output; the nulls in ibr/bic/vic are replaced
# by the na.fill step further down, the same way missing user statistics are.
test = (
    test
    .join(item_stats, item_stats.item_id1 == test.item_id, 'left')
    .select(test.user_id, test.item_id, item_stats.ibr, item_stats.bic, item_stats.vic)
)

In [43]:
# Attach the user-level ratios. The left join keeps test pairs whose user has
# no statistics; their user columns are null until the fill step below.
test = (
    test
    .join(user_stats, on=user_stats.user_id1 == test.user_id, how='left')
    .select(
        test['user_id'],
        test['item_id'],
        test['ibr'],
        test['bic'],
        test['vic'],
        user_stats['ubr'],
        user_stats['vpd'],
        user_stats['bpd'],
        user_stats['tpv'],
        user_stats['bpt'],
        user_stats['tpd'],
    )
)

In [44]:
# Summary statistics of the test frame; the next cell picks the fill values
# for missing features out of this table.
stats = test.describe()

In [45]:
# Pick the fill values positionally: row 3 of the describe() table, columns
# 3..11, which are the nine feature columns in the order they were selected
# (the leading columns are the statistic label, user_id and item_id).
# NOTE(review): row 3 is presumably the `min` statistic - confirm that min
# (rather than mean) is the intended fill value.
fill_row = stats.collect()[3]
ibr_m, bic_m, vic_m, ubr_m, vpd_m, bpd_m, tpv_m, bpt_m, tpd_m = fill_row[3:12]

In [46]:
# Replace missing feature values with the statistics picked above, one value
# per feature column.
test = test.fillna({
    'ibr': ibr_m,
    'bic': bic_m,
    'vic': vic_m,
    'ubr': ubr_m,
    'vpd': vpd_m,
    'bpd': bpd_m,
    'tpv': tpv_m,
    'bpt': bpt_m,
    'tpd': tpd_m,
})

In [47]:
# Add the assembled `features` vector column to the test pairs.
test = vecAssembler.transform(test)

In [48]:
# Score the test pairs with the fitted model.
pred_res = lr_model.transform(test)

In [49]:
@f.udf(DoubleType())
def vec_to_arr(a):
    """Return element 1 of a vector as a float.

    Applied to the model's `probability` column, this extracts the entry at
    index 1 (the positive-class probability for a binary model). The return
    type is declared as double so the value is not passed through a string
    column. Note that this definition shadows the earlier UDF of the same
    name.

    Args:
        a: an ML vector value, or None for a null cell.

    Returns:
        The vector's element at index 1 as a Python float, or None when
        ``a`` is None.
    """
    if a is None:
        return None
    return float(a[1])

In [50]:
# Output frame: integer ids plus the positive-class probability, exposed as
# `purchase`, ordered by user and then item.
out = (
    pred_res
    .withColumn('proba_pos', vec_to_arr(f.col('probability')))
    .select(
        f.col('user_id').cast('int'),
        f.col('item_id').cast('int'),
        f.col('proba_pos').cast('float').alias('purchase'),
    )
    .orderBy(f.col('user_id').asc(), f.col('item_id').asc())
)

## Сохраняем результат

In [51]:
import pandas as pd

In [52]:
# Collect the full result onto the driver as a pandas DataFrame.
pddf = out.toPandas()

In [53]:
# Write the result to the working directory.
# NOTE(review): with default arguments to_csv also writes the DataFrame index
# as an unnamed first column - confirm the consumer of lab03.csv expects it.
pddf.to_csv('lab03.csv')

In [54]:
# Sanity check: number of rows and columns written.
pddf.shape

(2156840, 3)

In [55]:
# Shut the Spark session down now that the results are saved.
spark.stop()