In [None]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# request two executors for the shell session.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap script in this notebook's namespace.
# The script is read inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Start from an empty SparkConf and obtain a session registered under the
# application name "test"; getOrCreate() reuses a session if one already exists.
conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [None]:
# List the lab directory on HDFS to see which input files are available.
! hdfs dfs -ls /labs/slaba03/

In [None]:
from pyspark.sql.types import *
from pyspark.sql import Window
from pyspark.sql.functions import to_date, udf

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier

#  Чтение файлов

In [None]:
# Item catalogue: tab-separated file with a header row. No schema is supplied,
# so numeric columns are cast explicitly where they are used later on.
item = spark.read.options(sep='\t', header=True).csv('/labs/slaba03/laba03_items.csv')

In [None]:
# Test set: comma-separated file with a header row.
test = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_test.csv')

In [None]:
# Training set: comma-separated file with a header row.
train = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_train.csv')

In [None]:
# Programme viewing log: comma-separated file with a header row.
views_programmes = spark.read.options(sep=',', header=True).csv(
    '/labs/slaba03/laba03_views_programmes.csv')

# Features

## Жанры

In [None]:
# Take the first entry of the comma-separated genre list (possibly the main genre).
item = item.withColumn("First", F.split("genres", ",")[0])

## История

In [None]:
# Per-user aggregate computation
def get_aggs_all(tbl, field, aggs, alias=''):
    """Compute per-user aggregates of one column and return them as float features.

    Args:
        tbl: DataFrame containing a ``user_id`` column and the ``field`` column.
        field: name of the column to aggregate.
        aggs: list of Spark aggregate function names (e.g. ``'sum'``, ``'avg'``).
        alias: optional suffix; when non-empty, ``_<alias>`` is appended to
            every generated column name.

    Returns:
        DataFrame with one row per ``user_id`` and one FloatType column per
        aggregate, named ``agg_all_<agg>[_<alias>]``, with nulls replaced by 0.
    """
    suffix = '_{}'.format(alias) if alias != '' else ''

    # Duplicate `field` once per aggregate, naming each copy after the aggregate,
    # so that a single dict-style agg() applies a different function to each copy.
    df_values = tbl.select('user_id', *[tbl[field].alias(x) for x in aggs])
    df_values = df_values.groupby('user_id').agg(dict(zip(aggs, aggs)))

    # Aggregated columns are named like "sum(sum)"; keep the part before "(" as
    # the aggregate name. The cast is done with native column expressions instead
    # of a round trip through an RDD of Python rows.
    feature_cols = [
        df_values[name].cast(FloatType()).alias('agg_all_' + name.split('(')[0] + suffix)
        for name in df_values.columns
        if name != 'user_id'
    ]
    return df_values.select('user_id', *feature_cols).fillna(0)

In [None]:
# Compute viewing durations: the difference between the end and start
# timestamps in seconds, plus the same value rounded to whole minutes.
# The raw timestamps are replaced by their from_unixtime() string form.
views_programmes = views_programmes.select(
    'user_id',
    'item_id',
    F.from_unixtime('ts_start').alias('ts_start'),
    F.from_unixtime('ts_end').alias('ts_end'),
    'item_type',
    (F.col("ts_end").cast("long") - F.col('ts_start').cast("long")).alias('DiffInSeconds'),
).withColumn('DiffInMins', F.round(F.col('DiffInSeconds') / 60))

In [None]:
# The same five statistics are computed for every slice of the viewing history.
stats = ['sum', 'count', 'avg', 'max', 'min']

# 1) Over all individual viewing records.
res1 = get_aggs_all(views_programmes, 'DiffInMins', stats)

# 2) Over total minutes per (user, item) pair.
tbl = views_programmes.groupby('user_id', 'item_id').agg(F.sum('DiffInMins').alias('col'))
res2 = get_aggs_all(tbl, 'col', stats, 'group')

# 3) Over records whose item_type is 'pvr'.
tbl = views_programmes.filter(F.col('item_type') == 'pvr')
res3 = get_aggs_all(tbl, 'DiffInMins', stats, 'pvr')

# 4) Over records whose item_type is 'live'.
tbl = views_programmes.filter(F.col('item_type') == 'live')
res4 = get_aggs_all(tbl, 'DiffInMins', stats, 'live')

# Combine into one per-user feature table; users missing from the pvr/live
# slices get 0 for those features.
res = (res1
       .join(res2, 'user_id', 'inner')
       .join(res3, 'user_id', 'left')
       .join(res4, 'user_id', 'left')
       .fillna(0))

# Киллер-фича

In [None]:
# Per-user sum and average of the `purchase` column over the training set.
killer = get_aggs_all(train, field='purchase', aggs=['sum', 'avg'], alias='killer')

# Модель

In [None]:
# Full outer join of train and test on every train column, so rows from both
# sets end up in a single frame.
alls = train.join(test, on=train.columns, how='outer')

In [None]:
# Item attributes that are not carried into the modelling frame.
unused_item_cols = [
    'channel_id',
    'datetime_availability_start',
    'datetime_availability_stop',
    'datetime_show_start',
    'datetime_show_stop',
    'content_type',
    'title',
    'genres',
    'region_id',
]

# Attach the per-user features (killer, viewing history) and the remaining
# item attributes, then drop duplicate rows.
alls = (alls
        .join(killer, 'user_id', 'left')
        .join(res, 'user_id', 'left')
        .join(item.drop(*unused_item_cols), 'item_id', 'left')
        .distinct())

In [None]:
# Encode the first-genre string as a numeric index; handleInvalid="keep"
# keeps rows with invalid values instead of raising an error.
indexer = StringIndexer(inputCol="First", outputCol="first_le", handleInvalid="keep")
alls = indexer.fit(alls).transform(alls)

In [None]:
# Numeric copy of the `year` column for use as a model feature.
alls = alls.withColumn('year_n', alls['year'].cast('float'))

In [None]:
# Inspect the column names and types of the assembled frame.
alls.printSchema()

In [None]:
# Feature columns: everything except the identifiers, the label, and the raw
# string columns that already have numeric counterparts (first_le, year_n).
non_features = {'item_id', 'user_id', 'purchase', 'First', 'year'}
cols = [name for name in alls.columns if name not in non_features]

In [None]:
# Alias for the frame passed to the assembler; no transformation is applied here.
alls_tr = alls

In [None]:
# Pack the feature columns into a single 'features' vector column, replacing
# nulls in numeric columns with 0 first.
assembler = VectorAssembler(outputCol='features', inputCols=cols)
stream_df = assembler.transform(alls_tr.na.fill(0))

In [None]:
# Row count of the assembled frame (triggers execution of the pipeline so far).
stream_df.count()

In [None]:
# Peek at one assembled row: identifiers, label and feature vector.
stream_df.select('item_id', 'user_id', 'purchase', 'features').take(1)

In [None]:
# Logistic regression on the assembled features, limited to 15 iterations.
lr = LogisticRegression(
    labelCol="purchase",
    featuresCol='features',
    maxIter=15,
)

In [None]:
# Random forest: 100 trees of depth 4, with a fixed seed for reproducibility.
rf = RandomForestClassifier(
    labelCol="purchase",
    featuresCol='features',
    numTrees=100,
    maxDepth=4,
    seed=42,
)

In [None]:
# Gradient-boosted trees: 350 boosting iterations of depth-3 trees.
gbt = GBTClassifier(
    labelCol='purchase',
    featuresCol='features',
    maxIter=350,
    maxDepth=3,
)

In [None]:
# Split the assembled frame back into train and test by inner-joining on the
# (item_id, user_id) pairs of the original files. The train label is cast to
# float so the classifiers receive a numeric label.
pair_key = ['item_id', 'user_id']

train_sp = (stream_df
            .select('item_id', 'user_id', F.col('purchase').cast('float'), 'features')
            .join(train.select(*pair_key), pair_key, 'inner'))

test_sp = (stream_df
           .select('item_id', 'user_id', 'features')
           .join(test.select(*pair_key), pair_key, 'inner'))

In [None]:
# Fit on the assembled training frame: the raw `train` frame has no 'features'
# column and its label is not numeric, so fitting on it fails.
rf_model = rf.fit(train_sp)

In [None]:
# Fit on the assembled training frame: the raw `train` frame has no 'features'
# column and its label is not numeric, so fitting on it fails.
lr_model = lr.fit(train_sp)

In [None]:
# Train the gradient-boosted trees classifier on the assembled training frame.
gbt_model = gbt.fit(train_sp)

In [None]:
# Score both splits with the fitted GBT model.
predictions_train = gbt_model.transform(train_sp)
predictions_test = gbt_model.transform(test_sp)

In [None]:
# Show the test predictions with `purchase` set to element 1 of the
# probability vector (the probability of class 1).
predictions_test.select(
    'user_id',
    'item_id',
    'probability',
    vector_to_array(F.col('probability')).getItem(1).alias('purchase'),
)

In [None]:
# Write only the submission columns. The raw prediction frame carries ML vector
# columns (features, probability, ...) that the CSV writer cannot serialise, so
# element 1 of the probability vector is extracted into a plain `purchase`
# column first.
submission = predictions_test.select(
    'user_id',
    'item_id',
    vector_to_array(F.col('probability')).getItem(1).alias('purchase'),
)

# A single partition produces a single CSV part file under 'lab03'.
submission.repartition(1).write.mode('overwrite').csv('lab03', header=True)

In [None]:
# Shut down the Spark session and release its cluster resources.
spark.stop()