In [None]:
import os
import sys
# Point PySpark at a specific Python interpreter and Spark installation, and
# set the submit arguments (executors, memory, cores) for the pyspark-shell.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's Python sources and the bundled py4j archive to the import
# path; the py4j archive ends up first, directly followed by the python dir.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration object; only the application name is set on it here.
conf = SparkConf().set("spark.app.name", "Konstantin Diakvnishvili lab 3 app")

# Reuse an existing session if there is one, otherwise create it from `conf`.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, CountVectorizer, StopWordsRemover, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.sql.functions import lower, col, udf, pandas_udf, round, split, concat_ws, explode, mean
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, FloatType
import pandas as pd
import re

In [None]:
# Explicit schema for the training interactions: three nullable integer columns.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", IntegerType(), True),
])

# Read the CSV (with a header row) using the schema above rather than inference.
df_user = spark.read.load(
    "/labs/slaba03/laba03_train.csv",
    format="csv",
    schema=schema,
    header=True,
)

In [None]:
# Schema for the test interactions: the same id columns, without `purchase`.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
])

# Read the CSV (with a header row) using the schema above rather than inference.
df_user_test = spark.read.load(
    "/labs/slaba03/laba03_test.csv",
    format="csv",
    schema=schema,
    header=True,
)

In [None]:
# Schema of the items catalogue; every field is nullable. The datetime_*
# columns are read as plain strings.
read_items_schema = (
    StructType()
    .add('item_id', IntegerType())
    .add('channel_id', IntegerType())
    .add('datetime_availability_start', StringType())
    .add('datetime_availability_stop', StringType())
    .add('datetime_show_start', StringType())
    .add('datetime_show_stop', StringType())
    .add('content_type', IntegerType())
    .add('title', StringType(), nullable=True)
    .add('year', FloatType(), nullable=True)
    .add('genres', StringType())
    .add('region_id', IntegerType())
)

# The items file has a header row and is tab-separated despite its .csv name.
df_items = spark.read.load(
    "/labs/slaba03/laba03_items.csv",
    format="csv",
    schema=read_items_schema,
    header=True,
    sep="\t",
)

In [None]:
# Schema of the programme-views log; every field is nullable.
read_users_schema = (
    StructType()
    .add('user_id', IntegerType())
    .add('item_id', IntegerType())
    .add('ts_start', IntegerType())
    .add('ts_end', IntegerType())
    .add('item_type', StringType())
)

# Read the CSV (with a header row) using the schema above rather than inference.
df_views_programmes = spark.read.load(
    "/labs/slaba03/laba03_views_programmes.csv",
    format="csv",
    schema=read_users_schema,
    header=True,
)

In [None]:
# Keep only rows with content_type == 1, lower-case `genres` and `title`,
# drop the listed columns, and replace missing `genres` with an empty string.
df_items = (
    df_items
    .filter(col('content_type') == 1)
    .withColumn("genres", lower(col('genres')))
    .withColumn("title", lower(col('title')))
    .drop(
        'channel_id',
        'datetime_availability_start',
        'datetime_availability_stop',
        'datetime_show_start',
        'datetime_show_stop',
        'content_type',
        'region_id',
    )
    .na.fill("", ["genres"])
)

In [None]:
@pandas_udf(ArrayType(StringType()))
def tokenizer_udf(series):
    """Split each string of a pandas Series into word tokens.

    A token is a run of two or more word characters as matched by the
    Unicode-aware ``[\\w\\d]`` class; shorter runs are skipped.

    Args:
        series: pandas Series of strings (accessed through its ``.str``
            accessor).

    Returns:
        pandas Series holding, for each input row, the list of tokens found.
        The decorator declares the result as an array of strings.
    """
    # Raw literal: `\w` and `\d` are regex classes, not string escapes. Without
    # the `r` prefix Python flags them as invalid escape sequences.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    return series.str.findall(regex)

In [None]:
# Turn the comma-separated `genres` string into an array column.
# Title tokenisation is currently switched off:
#     .withColumn("title_words", tokenizer_udf('title'))
df_items = df_items.withColumn("genres_words", split(col('genres'), ','))

In [None]:
# Extra tokens to ignore on top of the built-in lists.
# NOTE(review): the digit entries skip '1' — confirm that is intentional.
list_add = [
    'сурдоперевод',
    '0', '2', '3', '4', '5', '6', '7', '8', '9',
    'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
]

# Built-in English and Russian stop words first, then the custom additions.
stop_words = (
    StopWordsRemover.loadDefaultStopWords("english")
    + StopWordsRemover.loadDefaultStopWords("russian")
    + list_add
)

# Remover configured for the `title_words` column.
swr = StopWordsRemover(
    inputCol="title_words",
    outputCol="title_words_filtered",
    stopWords=stop_words,
)

In [None]:
# Genre vectoriser over `genres_words`; binary=True is passed so the output
# is configured as presence flags rather than counts.
cv = CountVectorizer(
    inputCol="genres_words",
    outputCol="genres_count",
    binary=True,
)

In [None]:
# Fit the vectoriser on the items table, then transform that same table.
genres_cv_model = cv.fit(df_items)
df_items = genres_cv_model.transform(df_items)

In [None]:
# Attach the item columns to the train and test interactions.
# Inner join on item_id: interactions without a matching item row are dropped.
df_user_items = df_user.join(df_items, on=['item_id'], how='inner')
df_user_test_items = df_user_test.join(df_items, on=['item_id'], how='inner')

In [None]:
# Disabled alternative: sampling of the joined training frame by `purchase`.
#df_sample = df_user_items.sampleBy('purchase', fractions={0: 0.5, 1:0.5}, seed=5757)
# The full joined training frame is used as the "sample" instead.
df_sample = df_user_items

In [None]:
# Mean of `purchase` per user and per item, computed over df_sample.
# The optional rounding of each rate to 2 decimals is left disabled.
df_user_stat = (
    df_sample
    .groupBy('user_id')
    .agg(mean('purchase').alias('user_purchase_rate'))
)
df_item_stat = (
    df_sample
    .groupBy('item_id')
    .agg(mean('purchase').alias('item_purchase_rate'))
)

In [None]:
def _with_purchase_rates(interactions):
    """Left-join the per-user and per-item purchase rates onto `interactions`.

    Rows without a matching rate get 0 in the rate columns; the result is
    repartitioned into 10 partitions and marked for caching.
    """
    return (
        interactions
        .join(df_user_stat, on=['user_id'], how='left')
        .join(df_item_stat, on=['item_id'], how='left')
        .na.fill(0, ['user_purchase_rate', 'item_purchase_rate'])
        .repartition(10)
        .cache()
    )


# Same enrichment for the training and the test interactions.
df_user_items_stat = _with_purchase_rates(df_user_items)
df_user_test_items_stat = _with_purchase_rates(df_user_test_items)

In [None]:
# Disabled option: a HashingTF stage over the title words.
#hasher = HashingTF(numFeatures=5000, binary=False, inputCol="title_words", outputCol="title_freq")

# Feature vector = the two purchase rates plus the genre vector.
assembler = VectorAssembler(
    inputCols=["user_purchase_rate", "item_purchase_rate", "genres_count"],
    outputCol="features",
)

# Gradient-boosted trees classifier trained on `features` to predict `purchase`.
gbt = GBTClassifier(labelCol="purchase", featuresCol="features")

In [None]:
# Active stages: feature assembly followed by the classifier.
# The `swr` and `hasher` title stages are not part of the stage list.
pipeline = Pipeline(stages=[assembler, gbt])

# Fit the whole pipeline on the enriched training interactions.
pipeline_model = pipeline.fit(df_user_items_stat)

In [None]:
# Apply the fitted pipeline to the enriched test interactions.
prediction = pipeline_model.transform(df_user_test_items_stat)

In [None]:
# Bring the ids and the probability vectors into a pandas frame.
scored_columns = prediction.select('user_id', 'item_id', 'probability')
test_predictions_df = scored_columns.toPandas()

# Element 1 of each probability vector becomes the `purchase` score.
test_predictions_df['purchase'] = test_predictions_df['probability'].apply(lambda vec: vec[1])

# Drop the raw vector column and order the rows by user, then item.
test_predictions_df = (
    test_predictions_df
    .drop('probability', axis=1)
    .sort_values(['user_id', 'item_id'])
)

In [None]:
# Write the predictions to lab03.csv (relative path).
# NOTE(review): to_csv keeps its default index=True here, so the DataFrame
# index is written as an extra leading column — confirm the consumer expects it.
test_predictions_df.to_csv('lab03.csv')

In [None]:
# Stop the Spark session at the end of the notebook.
spark.stop()