In [1]:
import os
import sys
# --- Spark bootstrap ---------------------------------------------------------
# Point PySpark at the cluster's interpreter and Spark installation, and set
# the resources requested when the shell starts.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 6 --executor-memory 4g --executor-cores 2 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell start-up script in this namespace (it is what makes the
# `spark` session available). The file is opened with a context manager so the
# handle is closed deterministically, and compiled with its real path so any
# traceback raised during start-up points at shell.py instead of "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType

# Load the line-delimited JSON dataset, discard the "provider" column and
# spread the rows over 12 partitions.
raw = (
    spark.read
    .json('/labs/slaba02/DO_record_per_line.json')
    .drop("provider")
    .repartition(12)
)

#The code below parses the text category; it does not help in our case because the test data does not contain it
#def filter_empty(l):
#    return [int(i) for i in (filter(lambda x: x is not None and len(x) > 0, l))]

#filter_empty_udf = F.udf(filter_empty, ArrayType(StringType()))

#raw = raw.withColumn("cat_code", filter_empty_udf(F.split(raw.cat, "\/\w+\|*")))\
#        .drop("provider", "cat")\
#        .withColumnRenamed("cat_code", "cat")\
#        .drop("cat_code")


#df = df.filter((df.lang == 'en') & (df.cat.isin(['3', '6', '14'])))


In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover

# Tokenise the "desc" column. With gaps=False the pattern describes the tokens
# themselves: runs of at least two characters, each a Unicode letter or '+'.
# NOTE(review): the '+' inside the character class is a literal plus sign;
# confirm that is intended (it keeps tokens such as "c++") rather than a
# misplaced quantifier.
# A raw string is used so that "\p" is not treated as a (deprecated) invalid
# Python escape; the pattern text itself is unchanged.
#tokenizer = Tokenizer(inputCol="desc", outputCol="words")
tokenizer = RegexTokenizer(inputCol="desc", outputCol="raw_words", pattern=r'[\p{L}+]{2,}', gaps=False)

# Union of the default Russian, Spanish and English stop-word lists.
stop_words = list(set.union(set(StopWordsRemover.loadDefaultStopWords("russian")),
                            set(StopWordsRemover.loadDefaultStopWords("spanish")),
                            set(StopWordsRemover.loadDefaultStopWords("english"))))

# Fix: pass the combined list to the remover. Previously `stop_words` was
# built but never used, so the remover ran with its default list only.
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words", stopWords=stop_words)

# Fix: hash the *filtered* tokens. Previously HashingTF read the tokenizer
# output, so the stop-word removal stage had no effect on the features.
tf = HashingTF(inputCol=remover.getOutputCol(), outputCol="rawFeatures", numFeatures=10000)

# Alternative way to compute the sparse bag-of-words vector: CountVectorizer
#from pyspark.ml.feature import CountVectorizer
#tf = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="rawFeatures")

# Re-weight the hashed term frequencies by inverse document frequency.
idf = IDF(inputCol=tf.getOutputCol(), outputCol="features")

pipeline = Pipeline(stages=[
    tokenizer,
    remover,
    tf,
    idf
])

# Fit on the full dataset, then keep only the columns the later cells use.
# The result is cached because it is reused by the join and the cross join.
pipeline_model = pipeline.fit(raw)
df = pipeline_model.transform(raw).select("id", "lang", "name", "features").cache()

In [4]:
# Courses for which recommendations are requested: [id, lang, name].
dept = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# Only the ids are kept; lang, name and features are taken from `df` through
# the join below.
test_data = spark.createDataFrame(data=dept, schema=["id", "lang", "name"]).drop("name", "lang")

# Suffix every column with "_" so the target courses can later be told apart
# from the candidate courses that keep the unsuffixed names.
joined = test_data.join(df, "id")
for column in ("id", "lang", "name", "features"):
    joined = joined.withColumnRenamed(column, column + "_")
s_df = joined
s_df.printSchema()


root
 |-- id_: long (nullable = true)
 |-- lang_: string (nullable = true)
 |-- name_: string (nullable = true)
 |-- features_: vector (nullable = true)



In [5]:
from pyspark.sql.types import FloatType

def _cosine_similarity(u, v):
    """Return the cosine similarity of two feature vectors as a Python float.

    Despite the ``distance_udf`` name used below, the value is a similarity:
    larger means more alike.

    If either vector has zero L2 norm (for example an all-zero TF-IDF vector)
    the quotient is undefined; 0.0 is returned instead of letting a NaN
    propagate into the ranking step.
    """
    denominator = v.norm(2) * u.norm(2)
    if denominator == 0:
        return 0.0
    return float(v.dot(u) / denominator)


distance_udf = F.udf(_cosine_similarity, FloatType())

# Pair every target course (columns suffixed with "_") with every other course
# in the same language and score the pair. Cached because the next cell
# aggregates over it.
tup = s_df.crossJoin(df).filter("id != id_").filter("lang == lang_").\
                withColumn("rating", distance_udf(F.col("features"), F.col("features_"))).\
                select("id", "id_", "rating", "name").cache()

In [6]:
from operator import itemgetter
from pyspark.sql.types import ArrayType, IntegerType
def top10(x, y, z):
    """Return the ids of the ten best-rated items.

    Parameters
    ----------
    x : sequence of ids
    y : sequence of ratings, aligned with ``x`` (higher is better)
    z : sequence of names, aligned with ``x``

    Items are ordered by rating descending, then name ascending, then id
    ascending. Two inputs that used to break the ordering are handled:

    * a ``None`` name is compared as an empty string, so a rating tie no
      longer raises ``TypeError`` (``None`` and ``str`` are not orderable);
    * a ``None`` or NaN rating is ranked last, since NaN compares false with
      everything and would otherwise leave the sort order undefined.
    """
    def sort_key(item):
        item_id, rating, name = item
        # `rating != rating` is true only for NaN.
        if rating is None or rating != rating:
            rating = float("-inf")
        return (-rating, name if name is not None else "", item_id)

    ranked = sorted(zip(x, y, z), key=sort_key)
    return [item_id for item_id, _, _ in ranked[:10]]

# Expose top10 to Spark; it returns an array of integer ids.
# NOTE(review): the id columns are typed `long` in the printed schema while the
# UDF declares IntegerType elements; confirm that all ids fit in 32 bits.
top10_udf = F.udf(top10, ArrayType(IntegerType()))

# For each target course gather the candidate ids, ratings and names as
# three lists.
grouped = tup.groupBy("id_").agg(
    F.collect_list("id").alias("ids"),
    F.collect_list("rating").alias("ratings"),
    F.collect_list("name").alias("names"),
)

# Reduce the three lists to the ten best candidate ids per target course.
res = (
    grouped
    .withColumn("top10", top10_udf(F.col("ids"), F.col("ratings"), F.col("names")))
    .select("id_", "top10")
    .cache()
)

# Bring the result to the driver as {target id: [recommended ids]}.
output = {row[0]: row[1] for row in res.collect()}

In [8]:
import json
# Serialise the recommendations to lab02.json in the working directory.
with open("lab02.json", 'w') as out_file:
    out_file.write(json.dumps(output))

In [9]:
# Stop the `spark` session now that the results have been written.
spark.stop()