In [1]:
import os
import sys

# Environment used to launch Spark; it must be set before the PySpark shell
# bootstrap at the end of this cell runs.
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell start-up script in this namespace (it provides the
# `spark` session used below). The script is read inside a `with` block so the
# file handle is closed instead of being left to the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    shell_source = shell_script.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [20]:
# Load the course catalogue from JSON into a Spark DataFrame.
data = spark.read.json('/labs/slaba02/DO_record_per_line.json')

In [3]:
# Sanity check: number of course records loaded.
data.count()

28153

In [4]:
# Print a preview of the catalogue to inspect its columns.
data.show()

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [5]:
# Courses for which recommendations must be produced.
# Each entry is [course id, language code, course name].
courses = [
    [23126, "en", "Compass - powerful SASS library that makes your life easier"],
    [21617, "en", "Preparing for the AP* Computer Science A Exam \u2014 Part 2"],
    [16627, "es", "Aprende Excel: Nivel Intermedio by Alfonso Rinsche"],
    [11556, "es", "Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo"],
    [16704, "ru", "\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus"],
    [13702, "ru", "\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430"],
]

In [14]:
import json
import re

import pyspark.sql.functions as f
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Normalizer, Tokenizer, StopWordsRemover
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, StringType, FloatType

In [19]:
# Ids of the target courses, in ascending order.
search_courses_ids = sorted(entry[0] for entry in courses)

In [23]:
# Lower-case the course descriptions; this overwrites the `desc` column of `data`.
data = data.withColumn("desc", f.lower(f.col("desc")))

In [26]:
# Tokenize each description: reads `desc`, writes the token list to `desc_words`.
tokenizer = Tokenizer(inputCol="desc", outputCol="desc_words")

In [27]:
# Default English stop-word list provided by StopWordsRemover.
eng_stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [28]:
# Remove the English stop words from `desc_words`, writing `words_filtered`.
swr = StopWordsRemover(inputCol="desc_words", outputCol="words_filtered", stopWords=eng_stop_words)

In [29]:
# Hashing trick: turn the filtered tokens into term-frequency vectors with
# 10000 hash buckets; binary=False keeps term counts rather than 0/1 flags.
hashingTF = HashingTF(inputCol="words_filtered", outputCol="rawFeatures", numFeatures=10000, binary=False)

In [30]:
# IDF: re-weight the raw term frequencies into `features`, with a minimum
# document frequency of 5 (minDocFreq).
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

In [33]:
# Assemble the pipeline: tokenize -> drop stop words -> hashing TF -> IDF.
pipeline = Pipeline(stages=[tokenizer, swr, hashingTF, idf])

In [34]:
# Fit the pipeline on the whole catalogue, then apply the fitted model to the
# same data to obtain the TF-IDF vectors.
pipeline_model = pipeline.fit(data)
data_idf = pipeline_model.transform(data)

In [35]:
# Keep only the columns needed for the similarity search.
kept_columns = ["id", "lang", "name", "words_filtered", "features"]
data_idf = data_idf.select(kept_columns)

In [44]:
%%time
# Pick the TF-IDF rows of the courses we need recommendations for.
search_courses_df = data_idf.where(f.col("id").isin(search_courses_ids))

# Prefix the columns with "search_" so they do not clash with the catalogue
# columns after the join.
search_courses_idf = search_courses_df.select(
    f.col("id").alias("search_id"),
    f.col("lang").alias("search_lang"),
    f.col("name").alias("search_name"),
    f.col("features").alias("search_features"),
)

# Pair every catalogue course with each target course of the same language so
# that all similarities are computed in one pass; the small target table is
# broadcast. The join condition already enforces equal languages, so no extra
# language filter is needed. A course is never paired with itself.
joined_data = data_idf.join(
    f.broadcast(search_courses_idf),
    data_idf.lang == search_courses_idf.search_lang,
).filter("id != search_id")

# Cosine similarity between two feature vectors (wrapped as a Spark UDF below).
def cos_sim(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: vector object exposing ``dot(other)`` and ``norm(p)``.
        b: vector object exposing ``dot(other)`` and ``norm(p)``.

    Returns:
        The cosine similarity as a Python ``float``. When either vector has
        zero L2 norm the similarity is undefined and NaN is returned instead
        of dividing by zero.
    """
    denominator = a.norm(2) * b.norm(2)
    if denominator == 0:
        # Undefined similarity; NaN is what the caller already filters out.
        return float("nan")
    return float(a.dot(b) / denominator)

# Wrap cos_sim as a Spark UDF whose result column has type FloatType.
cos_sim_udf = f.udf(cos_sim, FloatType())

# Score every (course, target course) pair with cosine similarity and drop the
# pairs whose score is undefined (NaN or null).
similarity = cos_sim_udf(f.col("features"), f.col("search_features"))
has_score = ~f.isnan(f.col("cos_sim")) & f.col("cos_sim").isNotNull()
joined_data = joined_data.withColumn("cos_sim", similarity).filter(has_score)

# Mark the scored pairs for caching; they are queried once per target course.
joined_data.cache()

# Collect the recommendations into a dict: target course id (as a string) ->
# ids of its 10 most similar courses, ties broken by name and then id.
result = {}
for course_id in search_courses_ids:
    top_rows = (
        joined_data
        .filter(f.col("search_id") == course_id)
        .orderBy(f.desc("cos_sim"), "name", "id")
        .limit(10)
        .select("id")
        .collect()
    )
    result[str(course_id)] = [row[0] for row in top_rows]

CPU times: user 28 ms, sys: 13.1 ms, total: 41 ms
Wall time: 32.8 s


In [45]:
# Display the collected recommendations.
result

{'11556': [16488, 13461, 468, 23357, 19330, 16929, 387, 10447, 11554, 9289],
 '13702': [864, 1216, 7173, 1052, 8313, 17017, 19613, 21017, 17015, 8082],
 '16627': [11431, 12247, 13021, 25010, 11575, 5687, 5372, 12863, 9598, 22680],
 '16704': [1365, 20645, 1426, 20105, 8217, 1236, 1164, 1219, 8123, 875],
 '21617': [21609,
  21608,
  21616,
  21492,
  21624,
  21623,
  21630,
  21628,
  21508,
  21703],
 '23126': [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909]}

In [46]:
# Write the recommendations to JSON.
# The file handle is deliberately not called `f`: that name is the alias of
# pyspark.sql.functions in this notebook, and rebinding it to a file object
# would break any earlier cell that is run again afterwards.
with open('lab02.json', 'w', encoding='utf-8') as out_file:
    json.dump(result, out_file, indent=4)

In [None]:
!cat lab02.json

In [47]:
# Stop the Spark session now that the results have been written.
spark.stop()