In [1]:
# Obtain a Spark context/session inside the notebook kernel.
import os
import sys

# Environment consumed by PySpark at start-up; set before shell.py is executed below.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j packages shipped with this Spark install importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell start-up script in this namespace; per the
# banner it prints, this makes the SparkSession available as `spark`.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous exec(open(...).read()) form left it to the garbage collector).
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.5
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
# Echo the SparkSession object created by the bootstrap cell.
spark

In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower
from pyspark.sql.types import FloatType, ArrayType, StringType
import json
import re

In [4]:
# Courses to find recommendations for (assignment variant #10).
# Each entry is [course id, language code, course name].
search_courses = [
    [158, u'en', u'C++ For C Programmers'],
    [11286, u'en', u'The YouTube OneChannel Blueprint by Paul Colligan'],
    [12157, u'es', u'Word 2013. Nivel medio.'],
    [13396, u'es', u'Aprende a programar en PHP y desarrolla tu propio CMS by Eduardo Rodriguez Pati\xf1o'],
    [1078, u'ru', u'\u0422\u0435\u043e\u0440\u0438\u044f \u0438\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0438\u043e\u043d\u043d\u044b\u0445 \u0441\u0438\u0441\u0442\u0435\u043c'],
    [17128, u'ru', u'\u041f\u043e\u0436\u0438\u043b\u043e\u0439 \u0447\u0435\u043b\u043e\u0432\u0435\u043a \u0432 \u0441\u043e\u0432\u0440\u0435\u043c\u0435\u043d\u043d\u043e\u043c \u043e\u0431\u0449\u0435\u0441\u0442\u0432\u0435'],
]

In [5]:
# Display the list to check that the escaped names decode as expected.
search_courses

[[158, 'en', 'C++ For C Programmers'],
 [11286, 'en', 'The YouTube OneChannel Blueprint by Paul Colligan'],
 [12157, 'es', 'Word 2013. Nivel medio.'],
 [13396,
  'es',
  'Aprende a programar en PHP y desarrolla tu propio CMS by Eduardo Rodriguez Patiño'],
 [1078, 'ru', 'Теория информационных систем'],
 [17128, 'ru', 'Пожилой человек в современном обществе']]

In [6]:
# Ids of the search courses (first element of each entry), in ascending order.
search_courses_ids = [course[0] for course in search_courses]
search_courses_ids.sort()

In [7]:
# Display the sorted id list.
search_courses_ids 

[158, 1078, 11286, 12157, 13396, 17128]

In [8]:
# Read the course data file into a Spark DataFrame.
data = spark.read.json("/labs/laba02/DO_record_per_line.json")

In [9]:
# Lower-case the course descriptions; the "desc" column (and the name `data`)
# is overwritten with the lower-cased version.
data = data.withColumn("desc", lower(col("desc")))

In [10]:
# Preview the first 10 rows of the course data.
data.show(10)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|this course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|this online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|this course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|we live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|this self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|this game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|what’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|the goal of the d...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [11]:
# Tokenizer: reads the "desc" column and writes the tokens to "desc_words".
tokenizer = Tokenizer(inputCol="desc", outputCol="desc_words")

In [12]:
# Stop words: the default English list shipped with StopWordsRemover.
# NOTE(review): only the English list is loaded, although search_courses also
# contains 'es' and 'ru' courses — confirm this is intended.
eng_stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [13]:
# Stop-word filter: reads "desc_words" and writes "words_filtered",
# using the English stop-word list loaded above.
swr = StopWordsRemover(inputCol="desc_words", outputCol="words_filtered", stopWords=eng_stop_words)

In [14]:
# Hashing trick: turn the filtered tokens into a term-frequency vector with
# 10000 features (the vector size is visible in the "features" preview).
hashingTF = HashingTF(inputCol="words_filtered", outputCol="rawFeatures", numFeatures=10000, binary=False)

In [15]:
# IDF: rescale the raw term frequencies ("rawFeatures") into "features".
# NOTE(review): minDocFreq=5 presumably ignores terms seen in fewer than
# 5 documents — confirm against the pyspark.ml.feature.IDF documentation.
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

In [16]:
# Assemble the pipeline: tokenizer -> stop-word removal -> hashing TF -> IDF.
pipeline = Pipeline(stages=[tokenizer, swr, hashingTF, idf])

In [17]:
# Fit the pipeline on the whole course data set and apply it to the same data.
data_idf = pipeline.fit(data).transform(data)

In [18]:
# Keep only the columns used downstream; `data_idf` is overwritten with the
# narrowed DataFrame.
data_idf = data_idf.select("id", "lang", "name", "words_filtered", "features")

In [19]:
# Preview 5 rows; long cell values are shown truncated in the output.
data_idf.show(5, True)

+---+----+--------------------+--------------------+--------------------+
| id|lang|                name|      words_filtered|            features|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[course, introduc...|(10000,[36,42,63,...|
|  5|  en|American Counter ...|[online, course, ...|(10000,[32,222,29...|
|  6|  fr|Arithmétique: en ...|[course, taught, ...|(10000,[30,41,246...|
|  7|  en|Becoming a Dynami...|[live, digitally,...|(10000,[493,721,8...|
|  8|  en|           Bioethics|[self-paced, cour...|(10000,[32,65,115...|
+---+----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [20]:
%%time
# Select the TF-IDF rows of the courses we need recommendations for.
search_courses_df = data_idf.where(col("id").isin(search_courses_ids))

# Prefix the columns with "search_" so they do not clash with the columns of
# data_idf after the join. The chain is parenthesised instead of using
# backslash continuations, so no dangling "\" can swallow a following line.
search_courses_idf = (
    search_courses_df.select("id", "lang", "name", "features")
    .withColumnRenamed("id", "search_id")
    .withColumnRenamed("lang", "search_lang")
    .withColumnRenamed("name", "search_name")
    .withColumnRenamed("features", "search_features")
)

# Pair every course with each search course written in the same language
# (an equi-join on language, not a full cross join), so all similarities are
# computed in one pass. The join condition already guarantees
# lang == search_lang, so only the "not the course itself" filter is needed.
joined_data = data_idf.join(
    broadcast(search_courses_idf),
    data_idf.lang == search_courses_idf.search_lang,
).filter("id != search_id")


def cos_sim(a, b):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must provide ``dot(other)`` and ``norm(p)``, as the values
    of the "features" / "search_features" columns do.

    Returns NaN when either vector has zero L2 norm, where the similarity is
    undefined. Such rows are filtered out right after the UDF is applied.
    """
    denominator = a.norm(2) * b.norm(2)
    if denominator == 0:
        # Explicit guard instead of relying on what 0/0 evaluates to.
        return float("nan")
    return float(a.dot(b) / denominator)


cos_sim_udf = udf(cos_sim, FloatType())

# Compute the similarity for every pair and drop pairs where it is undefined.
joined_data = joined_data.withColumn(
    "cos_sim", cos_sim_udf(col("features"), col("search_features"))
).filter(~isnan(col("cos_sim")) & ~isnull(col("cos_sim")))

# The loop below runs one query per search course against this DataFrame.
joined_data.cache()

# Collect the recommendations: {str(search course id): [ids of the 10 most similar courses]}.
result = dict()
for course_id in search_courses_ids:
    # Ties on similarity are broken by name, then id, to keep the order deterministic.
    top_rows = (
        joined_data.filter(col("search_id") == course_id)
        .orderBy(desc("cos_sim"), "name", "id")
        .limit(10)
        .select("id")
        .collect()
    )
    result[str(course_id)] = [row[0] for row in top_rows]

CPU times: user 28 ms, sys: 12 ms, total: 40 ms
Wall time: 15.2 s


In [21]:
# Display the recommendations collected for each search course.
result

{'158': [18170, 27094, 19278, 20260, 17802, 24967, 13398, 3041, 23129, 12666],
 '1078': [909, 20100, 933, 21403, 1080, 887, 1056, 1058, 869, 20329],
 '11286': [25819, 7760, 25820, 25818, 25816, 16906, 11469, 4638, 4139, 5199],
 '12157': [26336, 26670, 16859, 10640, 21337, 18030, 22284, 12486, 6730, 387],
 '13396': [13365, 4096, 9470, 18823, 10035, 25399, 17750, 20157, 4743, 20070],
 '17128': [17012, 20417, 17014, 17009, 7630, 17016, 8832, 17127, 18943, 17215]}

In [22]:
# Write the recommendations to lab02.json as 4-space-indented JSON.
with open('lab02.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(result, indent=4))

In [23]:
# Print the file to verify what was written.
!cat lab02.json

{
    "158": [
        18170,
        27094,
        19278,
        20260,
        17802,
        24967,
        13398,
        3041,
        23129,
        12666
    ],
    "1078": [
        909,
        20100,
        933,
        21403,
        1080,
        887,
        1056,
        1058,
        869,
        20329
    ],
    "11286": [
        25819,
        7760,
        25820,
        25818,
        25816,
        16906,
        11469,
        4638,
        4139,
        5199
    ],
    "12157": [
        26336,
        26670,
        16859,
        10640,
        21337,
        18030,
        22284,
        12486,
        6730,
        387
    ],
    "13396": [
        13365,
        4096,
        9470,
        18823,
        10035,
        25399,
        17750,
        20157,
        4743,
        20070
    ],
    "17128": [
        17012,
        20417,
        17014,
        17009,
        7630,
        1701

In [24]:
# Stop the Spark context.
sc.stop()