In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector

import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())

In [3]:
test_courses = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
                [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
                [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
                [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
                [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
                [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

test_ids = [item[0] for item in test_courses]

In [95]:
data = spark.read.json('/labs/slaba02/DO_record_per_line.json')
data.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [96]:
data.filter(data.id == 7527).collect()

[Row(cat='15/mathematics_statistics_and_data_analysis', desc='\n\t  \t     \n\t  \t  ', id=7527, lang='en', name='Writing in the Sciences', provider='Harvard Extension School')]

In [5]:
# паттерн поиска токенов
pattern = r"[a-zA-Zа-яА-Яё0-9]{2,}"

# трансформеры и эстиматоры
regexTokenizer = RegexTokenizer(inputCol='desc', outputCol='desc_tokens', gaps=False, pattern=pattern)
remover = StopWordsRemover(inputCol=regexTokenizer.getOutputCol(), outputCol='desc_filtered')
tf = HashingTF(inputCol=remover.getOutputCol(), outputCol='raw_features', numFeatures=10000)
idf = IDF(inputCol=tf.getOutputCol(), outputCol='features')

In [6]:
# определяем список стоп-слов для удаления из описаний курсов
eng_stopwords = StopWordsRemover.loadDefaultStopWords("english")
rus_stopwords = StopWordsRemover.loadDefaultStopWords("russian")
esp_stopwords = StopWordsRemover.loadDefaultStopWords("spanish")

stopwords = {
    'en': eng_stopwords,
    'es': esp_stopwords,
    'ru': rus_stopwords,
}

In [68]:
# preprocessing
#
# выбираем курсы на языках из задания, отфильтруем описания, содержащие только пробелы или спец.символы
# разбиваем тексты на токены
processed_data = data.filter(data.lang.isin(['en', 'es', 'ru']))\
                     .select(data.id, data.lang, data.name, data.desc)
processed_data = processed_data.withColumn('desc', F.regexp_replace('desc', '\\n|\\r|\\t', ''))\
                               .withColumn('desc', F.regexp_replace('desc', '^\s+$', ''))\
                               .filter("desc != ''")
processed_data = regexTokenizer.transform(processed_data)

# удаляем стоп-слова - для каждого языка используем свой набор данных и стоп-слов
rm_stopwords_temp = spark.createDataFrame([], processed_data.schema.add(StructField("desc_filtered", ArrayType(StringType()), True)))
for key, value in stopwords.items():
    remover.setInputCol(regexTokenizer.getOutputCol())
    remover.setOutputCol('desc_filtered')
    remover.setStopWords(value)
    temp = remover.transform(processed_data.filter(F.col('lang') == key))
    rm_stopwords_temp = rm_stopwords_temp.union(temp)
    
processed_data = rm_stopwords_temp

In [69]:
# считаем tf-idf
featurizedData = tf.transform(processed_data)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [256]:
# вектор features хранит значения tf-idf для слов
rescaledData.first()

Row(id=4, lang='en', name='Accounting Cycle: The Foundation of Business Measurement and Reporting', desc='This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appropriate. They’ll also understand the accounting equation, how to journalize and post transac

In [81]:
# функция расчета косинусного сходства
def get_cosine_similarity(v1, v2):
    return float(v1.dot(v2) / (v1.norm(2) * v2.norm(2)))

In [91]:
# словарь результатов
recommendations = {}

# схема временного датафрейма для хранения id курса и косинусного расстояния
schema = StructType([
  StructField('id', IntegerType(), True), # course id
  StructField('cs', FloatType(), True) # cosine similarity
])

# для каждого курса из задания
for course in test_courses:
    id_, lang, desc = course
    dists = []
    
    # находим tf-idf вектор, берем все курсы необходимого языка
    course_vec = rescaledData.filter(rescaledData.id == id_).first().features
    courses_to_search = rescaledData.filter(rescaledData.lang == lang).repartition(4)
    
    # считаем косинусные расстояния курса со всеми остальными курсами (плохо использовать toLocalIterator в случае очень больших партиций)
    for row in courses_to_search.rdd.toLocalIterator():
        dists.append((row.id, get_cosine_similarity(course_vec, row.features)))
    
    # делаем временный датафрейм с косинусами
    course_dists = spark.createDataFrame(dists, schema).repartition(4)
    # джойним по id
    courses_to_search = courses_to_search.join(course_dists, on='id', how='inner').repartition(4)
    # сортируем согласно заданию, берем топ 11 (т.к. топ 1 - сам курс, для которого ищем рекомендации)
    result_for_course = courses_to_search.orderBy(['cs', 'name', 'id'], ascending=[0, 1, 1]).limit(11).collect()
    
    # собираем id рекомендованных курсов (выкинули первый курс)
    recommendations[id_] = [row.id for row in result_for_course[1:]]

In [92]:
recommendations

{23126: [13665, 14760, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 19270],
 21617: [21617, 21616, 22298, 21608, 21081, 21630, 21628, 19417, 21623, 21508],
 16627: [11431, 17964, 12247, 5687, 16694, 12660, 5558, 17961, 11575, 9563],
 11556: [16488, 468, 19330, 22710, 13461, 10447, 21707, 23357, 19279, 10384],
 16704: [1236, 1247, 1365, 1164, 1273, 20288, 8186, 1233, 8203, 18331],
 13702: [13702, 28074, 21079, 1041, 8300, 13057, 8313, 21025, 1033, 1111]}

In [94]:
# пишем результат
with open('lab02.json', 'w') as f:
    json.dump(recommendations, f)

Вопрос: как можно передать один вектор в строковом представлении в качестве аргумента, чтобы использовать UDF, а именно преобразовать строку в вектор SparseVector.parse(v1), и вычислить косинусное расстояние

In [416]:
@F.udf(returnType=FloatType())
def get_cosine_similarity(v1, v2):
    #temp_v = SparseVector.parse(v1)
    return float(v1.dot(v2) / (v1.norm(2) * v2.norm(2)))

In [None]:
f.withColumn('cs', get_cosine_similarity(F.col('temp'), F.col('features'))).show(2, vertical=True)

In [371]:
SparseVector.parse(str(v))

SparseVector(10000, {87: 2.6541, 246: 4.0504, 258: 3.5174, 263: 16.527, 341: 5.7166, 419: 2.4797, 524: 2.1249, 721: 0.9962, 727: 2.116, 814: 4.9313, 870: 2.6432, 937: 2.4997, 1022: 2.0174, 1072: 3.0586, 1073: 5.6459, 1169: 2.5727, 1197: 3.5644, 1218: 5.6451, 1272: 7.6908, 1312: 0.7891, 1368: 1.9217, 1443: 3.077, 1463: 16.444, 1470: 48.7977, 1645: 3.6414, 1652: 3.1637, 1682: 3.4965, 1770: 3.3281, 1851: 3.009, 1882: 2.6432, 1959: 0.6588, 1990: 3.6015, 2080: 0.8154, 2412: 2.5418, 2460: 4.8259, 2587: 6.0241, 2691: 4.2987, 2801: 2.2498, 2971: 5.0504, 3102: 2.9043, 3115: 2.7285, 3145: 2.6116, 3154: 8.7203, 3162: 3.015, 3202: 2.7623, 3330: 1.649, 3434: 3.981, 3444: 3.7052, 3491: 5.3653, 3525: 2.6804, 3624: 9.9838, 3721: 3.0858, 3757: 6.0121, 3767: 3.0484, 3772: 3.1698, 3783: 2.2084, 3849: 2.0785, 3855: 0.7397, 3916: 4.733, 4061: 4.5927, 4115: 3.0143, 4260: 5.7832, 4394: 5.9564, 4422: 3.9969, 4489: 5.7115, 4520: 5.8993, 4712: 4.4165, 4762: 3.7496, 4805: 26.6333, 4858: 3.0218, 4888: 5.6769, 490

In [97]:
spark.stop()