In [10]:
import os
import sys
# Point PySpark at the interpreter and Spark installation to use, and size
# the session, before anything from pyspark is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 --executor-memory 3g --driver-memory 2g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Each entry is pushed to the front, so the py4j archive ends up ahead of
# Spark's own python directory on the import path.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [11]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Session configuration: application name plus permission for joins that
# have no join condition (cross joins).
conf = (
    SparkConf()
    .set("spark.app.name", "GP Lab02")
    .set("spark.sql.crossJoin.enabled", "True")
)

# Reuse an already running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("GP Lab02")
    .getOrCreate()
)

In [12]:
# Echo the session object as the cell output to confirm it was created.
spark

In [13]:
# Courses to build recommendations for; each row is [course id, language code, title].
my_list = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [14]:
# Echo the list to check that the escaped titles decode to readable text.
my_list

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [15]:
import re
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.types import ArrayType, FloatType, StringType
from pyspark.ml.feature import HashingTF, IDF
import pyspark.sql.functions as f

In [16]:
# Read the source data
DATA_PATH = '/labs/slaba02/DO_record_per_line.json'
data = spark.read.json(DATA_PATH)

In [17]:
# Tokenize the text
# Compiled once instead of on every row. The raw string keeps \w and \d from
# being treated as (invalid) string escapes; the pattern itself is unchanged.
_TOKEN_RE = re.compile(r'[\w\d]{2,}', re.U)


def _tokenize(string):
    """Split text into lower-cased tokens.

    A token is a run of two or more word characters (letters, digits or
    underscore, Unicode-aware), so single-character fragments are dropped.

    Args:
        string: the text to tokenize, or None when the source value is null.

    Returns:
        A list of tokens in order of appearance; an empty list for None,
        so that a missing description does not raise inside the UDF.
    """
    if string is None:
        return []
    return _TOKEN_RE.findall(string.lower())


# Spark UDF wrapper; the plain Python function stays available as _tokenize.
tokenization = f.udf(_tokenize, ArrayType(StringType()))

In [18]:
# Attach the token list of each description as a new array column.
data = data.withColumn(
    "desc_tokenization",
    tokenization(data["desc"]),
)

In [19]:
# Compute TF-IDF over the token lists in the desc_tokenization column
ht = HashingTF(
    inputCol="desc_tokenization",
    outputCol="tf_features",
    numFeatures=10000,
)
tf = ht.transform(data)

# IDF has to be fitted on the term frequencies before it can rescale them.
idf_estimator = IDF(inputCol="tf_features", outputCol="tfidf_features")
idf = idf_estimator.fit(tf)
tfidf = idf.transform(tf)

In [9]:
# # My assigned set of course rows (same values as my_list), kept commented out for reference
# courses_to_make_recommendations = \
# [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
#  [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
#  [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
#  [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
#  [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [21]:
# Cosine similarity function (larger value = more similar)
@f.udf(returnType=DoubleType())
def sim_cos(v1, v2):
    """Cosine similarity between two vectors.

    Args:
        v1, v2: vector objects exposing ``dot`` and ``norm``; either may be
            None when the source value is null.

    Returns:
        dot(v1, v2) / (||v1||_2 * ||v2||_2) as a Python float, or 0.0 when
        either vector is missing or has zero length. The fallback is a float
        on purpose: the declared return type is DoubleType and an int return
        value is not reliably converted by PySpark (it can surface as null).
    """
    if v1 is None or v2 is None:
        return 0.0
    p = 2
    denominator = float(v1.norm(p) * v2.norm(p))
    if denominator == 0.0:
        # An all-zero vector has no direction; treat it as dissimilar
        # instead of dividing by zero.
        return 0.0
    return float(v1.dot(v2)) / denominator

In [22]:
# For each course in my_list, compute the 10 most similar courses in the same
# language by cosine similarity and store their ids in a dict keyed by course id.

# tfidf is scanned twice per course below. Mark it for caching so the
# tokenization UDF and the TF-IDF stages are not recomputed on every iteration.
tfidf.cache()

result = {}
for course_id, lang, _title in my_list:
    # Candidates: every other course written in the same language.
    candidates = tfidf.filter((tfidf.id != course_id) & (tfidf.lang == lang))
    # The feature vector of the course itself, renamed to avoid a name clash.
    target = (
        tfidf.filter(tfidf.id == course_id)
        .select('tfidf_features')
        .withColumnRenamed('tfidf_features', 'tfidf_features_2')
    )

    # No join condition: every candidate is paired with the target vector.
    pairs = candidates.join(target)
    scored = pairs.withColumn(
        'sim_cos', sim_cos(pairs.tfidf_features, pairs.tfidf_features_2)
    )
    # Ties on similarity are broken by name, then by id.
    top_rows = (
        scored.orderBy(scored.sim_cos.desc(), scored.name, scored.id)
        .limit(10)
        .select('id')
        .collect()
    )
    # Collect the ids directly from the rows; no pandas round trip is needed
    # just to build a list of ids.
    result[course_id] = [row['id'] for row in top_rows]
print(result)

{23126: [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348], 21617: [21609, 21616, 21608, 22298, 21630, 21628, 21623, 21508, 21081, 19417], 16627: [11431, 11575, 12247, 17964, 5687, 17961, 16694, 12660, 25010, 5558], 11556: [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 22710, 11340], 16704: [1236, 1247, 1365, 1273, 20288, 1164, 8186, 1233, 8203, 875], 13702: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111]}


In [23]:
import json
# Write the recommendations to disk. json.dump turns the integer course-id
# keys into strings.
# The handle is deliberately not called `f`: that name is this notebook's
# alias for pyspark.sql.functions, and rebinding it to a (closed) file would
# break any cell using `f.udf` that is run afterwards.
with open('lab02.json', 'w') as out_file:
    json.dump(result, out_file)

In [8]:
# Stop the Spark session once the notebook is done with it.
spark.stop()