In [24]:
import os
import sys
import re
import json
# Environment consumed by PySpark; set here, before the SparkSession is
# created in a later cell.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 2 pyspark-shell',
})

spark_home = os.environ['SPARK_HOME']

# Prepend Spark's bundled Python sources to the import path. The py4j archive
# ends up first on sys.path, immediately followed by the pyspark package dir.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Build (or reuse) the Spark session; the application name is supplied
# through the SparkConf rather than the builder.
conf = SparkConf().set("spark.app.name", "nazim lab02")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# Courses to produce recommendations for, one record per line.
# Each record looks like [course id, language code, title]; only the id
# (element 0) is read by the later cells of this notebook.
courses_to_make_recs = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [5]:
# Load the course records from JSON. NOTE: despite the `_rdd` suffix this is a
# Spark DataFrame (the result of spark.read.json); it is fed straight into
# Tokenizer.transform in a later cell.
courses_rdd = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [6]:
# Tokenizer stage: reads the `desc` column and writes a list of tokens to
# `words_desc`. Per the PySpark docs, Tokenizer lowercases the text and splits
# on whitespace only, so punctuation stays attached to tokens.
tokenizer = Tokenizer(inputCol="desc", outputCol="words_desc")

In [9]:
# Add the `words_desc` token column to the course DataFrame.
words = tokenizer.transform(courses_rdd)

In [10]:
# Term-frequency stage using the hashing trick: tokens from `words_desc` are
# hashed into a 10000-slot count vector written to `rawFeatures` (distinct
# tokens can share a slot).
hashingTF = HashingTF(inputCol="words_desc", outputCol="rawFeatures", numFeatures=10000)

In [11]:
# Add the hashed term-frequency vectors (`rawFeatures`) to every row.
featurizedData = hashingTF.transform(words)

In [12]:
# IDF estimator: will rescale `rawFeatures` into `features` once fitted.
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [14]:
# Fit the IDF weights on the term-frequency vectors of all loaded rows.
idfModel = idf.fit(featurizedData)

In [15]:
# Apply the fitted IDF weights, producing the TF-IDF vectors in `features`.
rescaledData = idfModel.transform(featurizedData)

In [16]:
# Scale every TF-IDF vector to unit length and store it in `norm`.
# Normalizer is used with its default p (2.0 per the PySpark docs), so the dot
# product of two `norm` vectors equals their cosine similarity.
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

In [17]:
# Target course ids as strings: they are used as the keys of the result dict
# that is later serialised to JSON.
courses_id = [str(course[0]) for course in courses_to_make_recs]

In [18]:
# Column-level UDF returning x.dot(y) as a Spark double. The result is wrapped
# in float() so a plain Python float is handed back for DoubleType.
dot_udf = f.udf(lambda x,y: float(x.dot(y)), DoubleType())

In [20]:
# Result container: target course id (str) -> list of recommended course ids.
# Re-run this cell before re-running the recommendation loop to start empty.
submit_dict = {}

In [21]:
# For every target course, score all other courses by the dot product of
# their normalised TF-IDF vectors and keep the highest-scoring ones.
#
# NOTE(review): the language field of courses_to_make_recs is not used here,
# so candidates of every language are ranked together - confirm this is
# intended.
TOP_N = 10  # number of recommendations stored per target course

# Only `id` and `norm` are needed for scoring. Caching this projection keeps
# Spark from re-running the tokenize/hash/IDF/normalize lineage on both sides
# of the cross join for every target course.
course_vectors = data.select('id', 'norm').cache()

try:
    for selected_id in courses_id:
        # Single-row frame holding the target course's vector.
        selected_vector = course_vectors.filter(f.col('id') == selected_id) \
                                        .select(f.col('norm').alias('target_norm'))

        # Pair every other course with the target vector.
        joined_df = course_vectors.filter(f.col('id') != selected_id) \
                                  .crossJoin(selected_vector)

        dot_df = joined_df.select(dot_udf('norm', 'target_norm').alias("dot_norm"),
                                  'id')

        # Secondary sort key: without it, rows with equal scores come back in
        # an arbitrary order and the top-N list can change between runs.
        top_rows = dot_df.orderBy(f.desc('dot_norm'), f.asc('id')) \
                         .select('id') \
                         .take(TOP_N)

        submit_dict[selected_id] = [row.id for row in top_rows]
finally:
    # Release the cached vectors even if one of the Spark jobs fails.
    course_vectors.unpersist()

In [22]:
# Display the collected recommendations (target id -> recommended ids).
submit_dict

{'23126': [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909],
 '21617': [21609,
  21608,
  21616,
  21492,
  21624,
  21623,
  21630,
  21628,
  21508,
  21857],
 '16627': [11431, 12247, 13021, 25010, 11575, 5687, 9598, 5372, 12863, 16769],
 '11556': [16488, 13461, 468, 10447, 387, 22710, 9289, 5936, 23357, 7833],
 '16704': [3864, 23407, 1365, 20645, 1426, 20105, 8217, 1236, 1164, 23864],
 '13702': [864, 1216, 7173, 8313, 1052, 17017, 19613, 21017, 17015, 8082]}

In [25]:
# Persist the recommendations as a JSON object (target id -> recommended ids).
with open("/data/home/nazim.dzhavadov/lab02.json", "w") as out_file:
    json.dump(submit_dict, out_file)

In [26]:
# Stop the Spark session once the results are written; any further Spark work
# in this kernel needs a new session.
spark.stop()