In [None]:
# List the HDFS directory that holds the input JSON read further down.
!hdfs dfs -ls /labs/slaba02

In [2]:
import os
import sys
# Interpreter, Spark installation and submit arguments must be in the
# environment before the PySpark bootstrap script below is executed.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]

# Make the Python packages bundled with Spark importable. The py4j archive
# name is version-specific and has to match the file shipped under SPARK_HOME.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap in this namespace (it announces the `spark`
# session in its banner). The file is closed deterministically, and compiling
# with the real path keeps tracebacks pointing at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql.functions import (broadcast, col, collect_list, rank,
                                   sort_array, struct, udf)
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [4]:
import re
import json

In [5]:
# Load the raw dataset from HDFS (one JSON record per line, judging by the
# file name); the schema is inferred by Spark.
data = spark.read.json('/labs/slaba02/DO_record_per_line.json')

In [6]:
# Keep only the columns used downstream and cache the projection, since it is
# reused by several later cells.
data = data.select('id', 'desc', 'lang').cache()

In [7]:
# Row count of the projected dataset (an action, so it also triggers the
# computation of the cached DataFrame).
data.count()

28153

In [8]:
@udf(returnType=ArrayType(StringType()))
def tokenize(s):
    """Split a text into lower-cased word tokens of at least two characters.

    Args:
        s: The text to tokenize, or None for a null column value.

    Returns:
        A list of tokens in order of appearance; an empty list when `s` is
        None, so a missing description does not fail the whole job.
    """
    if s is None:
        return []
    # Raw string avoids invalid-escape warnings. `\w` already covers digits,
    # so this matches exactly what `[\w\d]{2,}` matched.
    regex = re.compile(r'\w{2,}', re.U)
    return regex.findall(s.lower())

In [9]:
# Add a 'words' column with the tokens extracted from each description.
data = data.withColumn('words', tokenize(col('desc')))

In [10]:
# TF-IDF as a two-stage pipeline: hash the tokens into a fixed-size
# term-frequency vector, then rescale it by inverse document frequency.
hashingTF = HashingTF(numFeatures=10000, inputCol='words', outputCol='rawFeatures')
idf = IDF(inputCol='rawFeatures', outputCol='features')
pipeline = Pipeline().setStages([hashingTF, idf])

In [11]:
# Fit the pipeline on the full dataset, then apply it to the same rows to get
# the 'features' column used for similarity below.
preprocessing_pipeline = pipeline.fit(data)
preprocessed_data = preprocessing_pipeline.transform(data)

In [12]:
# Query courses as [id, lang, name] triples.
given_courses = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [13]:
# Explicit schema for the query courses: (id, lang, name).
schema = (
    StructType()
    .add('id', LongType())
    .add('lang', StringType())
    .add('name', StringType())
)
test_courses = spark.createDataFrame(given_courses, schema)

In [14]:
# Attach description and TF-IDF features to every query course. The right join
# keeps all query rows; 'lang' is taken from the query side.
preprocessed_test = (
    preprocessed_data
    .join(broadcast(test_courses), on='id', how='right')
    .select('id', test_courses['lang'].alias('lang'), 'desc', 'features')
)

In [15]:
# Preview the query courses with their joined descriptions and features.
preprocessed_test.show()

+-----+----+--------------------+--------------------+
|   id|lang|                desc|            features|
+-----+----+--------------------+--------------------+
|23126|  en| Improve your SAS...|(10000,[87,91,128...|
|16627|  es| Hazte más emplea...|(10000,[55,76,192...|
|13702|  ru|Математическая эк...|(10000,[310,942,2...|
|16704|  ru|В курсе рассматри...|(10000,[381,1144,...|
|11556|  es| La transformació...|(10000,[249,522,5...|
|21617|  en|An introduction t...|(10000,[17,128,16...|
+-----+----+--------------------+--------------------+



In [16]:
@udf(returnType=FloatType())
def cosine_similarity(X, Y):
    """Cosine similarity of two vectors.

    Args:
        X: First vector; must provide `norm(p)` and `dot(other)`.
        Y: Second vector, same requirements.

    Returns:
        dot(X, Y) / (|X|_2 * |Y|_2) as a float, or -1.0 when the product of
        the norms is zero.
    """
    norm_product = X.norm(2) * Y.norm(2)
    return float(X.dot(Y) / norm_product) if norm_product != 0.0 else -1.0

In [24]:
# Pair every query course with every other course in the same language and
# score each pair with the cosine similarity of their feature vectors.
result = (
    preprocessed_data.alias("full")
    .crossJoin(broadcast(preprocessed_test.alias("test")))
    .where(col('test.lang') == col('full.lang'))
    .where(col('test.id') != col('full.id'))
    .select(
        col("test.id").alias("test_id"),
        col("full.id").alias("full_id"),
        col("test.desc").alias("test_desc"),
        col("full.desc").alias("full_desc"),
        cosine_similarity(col("test.features"), col("full.features")).alias("cos_sim"),
    )
)

In [25]:
# Rank candidates per query course: highest similarity first, ties broken by
# description and then by id. Keep the ten best-ranked rows.
window = (
    Window
    .partitionBy('test_id')
    .orderBy(col('cos_sim').desc(), 'full_desc', 'full_id')
)
filtered_result = result.withColumn('rank', rank().over(window)).where(col('rank') <= 10)

In [26]:
# collect_list() does not guarantee element order after the groupBy shuffle,
# so collect (rank, full_id) pairs, sort them by rank, and only then project
# the ids. This keeps rec_ids in recommendation order on every run.
grouped_result = (
    filtered_result
    .groupby('test_id')
    .agg(sort_array(collect_list(struct('rank', 'full_id'))).alias('ranked'))
    .select('test_id', col('ranked.full_id').alias('rec_ids'))
)

In [27]:
# Bring the small result to the driver and reshape it into
# {test_id (as string): [rec_ids cell]}.
temp = (
    grouped_result
    .select(col('test_id').cast(StringType()), 'rec_ids')
    .toPandas()
    .set_index('test_id')
    .T
    .to_dict('list')
)

In [28]:
# Each value in `temp` is a one-element list wrapping the rec_ids cell. That
# cell may be a NumPy array or a plain Python list (presumably depending on
# how toPandas() converted it), so build the list element-wise instead of
# calling .tolist(); int() also makes every id JSON-serialisable.
final_res = {
    test_id: [int(rec_id) for rec_id in cells[0]]
    for test_id, cells in temp.items()
}

In [29]:
# Spot-check the recommendations for one of the query courses.
final_res['16704']

[1236, 1247, 1365, 1273, 20288, 1164, 8186, 1233, 8203, 875]

In [30]:
# Persist the recommendations as pretty-printed UTF-8 JSON.
with open('lab02.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(final_res, ensure_ascii=False, indent=4))

In [31]:
# Shut down the Spark context (`sc` is presumably defined by the shell.py
# bootstrap) so cluster resources are released.
sc.stop()