In [1]:
import os
import sys
# Bootstrap PySpark inside a plain Python kernel: export the settings PySpark
# reads from the environment, make Spark's bundled Python packages importable,
# then run Spark's own shell start-up script (it prints the banner and
# announces the `spark` session).
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Prepend so these copies win over any other pyspark/py4j on the path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Read the start-up script through a context manager so the file handle is
# closed deterministically, and compile it with its real path so that any
# traceback raised inside it points at shell.py rather than "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [5]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, Normalizer

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json
import re

# Obtain the session under the application name "test"; getOrCreate() hands
# back an already-running session when there is one.
conf = SparkConf()

session_builder = SparkSession.builder.config(conf=conf).appName("test")
spark = session_builder.getOrCreate()

In [3]:
spark

In [6]:
# Load the raw dataset into a DataFrame (one JSON record per line, judging by
# the file name).
data = spark.read.json('/labs/slaba02/DO_record_per_line.json')

# Column names noted by the author: 'cat', 'desc', 'id', 'lang', 'name', 'provider'

In [7]:
# Reference courses, one row each: [course id, language code, course title].
# Non-ASCII titles are kept as \u escapes; the plain literals are already
# unicode strings under Python 3.
given_courses = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

given_courses

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [8]:
# Ids (first field) of the reference courses, in the order they are listed.
ids_courses = [course_id for course_id, _lang, _title in given_courses]

ids_courses

[23126, 21617, 16627, 11556, 16704, 13702]

In [9]:
def text_compile(s):
    """Split every text in a pandas Series into a list of word tokens.

    Parameters
    ----------
    s : pandas.Series
        Text values; missing entries are accepted.

    Returns
    -------
    pandas.Series
        One list per input element holding every run of two or more word
        characters, in order of appearance and with the original letter
        case. A missing input produces an empty list.
    """
    # Raw string: the backslash classes must reach the regex engine untouched.
    # In a normal string literal they are invalid escape sequences, which
    # Python 3.6+ reports with a DeprecationWarning.
    regex = re.compile(r'[\w\d]{2,}', re.U)

    # Replace missing texts with "" so they tokenize to [] rather than
    # propagating NaN into a result that is declared as an array of strings.
    words = s.fillna('').str.findall(regex)

    return words

# pandas UDF wrapper around text_compile, declared to return array<string>.
text_compile_udf = pandas_udf(
    text_compile,
    returnType=ArrayType(StringType()),
)


def _dot_as_float(x, y):
    """Return x.dot(y) converted to a built-in float."""
    return float(x.dot(y))


# Row-at-a-time UDF returning a double.
dot_udf = F.udf(_dot_as_float, returnType=DoubleType())

In [10]:
# Build a normalized TF-IDF vector ("features_norm") from each description.
#
# This cell rebinds `data`, so on a second run the frame already carries the
# columns created below and the transformers refuse to overwrite them.
# Dropping the derived columns first (a no-op on the freshly loaded frame)
# makes the cell safe to re-run.
_derived_cols = ["desc_words", "desc_fin", "features", "features_idf", "features_norm"]
data = data.drop(*_derived_cols)

# 1. Tokenize the description text.
data = data.withColumn("desc_words", text_compile_udf(F.col("desc")))

# 2. Remove stop words.
# NOTE(review): no stop-word list is passed, so the transformer's default
# list is used for every language in the data - confirm this is intended.
word_remover = StopWordsRemover(inputCol="desc_words", outputCol="desc_fin")
data = word_remover.transform(data)

# 3. Hash the remaining tokens into 10000-dimensional term-frequency vectors.
hashingTF = HashingTF(inputCol="desc_fin", outputCol="features").setNumFeatures(10000)
tf = hashingTF.transform(data)

# 4. Fit IDF weights on the whole corpus and apply them.
idf = IDF(inputCol="features", outputCol="features_idf").fit(tf)
tfidf = idf.transform(tf)

# 5. Normalize each vector (Normalizer's default norm) so that a dot product
#    between two rows can be read as a similarity score.
normalizer = Normalizer(inputCol="features_idf", outputCol="features_norm")
data = normalizer.transform(tfidf)

In [11]:
# Narrow the frame to the columns used for scoring and keep only the rows
# whose id is one of the reference courses.
cols_to_sel = ['id', 'lang', 'name', 'features_norm']
is_given_course = F.col('id').isin(ids_courses)
data_courses = data.select(*cols_to_sel).where(is_given_course)

In [12]:
# Pair every course ("i") with each reference course ("j") that is a different
# course in the same language, and score the pair with the dot product of
# their normalized TF-IDF vectors.
is_other_course = F.col("i.id") != F.col("j.id")
is_same_language = F.col("i.lang") == F.col("j.lang")

result = (data.alias("i")
               .join(data_courses.alias("j"), is_other_course & is_same_language)
               .select(
                   F.col("i.id").alias("i"),
                   F.col("j.id").alias("j"),
                   F.col("i.lang").alias("i_lang"),
                   F.col("j.lang").alias("j_lang"),
                   F.col("i.name").alias("i_name"),
                   F.col("j.name").alias("j_name"),
                   dot_udf("i.features_norm", "j.features_norm").alias("similar_cos")
               )
               # `result` is lazy and is evaluated again by show() and by every
               # collect() further down; caching keeps the tokenisation, TF-IDF
               # and join from being recomputed for each of those actions.
               .cache()
         )

In [13]:
result.show()

+-----+-----+------+------+--------------------+--------------------+--------------------+
|    i|    j|i_lang|j_lang|              i_name|              j_name|         similar_cos|
+-----+-----+------+------+--------------------+--------------------+--------------------+
|16308|21617|    en|    en|Up and Running wi...|Preparing for the...|0.019807275787085153|
|16308|23126|    en|    en|Up and Running wi...|Compass - powerfu...| 0.01612470740345601|
|16309|21617|    en|    en|Up and Running wi...|Preparing for the...|0.002341370521805...|
|16309|23126|    en|    en|Up and Running wi...|Compass - powerfu...|4.825378280640825E-4|
|16310|21617|    en|    en|Up and Running wi...|Preparing for the...|0.001736986952796...|
|16310|23126|    en|    en|Up and Running wi...|Compass - powerfu...|0.014177664315169359|
|16311|21617|    en|    en|Up and Running wi...|Preparing for the...|  0.0365482066672459|
|16311|23126|    en|    en|Up and Running wi...|Compass - powerfu...| 0.01297229572439117|

In [14]:
ids_courses

[23126, 21617, 16627, 11556, 16704, 13702]

In [41]:
# Spot check: the ten most similar courses for one reference course.
course = 23126
ans = {}
temp = (result
        .filter((F.col('j') == course) & (F.col('i') != course))
        # Secondary key on the course id: rows with exactly equal scores
        # otherwise come back in an arbitrary order that can change between
        # runs.
        .sort(F.col('similar_cos').desc(), F.col('i').asc())
        .select('i')
        .limit(10)
        .collect()
       )
# Iterate over the rows actually returned; indexing range(10) raises
# IndexError whenever fewer than ten candidates exist.
result_list = [row[0] for row in temp]

ans[str(course)] = result_list



In [42]:
ans

{'23126': [14760, 13665, 13782, 11978, 25782, 3819, 26864, 14380, 3919, 6206]}

In [40]:
# Second-best match for the course above. result_list already holds plain ids,
# so a further [0] would fail with "'int' object is not subscriptable".
result_list[1]

13665

In [43]:
# Build the final answer: for every reference course, the ids of its ten most
# similar courses, keyed by the course id as a string.
ans = {}

for course in ids_courses:

    temp = (result
            .filter((F.col('j') == course) & (F.col('i') != course))
            # Secondary key on the course id: rows with exactly equal scores
            # otherwise come back in an arbitrary order that can change
            # between runs.
            .sort(F.col('similar_cos').desc(), F.col('i').asc())
            .select('i')
            .limit(10)
            .collect()
           )

    # Iterate over the rows actually returned; indexing range(10) raises
    # IndexError whenever fewer than ten candidates exist.
    result_list = [row[0] for row in temp]
    ans[str(course)] = result_list

    # Progress marker: one line per processed course.
    print(course)

23126
21617
16627
11556
16704
13702


In [44]:
ans

{'23126': [14760, 13665, 13782, 11978, 25782, 3819, 26864, 14380, 3919, 6206],
 '21617': [21609,
  21616,
  22298,
  21608,
  21081,
  19417,
  21673,
  21630,
  21628,
  21623],
 '16627': [11431, 17964, 11575, 12247, 10738, 13021, 17961, 5558, 12660, 5687],
 '11556': [16488, 5750, 16929, 11554, 18005, 7833, 7121, 23357, 3660, 8098],
 '16704': [1247, 1236, 1365, 1164, 1273, 20288, 1233, 18331, 8186, 8203],
 '13702': [864, 21079, 8313, 28074, 1111, 13057, 1033, 21025, 8123, 1217]}

In [43]:
ans

{'23126': [14760, 13665, 13782, 11978, 25782, 3819, 26864, 14380, 3919, 6206],
 '21617': [21609,
  21616,
  22298,
  21608,
  21081,
  19417,
  21673,
  21630,
  21628,
  21623],
 '16627': [11431, 17964, 11575, 12247, 10738, 13021, 17961, 5558, 12660, 5687],
 '11556': [16488, 5750, 16929, 11554, 18005, 7833, 7121, 23357, 8098, 3660],
 '16704': [1247, 1236, 1365, 1164, 20288, 1273, 1233, 18331, 8186, 8203],
 '13702': [864, 21079, 8313, 28074, 1111, 13057, 1033, 21025, 8123, 1217]}

In [44]:
import json

# Write the answer dictionary to lab02.json in the working directory.
with open("lab02.json", mode="w") as out_file:
    json.dump(ans, fp=out_file)

In [45]:
# Shut down the Spark session used by this notebook.
spark.stop()