In [1]:
import os
import sys

os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("rez_lab02")
         .getOrCreate())

In [3]:
rdd = spark.read.json('/labs/slaba02/DO_record_per_line.json')
rdd.show(10)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [4]:
to_make_recommends = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]
print(to_make_recommends)

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'], [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'], [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], [16704, 'ru', 'Программирование на Lazarus'], [13702, 'ru', 'Математическая экономика']]


In [5]:
from pyspark.sql.functions import udf, max as max_, col, rank, lit
from pyspark.sql.types import IntegerType, DoubleType, ArrayType, StringType
from pyspark.sql.window import Window
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import Vectors, VectorUDT
import re

In [6]:
def get_tokens(s_):
    regex = re.compile(u'[\w\d]{2,}', re.U)
    return regex.findall(s_.lower())
get_tokens_ = udf(lambda x: get_tokens(x), ArrayType(StringType()))

def cos_(x, y):
    if(x == None or y == None):
        return np.nan
    else:
        if float(x.norm(2)*y.norm(2))!=0:
            return float(x.dot(y)/(x.norm(2)*y.norm(2)))
        else:
            return float(-1)
cosinus_ = udf(lambda x,y: cos_(x, y), DoubleType())

In [7]:
result_dict = {}
len_ = len(to_make_recommends)
for i in range(0, len_):
    cur_course = to_make_recommends[i]
    df_ = rdd\
    .filter(col('lang')==cur_course[1])

    t_ = df_\
    .withColumn('words',get_tokens_(col('desc')))
    
    ht = HashingTF(inputCol='words', outputCol='vector_tf', numFeatures=10000)
    tf_ = ht.transform(t_)

    idf_m = IDF(inputCol='vector_tf', outputCol='vector_')  
    idf_r = idf_m.fit(tf_ ) 
    idf_ = idf_r.transform(tf_)
    
    v_ = idf_\
    .filter(col('id')==cur_course[0])\
    .withColumnRenamed('id','id_base')\
    .withColumnRenamed('vector_','vector_base')\
    .select('id_base','vector_base')

    tv_ = idf_\
    .join(v_,col('id')!=col('id_base'))\
    .withColumn('cos', cosinus_(col('vector_base'), col('vector_')))\
    .select('id', 'id_base', 'name', 'cos')

    window = Window.orderBy(tv_['cos'].desc(), tv_['name'])
    t_ = tv_.select('id', 'cos', rank().over(window).alias('rank')).filter(col('rank') <= 10).collect()
    l_ = [int(row[0]) for row in t_]
    result_dict[str(cur_course[0])] = l_
    print(i)

0
1
2
3
4
5


In [8]:
print(result_dict)

{'23126': [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348], '21617': [21609, 21616, 22298, 21608, 21630, 21628, 21508, 21623, 21081, 19417], '16627': [11431, 17961, 17964, 5687, 12247, 16694, 5558, 12660, 11575, 9563], '11556': [10384, 16488, 468, 22710, 13461, 21707, 19330, 23357, 10447, 9465], '16704': [1219, 1327, 20362, 1228, 26980, 55, 1236, 1247, 1365, 913, 20095], '13702': [864, 21079, 1111, 792, 1410, 8123, 1041, 1033, 8313, 1396]}


In [9]:
import json
json_result = json.dumps(result_dict)
print(json_result)

{"23126": [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348], "21617": [21609, 21616, 22298, 21608, 21630, 21628, 21508, 21623, 21081, 19417], "16627": [11431, 17961, 17964, 5687, 12247, 16694, 5558, 12660, 11575, 9563], "11556": [10384, 16488, 468, 22710, 13461, 21707, 19330, 23357, 10447, 9465], "16704": [1219, 1327, 20362, 1228, 26980, 55, 1236, 1247, 1365, 913, 20095], "13702": [864, 21079, 1111, 792, 1410, 8123, 1041, 1033, 8313, 1396]}


In [10]:
with open ("lab02.json","w") as json_res_: json_res_.write(json_result)