In [52]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [114]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark.conf.set("spark.sql.crossJoin.enabled", True) # for cartesian product usage

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())

In [77]:
import numpy as np
import re
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

In [54]:
# загрузка данных
df = spark.read.json('/labs/slaba02/DO_record_per_line.json')
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [76]:
# id курсов в моём варианте задачи
ids = [23126, 21617, 16627, 11556, 16704, 13702]
df2 = df.filter(df.id.isin(ids))
df2.show()

+--------------------+--------------------+-----+----+--------------------+--------+
|                 cat|                desc|   id|lang|                name|provider|
+--------------------+--------------------+-----+----+--------------------+--------+
|                    | La transformació...|11556|  es|Aprendizaje Colab...|   Udemy|
|6/economics_finan...|Математическая эк...|13702|  ru|Математическая эк...|  Intuit|
|                    | Hazte más emplea...|16627|  es|Aprende Excel: Ni...|   Udemy|
|5/computer_scienc...|В курсе рассматри...|16704|  ru|Программирование ...|  Intuit|
|  5/computer_science|An introduction t...|21617|  en|Preparing for the...|     edX|
|                    | Improve your SAS...|23126|  en|Compass - powerfu...|   Udemy|
+--------------------+--------------------+-----+----+--------------------+--------+



In [92]:
# мой вариант задачи
# насколько я понимаю, это поля id, lang и desc соответственно
courses_to_make_recommendations = \
[[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], \
 [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], \
 [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], \
 [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], \
 [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], \
 [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [79]:
# токенизация
def tokenization(string):
    regex = re.compile(u'[\w\d]{2,}', re.U)
    return regex.findall(string.lower())
tokenization = F.udf(tokenization, ArrayType(StringType()))

tokenized_df = df.withColumn('tokens', tokenization(df.desc))

In [80]:
# term frequency
hashingTF = HashingTF(inputCol = 'tokens', outputCol = 'features').setNumFeatures(10000)
featurized_df = hashingTF.transform(tokenized_df)

In [81]:
# inverse document frequency
idf = IDF(inputCol = 'features', outputCol = 'i_features')
idfModel = idf.fit(featurized_df)
rescaled_df = idfModel.transform(featurized_df)

In [103]:
# cosine similarity
@F.udf(returnType=DoubleType())
def cosine_similarity(v1, v2):
    try:
        p = 2
        return float(v1.dot(v2)) / float(v1.norm(p) * v2.norm(p))
    except:
        return 0

In [118]:
lab_02_result = {}
for course in courses_to_make_recommendations:
    courses_sl = rescaled_df[(rescaled_df.lang == course[1]) & (rescaled_df.id != course[0])] # courses in the same language
    course_features = rescaled_df[rescaled_df.id == course[0]][['i_features']] # matching features
    resulting_set = courses_sl.join(course_features.withColumnRenamed('i_features', 'i_features_2')) # cartesian product
    resulting_set = resulting_set.withColumn('cosine_smlr', cosine_similarity(resulting_set.i_features, resulting_set.i_features_2)) # computing cosine similarity
    resulting_set = resulting_set.orderBy(resulting_set.cosine_smlr.desc(), resulting_set.name, resulting_set.id).limit(10)[['id']].toPandas() # descending order by cosine similarity
    lab_02_result[course[0]] = list(tfidf_final['id'])
print (lab_02_result)

{23126: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111], 21617: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111], 16627: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111], 11556: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111], 16704: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111], 13702: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111]}


In [None]:
with open('lab01.json', 'w', encoding = 'utf8') as output:
    json.dump(lab_02_result, output)

In [51]:
spark.stop()