In [87]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark 2 client, and
# request three executors for the session started from this notebook.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's bundled Python sources so `import pyspark` resolves:
# the py4j archive ends up first on sys.path, the pyspark directory second.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [88]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# The application name is set both on the SparkConf and on the builder.
app_name = "Sabanov Denis Spark Dataframe test lab02"

conf = SparkConf()
conf.set("spark.app.name", app_name)

# Reuse an already running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName(app_name)
    .getOrCreate()
)

In [89]:
# Reference courses to build recommendations for: [course id, language, title].
PERSONAL_COURSES = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [90]:
# Location of the course catalogue that is loaded with spark.read.json below.
# The file name suggests one JSON record per line -- TODO confirm.
coursesDataPath = "/labs/slaba02/DO_record_per_line.json"

In [91]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [92]:
# Explicit schema for the course catalogue: every field is read as a string,
# including `id`, which is why ids are compared as strings further down.
coursesSchema = StructType(fields=[
    StructField(column, StringType())
    for column in ("lang", "name", "cat", "provider", "id", "desc")
])

In [93]:
# Load the catalogue with the explicit schema instead of letting Spark infer one.
courseData = spark.read.schema(coursesSchema).json(coursesDataPath)

In [94]:
# Preview the first five raw records, one field per line, without truncation.
courseData.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [95]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, Normalizer

In [96]:
# Tokenizer stage: reads the course description and writes a token array column.
tokenizer = Tokenizer(
    inputCol="desc",
    outputCol="words_desc",
)

In [97]:
# Apply the tokenizer to the catalogue; the result has the extra `words_desc` column.
wordData = tokenizer.transform(courseData)

In [98]:
# Preview five tokenized records, one field per line, without truncation.
wordData.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [99]:
# Term-frequency stage: maps the token array to a vector with 10000 features.
hashingTF = HashingTF(
    inputCol="words_desc",
    outputCol="features",
    numFeatures=10000,
)

In [100]:
# Apply the hashing stage; the result has the extra `features` column.
featuredData = hashingTF.transform(wordData)

In [101]:
# Preview five records with their term-frequency vectors.
featuredData.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [102]:
# IDF estimator: reads the `features` column and writes `resFeatures`.
idf = IDF(
    inputCol="features",
    outputCol="resFeatures",
)

In [103]:
# Fit the IDF estimator on the term-frequency data of the whole catalogue.
idfModel = idf.fit(featuredData)

In [104]:
# Apply the fitted IDF model; the result has the extra `resFeatures` column.
rescaledData = idfModel.transform(featuredData)

In [105]:
# Normalizer stage: reads `resFeatures` and writes `normFeatures`.
# NOTE(review): no `p` is passed, so the library default norm is used; the
# later dot product is labelled `cosine_sim`, which presumes unit L2 norm --
# confirm the default is p=2.
normalizer = Normalizer(
    inputCol="resFeatures",
    outputCol="normFeatures",
)

In [106]:
# Normalise the IDF-weighted vectors and keep only the columns used for
# the recommendations.
normalizedData = (
    normalizer
    .transform(rescaledData)
    .select('lang', 'name', 'id', 'normFeatures')
)

In [107]:
# Preview five records with their normalised feature vectors.
normalizedData.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [108]:
# Ids of the reference courses as strings (the first element of each entry),
# ready to be compared with the string-typed `id` column.
personalCoursesIdList = [str(course[0]) for course in PERSONAL_COURSES]

In [109]:
# Display the id list as the cell output for a quick visual check.
personalCoursesIdList

['23126', '21617', '16627', '11556', '16704', '13702']

In [110]:
import pyspark.sql.functions as f

In [144]:
def _dot(left, right):
    """Return the dot product of two vectors as a plain Python float.

    The float() conversion matches the DoubleType declared for the UDF.
    """
    return float(left.dot(right))


# Column-level dot product; it is applied to the normalised feature vectors
# to score similarity between courses.
udfDot = f.udf(_dot, DoubleType())

In [145]:
# Accumulator for the result: reference course id -> list of recommended ids.
recomendDict = dict()

In [153]:
# For every reference course, collect the ids of the ten most similar other
# courses written in the same language.
for persId in personalCoursesIdList:
    # Row(s) of the reference course, with columns renamed so they do not
    # clash with the catalogue columns after the join.
    personalCoursesDf = normalizedData.filter(f.col("id") == persId).select(
        f.col("id").alias("pers_id"),
        "lang",
        f.col("normFeatures").alias("pers_features"),
    )

    # Join by column name. personalCoursesDf is derived from normalizedData,
    # so `normalizedData.lang == personalCoursesDf.lang` compares a column
    # with itself and only works through Spark's self-join auto-resolution.
    joinDf = normalizedData.join(personalCoursesDf, on="lang").select(
        "name", "id", "normFeatures", "pers_id", "pers_features"
    )

    # Drop the reference course itself BEFORE ranking, so the limit always
    # applies to genuine candidates. Ties on similarity are broken by name,
    # then id, to keep the result deterministic.
    dfWithMetric = (
        joinDf
        .where(f.col("id") != f.col("pers_id"))
        .withColumn("cosine_sim", udfDot("normFeatures", "pers_features"))
        .orderBy(f.desc("cosine_sim"), "name", "id")
        .limit(10)
        .select("id", "pers_id")
    )

    recomendedList = dfWithMetric.select("id").collect()
    recomendedIdList = [row.id for row in recomendedList]
    recomendDict[persId] = recomendedIdList

In [154]:
# Display the collected recommendations (reference id -> recommended ids).
recomendDict

{'23126': ['13782',
  '13665',
  '24419',
  '20638',
  '2724',
  '25782',
  '2633',
  '2723',
  '13348',
  '15909'],
 '21617': ['21609',
  '21608',
  '21616',
  '21492',
  '21624',
  '21623',
  '21630',
  '21628',
  '21508',
  '21857'],
 '16627': ['11431',
  '12247',
  '13021',
  '25010',
  '11575',
  '5687',
  '9598',
  '5372',
  '12863',
  '16769'],
 '11556': ['16488',
  '13461',
  '468',
  '10447',
  '387',
  '22710',
  '9289',
  '5936',
  '23357',
  '7833'],
 '16704': ['1365',
  '20645',
  '1426',
  '20105',
  '8217',
  '1236',
  '1164',
  '1219',
  '8123',
  '8207'],
 '13702': ['864',
  '1216',
  '7173',
  '8313',
  '1052',
  '17017',
  '19613',
  '21017',
  '17015',
  '8082']}

In [156]:
import json

In [157]:
# Persist the recommendations as JSON; the handle is closed by the context manager.
with open("/data/home/denis.sabanov/lab02.json", mode='w') as out_file:
    json.dump(recomendDict, out_file)

In [158]:
# Stop the Spark session once the results have been written.
spark.stop()