## Лаба 2. Content-based рекомендательная система образовательных курсов – Spark DataFrames

In [1]:
import os
import sys
# Bootstrap PySpark inside this kernel: configure the environment for the
# cluster's Spark installation, make its Python bindings importable, then run
# Spark's interactive shell script in this namespace.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Each insert goes to the front of sys.path, so the py4j archive ends up first.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Read the shell script through a context manager so the file handle is closed
# deterministically, and compile it with its real path so that tracebacks
# raised while it runs point at shell.py instead of "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [16]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower
from pyspark.sql.types import FloatType, ArrayType, StringType,DoubleType
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower, pandas_udf, row_number
from pyspark.sql.functions import pandas_udf
from pyspark.sql.window import Window
import json
import re

In [17]:
# Courses of interest, one row each: [course id, language code, course title].
courses = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]
courses

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [18]:
# Load the course catalogue from JSON into a DataFrame
# (presumably one record per line, as the file name suggests).
data = spark.read.format("json").load("/labs/slaba02/DO_record_per_line.json")

In [19]:
# Peek at a single record, untruncated, printed one field per line.
data.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
def clear_string(series):
    """Lower-case every string in a pandas Series and split it into tokens.

    A token is a run of at least two Unicode word characters (letters,
    digits or underscore); everything else acts as a separator and
    single-character runs are dropped.

    Args:
        series: pandas Series of strings (accessed through ``.str``).

    Returns:
        pandas Series where each element is the list of tokens found in the
        corresponding input string, in order of appearance.
    """
    # Raw string: '\w' and '\d' are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape-sequence warning. The resulting
    # pattern text is unchanged.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    words = series.str.lower().str.findall(regex)
    return words

# Pandas UDF wrapper around clear_string, declared to return array<string>.
# NOTE(review): not referenced by any other visible cell (the pipeline uses
# Tokenizer instead) - confirm whether this was meant to be used.
tokenizer_udf = pandas_udf(f=clear_string, returnType=ArrayType(StringType()))

In [21]:
# Feature pipeline over the `desc` column:
#   tokens -> stop-word removal -> binary hashed term vector -> IDF -> normalisation
tokenizer = Tokenizer(inputCol="desc", outputCol="word")

# Default stop-word lists concatenated in the order: English, Russian, Spanish.
stop_words = [
    word
    for language in ('english', 'russian', 'spanish')
    for word in StopWordsRemover.loadDefaultStopWords(language)
]

swr = StopWordsRemover(inputCol="word", outputCol="words_filtered", stopWords=stop_words)

# binary=True: a term contributes 1 however often it occurs in a description.
hasher = HashingTF(numFeatures=10000, binary=True, inputCol="words_filtered", outputCol="word_vector")

idf = IDF(inputCol="word_vector", outputCol="feature")

# Normalizer is used with its default settings.
normalizer = Normalizer(inputCol="feature", outputCol="normalized_vector")

pipeline = Pipeline(stages=[tokenizer, swr, hasher, idf, normalizer])

# Fit on the full catalogue, then add the feature columns to the same data.
pipeline_model = pipeline.fit(data)
data_feature = pipeline_model.transform(data)
data_feature.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|                word|      words_filtered|         word_vector|             feature|   normalized_vector|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|[course, introduc...|(10000,[36,42,63,...|(10000,[36,42,63,...|(10000,[36,42,63,...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|[online, course, ...|(10000,[32,222,29...|(10000,[32,222,29...|(10000,[32,222,29...|
|5/computer_scienc...|This cou

In [22]:
# Rows of the featurised catalogue that belong to the courses of interest.
# The ids come from the `courses` list defined earlier (first element of each
# row) instead of being repeated as string literals, so the two cannot drift
# apart and the `id` column is compared against integers directly rather than
# through an implicit string cast.
my_course_ids = [course[0] for course in courses]
dataset_my_courses = data_feature.filter(data_feature.id.isin(my_course_ids))

In [23]:
# Candidate pool: every course whose id is NOT among the selected courses
# (left anti-join on `id`).
dataset_courses = data_feature.join(
    other=dataset_my_courses,
    on="id",
    how="leftanti",
)

In [24]:
# Dot product of two ML vectors, returned as a Python float (Spark double).
dot_udf = udf(lambda x,y: float(x.dot(y)), DoubleType())

# Pair every selected course ("i") with every candidate course ("j") in the
# same language and score the pair by the dot product of their normalised
# vectors. The selected-courses side holds only a handful of rows, so it is
# marked for broadcast: joining on the low-cardinality `lang` key would
# otherwise shuffle the whole catalogue into a few heavily skewed partitions.
result = (
    broadcast(dataset_my_courses).alias("i")
    .join(dataset_courses.alias("j"), col("i.lang") == col("j.lang"))
    .select(
        col("i.id").alias("id"),
        col("j.id").alias("recomended_id"),
        col("j.name").alias("name"),
        dot_udf("i.normalized_vector", "j.normalized_vector").alias("dot"),
    )
    # Best score first per source course; name, then candidate id, break ties.
    .sort(col("id"), col("dot").desc(), col("name"), col("recomended_id"))
)

result.show(10)

+-----+-------------+--------------------+-------------------+
|   id|recomended_id|                name|                dot|
+-----+-------------+--------------------+-------------------+
|11556|        12679|Educación para el...|0.18277959626804818|
|11556|        22710|Aplicaciones crea...|0.17304014404349366|
|11556|        16488|Aprendizaje basad...| 0.1616972976518471|
|11556|        17910|Desarrollo de ide...|0.13748244594526043|
|11556|          468|Tecnologías de in...| 0.1300013405332623|
|11556|          387|Matemáticas y Mov...|0.11516030556074294|
|11556|        19394|herramientas gest...|0.11420282232417328|
|11556|        18005|Fundamentos del M...|0.11384957033175985|
|11556|          272|Desarrollo rápido...|0.11265771324278726|
|11556|        12884|Liderazgo en gest...|0.10708631464665708|
+-----+-------------+--------------------+-------------------+
only showing top 10 rows



In [25]:
# Rank the candidates within each source course (best score first, ties broken
# by name and then candidate id) and keep only ranks 1..10.
ranking_order = [col("id"), col("dot").desc(), col("name"), col("recomended_id")]
windowSpec = Window.partitionBy("id").orderBy(*ranking_order)
result = (
    result
    .withColumn("row_number", row_number().over(windowSpec))
    .where(col("row_number") <= 10)
)

In [26]:
# Bring the (source course, recommended course) pairs to the driver.
# Sort explicitly by source id and rank first: without it the order of the
# collected rows is only an artefact of how the window stage happened to lay
# out its partitions, and the per-course lists built from these rows must be
# in rank order.
result_driver = (
    result
    .orderBy("id", "row_number")
    .select("id", "recomended_id")
    .collect()
)

In [27]:
# Group the collected rows into {source course id: [recommended ids...]},
# keeping the recommendations in the order the rows arrive.
result_json = {}
for row in result_driver:
    result_json.setdefault(row.id, []).append(row.recomended_id)
result_json

{23126: [25782, 23718, 7222, 14760, 23822, 11528, 24373, 5550, 13665, 25468],
 16627: [11431, 12247, 16694, 5356, 9563, 5680, 5687, 23506, 17964, 23369],
 13702: [864, 21079, 13057, 1041, 1033, 915, 1217, 1216, 1173, 21025],
 16704: [1236, 8186, 1164, 1365, 875, 8207, 8154, 1376, 1219, 20645],
 11556: [12679, 22710, 16488, 17910, 468, 387, 19394, 18005, 272, 12884],
 21617: [21609, 21608, 21616, 21492, 21700, 21716, 21703, 21706, 21587, 21618]}

In [None]:
import json

# Serialise the recommendations mapping to lab02.json in the working directory.
with open('lab02.json', mode='w', encoding='utf-8') as out_file:
    json.dump(result_json, out_file)

In [28]:
# Stop the Spark session now that the results have been collected and saved.
spark.stop()