In [1]:
import os
import sys

# Configure the PySpark launch environment before any Spark module is put on sys.path.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the bundled pyspark package and its py4j dependency importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run Spark's interactive shell bootstrap in this namespace (it defines `spark`).
# The script is read inside a `with` block so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Default Spark configuration; nothing is overridden here.
conf = SparkConf()

# Obtain the session through the builder (application name "test").
spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [3]:
# Target courses as [id, language, title] rows; recommendations are built for each of them.
given_courses = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    # \u2014 is an em dash
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', 'Программирование на Lazarus'],
    [13702, 'ru', 'Математическая экономика'],
]

In [4]:
import numpy as np

# First column (course ids) of the course table.
# NOTE: the rows mix ints and strings, so NumPy stores everything as text and
# the ids come back as strings, not integers.
np.array(given_courses).T[0]

array(['23126', '21617', '16627', '11556', '16704', '13702'], dtype='<U78')

In [5]:
# Display the target course list (rendered as the cell's last expression).
given_courses

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [6]:
# Load the course catalogue from the JSON file into a DataFrame.
data = spark.read.json(path='/labs/slaba02/DO_record_per_line.json')

In [7]:
# Inspect a single record, untruncated, one field per line.
data.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [35]:
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, ArrayType, StringType
from pyspark.ml.linalg import VectorUDT
from pyspark.sql import functions as f
from pyspark.ml.feature import Tokenizer, HashingTF, CountVectorizer, IDF
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower, pandas_udf
import json
import re
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

In [11]:
# TF-IDF featurization, following the Spark docs:
# https://spark.apache.org/docs/latest/ml-features#tf-idf

# Stage definitions: description text -> tokens -> hashed term counts -> IDF-weighted vector.
tokenizer = Tokenizer(outputCol="words", inputCol="desc")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="rawFeatures")
idf = IDF(outputCol="features", inputCol="rawFeatures")

# Apply the stages in order; the IDF weights are fitted on the whole catalogue.
wordsData = tokenizer.transform(data)
featurizedData = hashingTF.transform(wordsData)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)


In [13]:
# Compare the IDF-weighted vector with the raw term-frequency vector for one record.
rescaledData.select('features', 'rawFeatures').show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Выбираю только те курсы, для которых нужно построить рекомендации

In [14]:
# Keep only the target courses. The ids are taken from given_courses so the list
# is defined in one place, and they are passed as integers instead of a
# hand-copied list of strings.
my_data = rescaledData.filter(col('id').isin([course[0] for course in given_courses]))

In [15]:
# Sanity check: the filtered frame should contain only the target courses.
my_data.show()

+--------------------+--------------------+-----+----+--------------------+--------+--------------------+--------------------+--------------------+
|                 cat|                desc|   id|lang|                name|provider|               words|         rawFeatures|            features|
+--------------------+--------------------+-----+----+--------------------+--------+--------------------+--------------------+--------------------+
|                    | La transformació...|11556|  es|Aprendizaje Colab...|   Udemy|[, la, transforma...|(10000,[249,522,5...|(10000,[249,522,5...|
|6/economics_finan...|Математическая эк...|13702|  ru|Математическая эк...|  Intuit|[математическая, ...|(10000,[310,763,9...|(10000,[310,763,9...|
|                    | Hazte más emplea...|16627|  es|Aprende Excel: Ni...|   Udemy|[, hazte, más, em...|(10000,[30,145,19...|(10000,[30,145,19...|
|5/computer_scienc...|В курсе рассматри...|16704|  ru|Программирование ...|  Intuit|[в, курсе, рассма...|(10000,

Переименовываю колонки в общем датасете, чтобы после соединения они не совпадали по имени с колонками выбранных курсов

In [16]:
# Give the catalogue-side columns distinct names ('id_all', 'features_all').
rescaledData = (
    rescaledData
    .withColumnRenamed('id', 'id_all')
    .withColumnRenamed('features', 'features_all')
)

In [17]:
# Keep only the candidate id, its TF-IDF vector and the language.
rescaledData = rescaledData.select(['id_all', 'features_all', 'lang'])

In [18]:
# Preview the trimmed catalogue frame (id_all, features_all, lang).
rescaledData.show()

+------+--------------------+----+
|id_all|        features_all|lang|
+------+--------------------+----+
|     4|(10000,[36,42,63,...|  en|
|     5|(10000,[32,222,29...|  en|
|     6|(10000,[30,41,246...|  fr|
|     7|(10000,[493,572,7...|  en|
|     8|(10000,[32,65,115...|  en|
|     9|(10000,[56,91,268...|  en|
|    10|(10000,[1045,1263...|  en|
|    11|(10000,[87,157,57...|  en|
|    12|(10000,[161,164,4...|  en|
|    13|(10000,[26,1072,1...|  en|
|    14|(10000,[63,145,23...|  en|
|    15|(10000,[32,65,77,...|  en|
|    16|(10000,[32,273,30...|  en|
|    17|(10000,[695,1420,...|  en|
|    18|(10000,[307,316,3...|  en|
|    19|(10000,[572,768,8...|  en|
|    20|(10000,[91,273,31...|  en|
|    21|(10000,[148,157,1...|  en|
|    22|(10000,[128,177,2...|  en|
|    23|(10000,[91,332,52...|  en|
+------+--------------------+----+
only showing top 20 rows



Соединяю курсы со всем датасетом по языку

In [19]:
# Pair every target course with every catalogue course in the same language (left join on 'lang').
joined_data = my_data.join(rescaledData, 'lang', 'left')

In [20]:
# Drop the pairs where a course is matched with itself.
joined_data = joined_data.where(col('id') != col('id_all'))

In [62]:
# Preview the first five (target course, candidate course) pairs.
joined_data.show(5)

+----+------------------+--------------------+-----+--------------------+--------+--------------------+--------------------+--------------------+------+--------------------+
|lang|               cat|                desc|   id|                name|provider|               words|         rawFeatures|            features|id_all|        features_all|
+----+------------------+--------------------+-----+--------------------+--------+--------------------+--------------------+--------------------+------+--------------------+
|  en|5/computer_science|An introduction t...|21617|Preparing for the...|     edX|[an, introduction...|(10000,[213,360,4...|(10000,[213,360,4...|     4|(10000,[36,42,63,...|
|  en|5/computer_science|An introduction t...|21617|Preparing for the...|     edX|[an, introduction...|(10000,[213,360,4...|(10000,[213,360,4...|     5|(10000,[32,222,29...|
|  en|5/computer_science|An introduction t...|21617|Preparing for the...|     edX|[an, introduction...|(10000,[213,360,4...|(10000

Применяю функцию косинусной близости

In [29]:
def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must provide ``dot(other)`` and ``norm(p)``; the L2 norm
    (``norm(2)``) is used.

    Returns 0.0 when either vector has zero norm, so there is no division by
    zero. The zero case deliberately returns the float ``0.0`` rather than the
    int ``0``: this function is wrapped as a UDF declared with DoubleType, and
    PySpark does not convert a Python int into a double column (the value
    comes back as null).
    """
    norm_product = float(v1.norm(2) * v2.norm(2))
    if norm_product == 0.0:
        return 0.0
    return float(v1.dot(v2)) / norm_product

# Wrap the Python function as a UDF declared to return DoubleType.
# Note: this rebinds the name `cos_sim` from the plain function to the UDF.
cos_sim = udf(cos_sim, DoubleType())

In [32]:
# Keep the pair ids and vectors, then append the similarity of each pair as 'cos_sim'.
result_data = (
    joined_data
    .select('id', 'id_all', 'features', 'features_all')
    .withColumn('cos_sim', cos_sim('features', 'features_all'))
)

In [33]:
# Preview the first five pairs with their cosine similarity.
result_data.show(5)

+-----+------+--------------------+--------------------+--------------------+
|   id|id_all|            features|        features_all|             cos_sim|
+-----+------+--------------------+--------------------+--------------------+
|21617|     4|(10000,[213,360,4...|(10000,[36,42,63,...| 0.07806882371748854|
|21617|     5|(10000,[213,360,4...|(10000,[32,222,29...| 0.04676028409292413|
|21617|     7|(10000,[213,360,4...|(10000,[493,572,7...|0.036594215419073484|
|21617|     8|(10000,[213,360,4...|(10000,[32,65,115...|   0.069150176490545|
|21617|     9|(10000,[213,360,4...|(10000,[56,91,268...| 0.11014821826251854|
+-----+------+--------------------+--------------------+--------------------+
only showing top 5 rows



Нумерую курсы по косинусной близости

In [48]:
# Rank the candidates of each target course by descending similarity (1 = most similar).
# Candidates with equal cos_sim have no tie-breaker in the ordering.
result_data = result_data.select(
    'id',
    'id_all',
    'features',
    'features_all',
    'cos_sim',
    F.row_number()
    .over(Window.partitionBy('id').orderBy(F.desc('cos_sim')))
    .alias('row_number'),
)

In [50]:
# Preview the ranked pairs (default 20 rows).
result_data.show() 

+-----+------+--------------------+--------------------+-------------------+----------+
|   id|id_all|            features|        features_all|            cos_sim|row_number|
+-----+------+--------------------+--------------------+-------------------+----------+
|23126| 13782|(10000,[87,91,96,...|(10000,[1263,1470...| 0.4615945735430714|         1|
|23126| 13665|(10000,[87,91,96,...|(10000,[51,93,128...| 0.4472198354497587|         2|
|23126| 24419|(10000,[87,91,96,...|(10000,[1,50,77,8...|0.42547827554646006|         3|
|23126| 20638|(10000,[87,91,96,...|(10000,[34,3775,4...| 0.4137012231575676|         4|
|23126|  2724|(10000,[87,91,96,...|(10000,[26,173,36...| 0.3740908853197787|         5|
|23126| 25782|(10000,[87,91,96,...|(10000,[15,24,91,...| 0.2915507168229196|         6|
|23126|  2633|(10000,[87,91,96,...|(10000,[246,376,4...|0.26427746430872295|         7|
|23126|  2723|(10000,[87,91,96,...|(10000,[246,427,4...| 0.2623679510012487|         8|
|23126| 13348|(10000,[87,91,96,.

Оставляю только топ-10 рекомендованных курсов

In [51]:
# Preview the ten best-ranked candidates per target course (up to 100 rows shown).
result_data.where(col('row_number') <= 10).show(n=100)

+-----+------+--------------------+--------------------+-------------------+----------+
|   id|id_all|            features|        features_all|            cos_sim|row_number|
+-----+------+--------------------+--------------------+-------------------+----------+
|23126| 13782|(10000,[87,91,96,...|(10000,[1263,1470...| 0.4615945735430714|         1|
|23126| 13665|(10000,[87,91,96,...|(10000,[51,93,128...| 0.4472198354497587|         2|
|23126| 24419|(10000,[87,91,96,...|(10000,[1,50,77,8...|0.42547827554646006|         3|
|23126| 20638|(10000,[87,91,96,...|(10000,[34,3775,4...| 0.4137012231575676|         4|
|23126|  2724|(10000,[87,91,96,...|(10000,[26,173,36...| 0.3740908853197787|         5|
|23126| 25782|(10000,[87,91,96,...|(10000,[15,24,91,...| 0.2915507168229196|         6|
|23126|  2633|(10000,[87,91,96,...|(10000,[246,376,4...|0.26427746430872295|         7|
|23126|  2723|(10000,[87,91,96,...|(10000,[246,427,4...| 0.2623679510012487|         8|
|23126| 13348|(10000,[87,91,96,.

In [52]:
# Keep only ranks 1..10 for every target course.
result_data = result_data.where(F.col('row_number') <= 10)

In [63]:
# Build {course id as string: [recommended course ids, best first]}.
# Keys follow the order of given_courses and are strings, as JSON object keys are.
res = {str(course[0]): [] for course in given_courses}

# Collect the ranked rows once instead of running a separate filter + collect job
# per course (each such job re-evaluates the whole join / UDF / window plan).
# The explicit sort makes the order inside every list follow row_number.
top_rows = (
    result_data
    .select('id', 'id_all', 'row_number')
    .orderBy('id', 'row_number')
    .collect()
)

for row in top_rows:
    course_id = str(row['id'])
    if course_id in res:
        res[course_id].append(row['id_all'])

In [60]:
# Display the final mapping: target course id -> list of recommended course ids.
res

{'23126': [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909],
 '21617': [21609,
  21608,
  21616,
  21492,
  21624,
  21623,
  21630,
  21628,
  21508,
  21857],
 '16627': [11431, 12247, 13021, 25010, 11575, 5687, 9598, 5372, 12863, 16769],
 '11556': [16488, 13461, 468, 10447, 387, 22710, 9289, 5936, 23357, 7833],
 '16704': [1365, 20645, 1426, 20105, 8217, 1236, 1164, 1219, 8123, 875],
 '13702': [864, 1216, 7173, 8313, 1052, 17017, 19613, 21017, 17015, 8082]}

In [61]:
# Serialize the recommendations to lab02.json in the working directory.
serialized = json.dumps(res)
with open("lab02.json", "w") as outfile:
    outfile.write(serialized)

In [64]:
# Shut down the Spark session once the results have been written.
spark.stop()