In [1]:
import os
import sys

# Environment variables consumed by PySpark: the Python interpreter to use,
# the Spark installation directory, and the arguments passed to spark-submit.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put the pyspark sources and the py4j archive shipped with this Spark
# installation at the front of the import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell bootstrap script in the notebook namespace.
# The file is read inside a `with` block so the handle is closed right away,
# and compiled with its real path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark import Row

import re
import json

# Obtain the SparkSession for this lab under the application name 'Lab02'.
# NOTE(review): the bootstrap output above already reports a session as
# available, so getOrCreate() presumably returns that one - confirm.
conf = SparkConf()
session_builder = SparkSession.builder.config(conf=conf).appName('Lab02')
spark = session_builder.getOrCreate()

In [3]:
# Display the active SparkSession object as the cell output.
spark

In [4]:
def clear_string(series):
    """Split every string of a pandas Series into word tokens.

    A token is a run of at least two consecutive word characters (letters,
    digits or underscore, Unicode-aware); shorter runs and all other
    characters are dropped.

    Parameters
    ----------
    series : pandas.Series
        Series of strings to tokenize.

    Returns
    -------
    pandas.Series
        Series aligned with the input whose values are lists of tokens.
    """
    # Raw string: a plain literal with "\w" is an invalid escape sequence.
    # "\w" already covers digits, so the former "[\w\d]" class is just "\w".
    token_pattern = re.compile(r'\w{2,}', re.UNICODE)
    return series.str.findall(token_pattern)

# Wrap clear_string as a pandas UDF whose declared result is an array of strings.
tokenizer_udf = F.pandas_udf(f=clear_string, returnType=ArrayType(StringType()))

In [20]:
# Load the course catalogue from the JSON file into a DataFrame.
courses_path = '/labs/slaba02/DO_record_per_line.json'
df = spark.read.json(courses_path)

In [21]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [22]:
# Preview the first five rows of the course catalogue.
df.show(n=5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [23]:
# Tokenize the course descriptions.
# Each intermediate frame gets its own name instead of rebinding `df`: once
# `df` carries the "remove" column, StopWordsRemover rejects it on a second
# run (output column already exists), so the original cell could not be
# re-executed.
tokenized = df.withColumn("token", tokenizer_udf(F.col("desc")))

# Remove stop words (StopWordsRemover with its default word list).
remover = StopWordsRemover(inputCol="token", outputCol="remove")
without_stop_words = remover.transform(tokenized)

# TF: hash the remaining tokens into a 10000-dimensional term-frequency vector.
hashingTF = HashingTF(
    inputCol="remove",
    outputCol="tf",
    numFeatures=10000
)
tf = hashingTF.transform(without_stop_words)

# TF-IDF: fit the IDF weights on the whole corpus, then rescale the TF vectors.
idf = IDF(inputCol="tf", outputCol="tfidf").fit(tf)
tfidf = idf.transform(tf)
tfidf = tfidf.drop('token','remove','tf')

# Normalize the TF-IDF vectors (Normalizer with its default norm) and keep
# only the columns needed for the pairwise comparison.
normalizer = Normalizer(inputCol="tfidf", outputCol="norm")
data = normalizer.transform(tfidf)
data = data.drop('cat','desc','provider', 'tfidf')

# Cosine similarity: dot product of the normalized vectors of every pair of
# courses. The `i.id < j.id` condition keeps each unordered pair once and
# excludes a course paired with itself.
dot_udf = F.udf(lambda x,y: float(x.dot(y)), DoubleType())
result_data = (
    data.alias("i")
    .join(data.alias("j"), F.col("i.id") < F.col("j.id"))
    .select(
        F.col("i.id").alias("i"),
        F.col("j.id").alias("j"),
        F.col("i.lang").alias("i_lang"),
        F.col("j.lang").alias("j_lang"),
        F.col("i.name").alias("i_name"),
        F.col("j.name").alias("j_name"),
        dot_udf("i.norm", "j.norm").alias("cos"))
)

In [24]:
# Preview the first five course pairs with their cosine similarity.
result_data.show(n=5)

+---+---+------+------+--------------------+--------------------+--------------------+
|  i|  j|i_lang|j_lang|              i_name|              j_name|                 cos|
+---+---+------+------+--------------------+--------------------+--------------------+
|  4|  5|    en|    en|Accounting Cycle:...|American Counter ...|0.004165624020100...|
|  4|  6|    en|    fr|Accounting Cycle:...|Arithmétique: en ...|0.011528231112801742|
|  4|  7|    en|    en|Accounting Cycle:...|Becoming a Dynami...|0.027471109713127377|
|  4|  8|    en|    en|Accounting Cycle:...|           Bioethics|0.027650154233236635|
|  4|  9|    en|    en|Accounting Cycle:...|College Foundatio...| 0.03125132417217193|
+---+---+------+------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [25]:
# Drop the notebook-level references to the intermediate DataFrames.
del df, tf, tfidf, data

In [30]:
# Courses that need recommendations, as [id, language, name] records.
courses_to_rec = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# Ids of those courses, in the same order as the records above.
id_list = [course_id for course_id, _lang, _name in courses_to_rec]

In [31]:
# Keep only the pairs that contain one of the target courses and whose two
# courses are in the same language.
involves_target = F.col('i').isin(id_list) | F.col('j').isin(id_list)
same_language = F.col('i_lang') == F.col('j_lang')
result_df = result_data.filter(involves_target & same_language)

In [32]:
# Drop the notebook-level name for the unfiltered pair table; only result_df is used from here on.
del result_data

In [33]:
# Preview the first five filtered pairs.
result_df.show(n=5)

+---+-----+------+------+--------------------+--------------------+--------------------+
|  i|    j|i_lang|j_lang|              i_name|              j_name|                 cos|
+---+-----+------+------+--------------------+--------------------+--------------------+
|  4|21617|    en|    en|Accounting Cycle:...|Preparing for the...| 0.04708163917463968|
|  4|23126|    en|    en|Accounting Cycle:...|Compass - powerfu...|0.011433001452314673|
|  5|21617|    en|    en|American Counter ...|Preparing for the...|0.024248589506925412|
|  5|23126|    en|    en|American Counter ...|Compass - powerfu...|0.006758205222059656|
|  7|21617|    en|    en|Becoming a Dynami...|Preparing for the...|0.015767482428724364|
+---+-----+------+------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [35]:
# Collect the recommendations: for every target course, the ids of the
# courses from its most similar pairs, ordered by descending cosine similarity.

top_n = 10  # number of recommendations kept per course
result = {}

for course in id_list:

    print(course)

    # Collect exactly once per course. Every collect() re-runs the whole Spark
    # job, and collecting the `i` and `j` columns in two separate runs could
    # return differently ordered rows when similarities tie.
    top_pairs = (
        result_df
        .filter((F.col('i') == course) | (F.col('j') == course))
        .sort(F.col('cos').desc())
        .limit(top_n)
        .collect()
    )

    # In each pair take the id that is not the target course itself.
    # Iterating over the collected rows (instead of a fixed range) also works
    # when fewer than top_n pairs exist for a course.
    result[str(course)] = [
        int(row.j) if row.i == course else int(row.i)
        for row in top_pairs
    ]

23126
21617
16627
11556
16704
13702


In [36]:
# Display the recommendation dictionary (course id as string -> list of recommended ids).
result

{'23126': [14760, 13665, 13782, 11978, 25782, 3819, 26864, 14380, 3919, 6206],
 '21617': [21609,
  21616,
  22298,
  21608,
  21081,
  19417,
  21673,
  21628,
  21630,
  21623],
 '16627': [11431, 17964, 11575, 12247, 10738, 13021, 17961, 5558, 12660, 5687],
 '11556': [16488, 5750, 16929, 11554, 18005, 7833, 7121, 23357, 8098, 3660],
 '16704': [1247, 1236, 1365, 1164, 1273, 20288, 1233, 18331, 8186, 8203],
 '13702': [864, 21079, 8313, 28074, 1111, 13057, 1033, 21025, 8123, 1217]}

In [39]:
# Serialize the recommendations to lab02.json in the working directory.
with open("lab02.json", "w") as out_file:
    out_file.write(json.dumps(result))

In [38]:
# Stop the Spark session now that the results have been produced.
spark.stop()