### Евгений Шенк

## Лабораторная работа №2. Content-based рекомендации

In [1]:
# Parameters: the target courses for which recommendations are produced.
# Each entry is [course id, language code, course name].
courses_to_recommend = [
    [23126, "en", "Compass - powerful SASS library that makes your life easier"],
    [21617, "en", "Preparing for the AP* Computer Science A Exam \u2014 Part 2"],
    [16627, "es", "Aprende Excel: Nivel Intermedio by Alfonso Rinsche"],
    [11556, "es", "Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo"],
    [16704, "ru", "\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus"],
    [13702, "ru", "\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430"],
]

In [2]:
import json
import os
import sys
import re
# Point PySpark at the Python interpreter and Spark installation to use.
# NOTE(review): these paths and the py4j version below are hard-coded for one
# particular cluster image; they must be adjusted when running elsewhere.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j packages shipped with Spark importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive-shell start-up script in this namespace; per its
# banner it makes a SparkSession available as `spark`. The script is read
# inside a `with` block so the file handle is closed instead of leaked.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


### Spark Session

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover

# Default Spark configuration; no settings are overridden here.
conf = SparkConf()

# getOrCreate() returns the already-running session when one exists,
# otherwise it builds a new one from this builder.
spark = SparkSession.builder.config(conf=conf).appName("ESShenk_spark_session").getOrCreate()

### Data

In [4]:
# List the lab directory on HDFS to confirm the input file is present.
!hdfs dfs -ls /labs/slaba02  # /DO_record_per_line.json

Found 1 items
-rw-r--r--   3 hdfs hdfs   69519728 2022-01-06 18:46 /labs/slaba02/DO_record_per_line.json


In [5]:
# Load the course data from HDFS into a DataFrame.
dfSetOfCourses = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [6]:
# Preview the first five rows to check the inferred columns.
dfSetOfCourses.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



### Преобразования

In [7]:
# Compiled once instead of on every call. A raw string is used because '\w'
# in a normal literal is an invalid escape sequence; '\w' already matches
# digits, so the original '[\w\d]' class is equivalent to plain '\w'.
_WORD_PATTERN = re.compile(r'\w{2,}', re.U)


def get_words(string):
    """Split text into lower-cased word tokens of at least two characters.

    Args:
        string: The text to tokenize, or None (e.g. a null column value).

    Returns:
        A list of tokens in order of appearance; an empty list when
        ``string`` is None or contains no token of two or more characters.
    """
    # Null values reach a Python UDF as None; return no tokens rather than
    # failing the whole job on None.lower().
    if string is None:
        return []
    return _WORD_PATTERN.findall(string.lower())

# Wrap get_words as a Spark UDF that returns an array of strings.
get_words_udf = F.udf(get_words, ArrayType(StringType()))

In [8]:
# Keep the original columns and append the tokenized description as "words".
source_columns = ["cat", "desc", "id", "lang", "name", "provider"]
documents = dfSetOfCourses.select(
    *source_columns,
    get_words_udf(F.col("desc")).alias("words"),
)
documents.show(3)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
only showing top 3 rows



#### TFIDF

In [9]:
# Build the combined stop-word list used to strip stop words from the text:
# Spark's default lists for English, Russian and Spanish, in that order.
eng_stopwords, rus_stopwords, spa_stopwords = (
    StopWordsRemover.loadDefaultStopWords(language)
    for language in ("english", "russian", "spanish")
)
stopWords = [*eng_stopwords, *rus_stopwords, *spa_stopwords]

In [10]:
# Transformer that reads "words" and writes the stop-word-free tokens to "pure_words".
swRemover = StopWordsRemover(inputCol="words", outputCol="pure_words", stopWords=stopWords)

In [11]:
# Apply the stop-word remover to the tokenized documents.
pure_documents = swRemover.transform(documents)

In [12]:
# Hashing-trick term-frequency transformer: "pure_words" -> "tf" vectors with 10000 features.
hashingTF = HashingTF(numFeatures=10000, inputCol="pure_words", outputCol="tf")

In [13]:
# Compute the term-frequency vector of every document.
tf = hashingTF.transform(pure_documents)

In [14]:
# Fit the IDF model on the term-frequency vectors.
# Note: the output column is named "idf", but per Spark ML's IDF it holds the
# IDF-rescaled term frequencies, i.e. the TF-IDF vectors.
idf = IDF(inputCol="tf", outputCol="idf").fit(tf)

In [15]:
# Apply the fitted IDF model, adding the "idf" column to every document.
tfidf = idf.transform(tf)

In [16]:
# tfidf.show(10, truncate=False, vertical=True)

In [17]:
# tfidf.select("*").filter(F.col("id") == 23126).show(1, truncate=False, vertical=True)

In [19]:
# Fetch the "idf" vectors of all target courses with a single Spark job
# instead of one filter/collect round trip per course.
_target_course_ids = [23126, 21617, 16627, 11556, 16704, 13702]
_target_vectors = {
    row["id"]: row["idf"]
    for row in (
        tfidf
        .select("id", "idf")
        .filter(F.col("id").isin(_target_course_ids))
        .collect()
    )
}

# One module-level name per target course, as read by cos_similarity.
# A course id missing from the data raises KeyError here.
vector_23126 = _target_vectors[23126]
vector_21617 = _target_vectors[21617]
vector_16627 = _target_vectors[16627]
vector_11556 = _target_vectors[11556]
vector_16704 = _target_vectors[16704]
vector_13702 = _target_vectors[13702]

In [18]:
def cos_similarity(x, idx):
    """Cosine similarity between ``x`` and one of the target-course vectors.

    Args:
        x: Vector of the course being scored; must provide ``norm`` and ``dot``.
        idx: Position of the target course in the fixed order
            23126, 21617, 16627, 11556, 16704, 13702.

    Returns:
        The cosine similarity as a float rounded to 3 decimals, or 0.0 when
        either vector has zero length.
    """
    vectors_list = [vector_23126, vector_21617, vector_16627, vector_11556, vector_16704, vector_13702]
    target = vectors_list[idx]
    z = float(x.norm(2) * target.norm(2))
    # A zero-length vector has no direction; treat it as "not similar".
    if z == 0:
        return 0.0
    result = float(x.dot(target) / z)
    return round(result, 3)

# Declare the return type explicitly: without it F.udf defaults to StringType,
# so the similarity columns would be strings and sort lexicographically
# instead of numerically.
cos_similarity_udf = F.udf(cos_similarity, DoubleType())

In [20]:
# One similarity column per target course; the literal position selects the
# matching reference vector inside cos_similarity.
similarity_columns = [
    cos_similarity_udf(F.col("idf"), F.lit(position)).alias(f"cos_{course_id}")
    for position, course_id in enumerate([23126, 21617, 16627, 11556, 16704, 13702])
]
dfWithSimilarities = tfidf.select("id", "lang", "name", "words", "tf", "idf", *similarity_columns)

In [21]:
# dfWithSimilarities.show(1, truncate=False, vertical=True)

In [22]:
# Target course ids, derived from courses_to_recommend so the two lists
# cannot drift apart (yields [23126, 21617, 16627, 11556, 16704, 13702]).
ids = [course[0] for course in courses_to_recommend]

In [23]:
courses_to_recommend

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [24]:
# Top-10 most similar courses in the same language as the first target course.
# Both sort keys go into ONE orderBy: a second chained orderBy replaces the
# first, so the id tie-break would not be guaranteed. The language filter is
# applied before the projection that drops the "lang" column.
(
    dfWithSimilarities
    .filter(F.col("lang") == courses_to_recommend[0][1])
    .orderBy(F.col("cos_23126").desc(), F.col("id").asc())
    .select("id", "cos_23126", "name")
    .limit(10)
    .show(10, truncate=False, vertical=True)
)

-RECORD 0----------------------------------------------------------------------------
 id        | 23126                                                                   
 cos_23126 | 1.0                                                                     
 name      | Compass - powerful SASS library that makes your life easier             
-RECORD 1----------------------------------------------------------------------------
 id        | 13665                                                                   
 cos_23126 | 0.622                                                                   
 name      | The Next Step with Sass and Compass by Lisa Catalano                    
-RECORD 2----------------------------------------------------------------------------
 id        | 14760                                                                   
 cos_23126 | 0.598                                                                   
 name      | Foundation 4: Incorporating Sass and Comp

In [25]:
# For every target course collect the ids of the 10 most similar courses in
# the same language, keyed by the target course id as a string.
# NOTE(review): the target course itself is not excluded from its own list;
# confirm whether the expected output should contain it.
results = {}
for i, idx in enumerate(ids):
    top_rows = (
        dfWithSimilarities
        .filter(F.col("lang") == courses_to_recommend[i][1])
        # Similarity descending, then id ascending to break ties. Both keys
        # must be in one orderBy, since a second chained orderBy replaces the first.
        .orderBy(F.col(f"cos_{idx}").desc(), F.col("id").asc())
        .select("id")
        .limit(10)
        .collect()
    )
    results[f"{idx}"] = [row["id"] for row in top_rows]

In [26]:
results

{'23126': [23126,
  13665,
  14760,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  17499],
 '21617': [21609,
  21617,
  21616,
  22298,
  21608,
  21628,
  21630,
  21081,
  21623,
  19417],
 '16627': [16627, 11431, 5687, 17964, 12660, 12247, 17961, 16694, 5558, 11575],
 '11556': [11556, 16488, 468, 19330, 10447, 23357, 21707, 22710, 13461, 10384],
 '16704': [16704, 1236, 1247, 1365, 1273, 1164, 8186, 20288, 1233, 8203],
 '13702': [864, 13702, 28074, 1041, 21079, 8300, 13057, 8313, 21025, 1033]}

### Сохранить и выйти

In [27]:
# Write the recommendations as JSON one directory above the notebook.
with open("../lab02.json", "w") as output_file:
    json.dump(results, output_file)

In [28]:
# Stop the Spark session now that the results have been written.
spark.stop()