## Вариант 10

In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --driver-memory 3g pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "Yury Perevezentsev app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
spark

#### Грузим исходные данные

In [4]:
!hdfs dfs -ls /labs/slaba02/DO_record_per_line.json

-rw-r--r--   3 hdfs hdfs   69519728 2022-01-06 18:46 /labs/slaba02/DO_record_per_line.json


In [5]:
 #!hdfs dfs -cat /labs/slaba02/DO_record_per_line.json

In [6]:
# Это список списков курсов, к которым мы должны найти рекомендации
given_courses = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
                 [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
                 [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
                 [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
                 [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
                 [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

given_courses


[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [7]:
# Create DataFrame

dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json")

dataset.show(5)

# desc - поле, которое мы будеи преобразовывать и на его основании делать рекомендацию

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [8]:
dataset.rdd.getNumPartitions()

5

In [9]:
dataset.show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### План решения Лабы

Пайплайн базового алгоритма:
1. Препроцессинг столбца desc (привести слова к нижнему регистру, удалить ненужные символы, удалить стоп-слова и т.д.) + токенизация текста (разбиваем обработанные предложения из desc в отдельные слова)
2. Получаем TF на основе HashingTF (TF - превращаем массивы слов в вектор фичей)
3. Получаем IDF. TF-IDF считается для описания курсов desc (знаки препинания и прочие символы не должны учитываться)
4. Сформировать датасет, где будут кандидаты в рекомендуемые курсы, и курсы, для которых нужно осуществить рекомндацию (используем join и broadcast, так как иодна из табличек маленькая)
5. На соновании двух векторов (векторов для курсов, которые являются кандидатами в рекомендации, и курсов, для которых составляем рекомендацию) посчитать косинус близости при помощи udf-функции cos_sim (косинус близости между tf-idf векторами для разных курсов)
6. На сонове результатов udf-функции сформировать рекомендации для каждого курса
7. Для каждого курса формирует тот-10 наиболее похожих на него курсов. 
    - Рекомендуемые курсы должны быть того же языка, что и курс, для которого строится рекомендация
    - Курсы сортируются: 1) по убыванию метрики, 2) по названию (лексикографически по возрастанию), 3) по возрастанию id
    
    

In [10]:
# Грузим библиотеки
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower
from pyspark.sql.types import FloatType, ArrayType, StringType, DoubleType
import json 
import re


#### 1. Препроцессинг столбца desc

In [11]:
dataset = dataset[["id", "lang", "name", "desc"]]
dataset = dataset.withColumn('desc', lower(col('desc'))) # переврдим в нижный регистр
dataset.show(2)

+---+----+--------------------+--------------------+
| id|lang|                name|                desc|
+---+----+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|this course intro...|
|  5|  en|American Counter ...|this online cours...|
+---+----+--------------------+--------------------+
only showing top 2 rows



In [12]:
# Udf-функция для очистки и препроцессинга текста
# Убираем ненужные символы и знаки пунктуации из поля desc, которое будем использовать для потсроения рекомендаций
import pyspark.sql.functions as f
from pyspark.sql.functions import pandas_udf
import re
 
def clear_string(series):
    regex = re.compile(u'[\w\d]{2,}', re.U) # убираем ненужные символы и пунктуацию
    words = series.str.findall(regex) 
    return words

tokenizer_udf = pandas_udf(clear_string, ArrayType(StringType()))

dataset = dataset.withColumn("desc_cleared", tokenizer_udf("desc")).cache()

dataset.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
# stopWordsRemover
stopWordsRemover = StopWordsRemover(inputCol="desc_cleared", outputCol="desc_cleared2")
dataset_SW = stopWordsRemover.transform(dataset)
dataset_SW = dataset_SW.drop("desc", "desc_cleared")
dataset_SW.show(1, vertical = True, truncate = False)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id            | 4                                      

#### 2. Hashing TF

In [14]:
ht = HashingTF(inputCol = "desc_cleared2", outputCol = "features", numFeatures = 10000)
tf = ht.transform(dataset_SW)
tf = tf.drop("desc_cleared", "desc_cleared2", "desc")
tf.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id       | 4                                                                                                                                                                                                                                                                                        

#### 3. TF-IDF + Normalizer

In [15]:
# IDF
idf = IDF(inputCol= "features", outputCol= "features_tfidf")
idf_model = idf.fit(tf)
tfidf = idf_model.transform(tf)

tfidf = tfidf.drop("features")
tfidf.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [16]:
# Normalizer
normalizer = Normalizer(inputCol= "features_tfidf", outputCol= "tfidf")
tfidf_n = normalizer.transform(tfidf)
tfidf_n = tfidf_n.drop("features_tfidf")
tfidf_n.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### 4. JOIN двух датасетов

In [17]:
list_yury = [item[0] for item in given_courses]
list_yury = sorted(list_yury)
list_yury

[11556, 13702, 16627, 16704, 21617, 23126]

In [18]:
tfidf_yury = tfidf_n.filter(tfidf_n.id.isin(list_yury))
tfidf_yury.show(7, vertical = True, truncate = True)

-RECORD 0---------------------
 id    | 11556                
 lang  | es                   
 name  | Aprendizaje Colab... 
 tfidf | (10000,[249,522,5... 
-RECORD 1---------------------
 id    | 13702                
 lang  | ru                   
 name  | Математическая эк... 
 tfidf | (10000,[310,942,2... 
-RECORD 2---------------------
 id    | 16627                
 lang  | es                   
 name  | Aprende Excel: Ni... 
 tfidf | (10000,[55,76,192... 
-RECORD 3---------------------
 id    | 16704                
 lang  | ru                   
 name  | Программирование ... 
 tfidf | (10000,[381,1144,... 
-RECORD 4---------------------
 id    | 21617                
 lang  | en                   
 name  | Preparing for the... 
 tfidf | (10000,[17,161,36... 
-RECORD 5---------------------
 id    | 23126                
 lang  | en                   
 name  | Compass - powerfu... 
 tfidf | (10000,[87,246,25... 



In [19]:
tfidf_joined = tfidf_n.crossJoin(tfidf_yury
                        .withColumnRenamed("id", "id_t")
                        .withColumnRenamed("lang", "lang_t")
                        .withColumnRenamed("name", "name_t")
                        .withColumnRenamed("tfidf", "tfidf_t"))\
                        .orderBy(col("id"),col("id_t"))
tfidf_joined.show(10)

+---+----+--------------------+--------------------+-----+------+--------------------+--------------------+
| id|lang|                name|               tfidf| id_t|lang_t|              name_t|             tfidf_t|
+---+----+--------------------+--------------------+-----+------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|11556|    es|Aprendizaje Colab...|(10000,[249,522,5...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|13702|    ru|Математическая эк...|(10000,[310,942,2...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|16627|    es|Aprende Excel: Ni...|(10000,[55,76,192...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|16704|    ru|Программирование ...|(10000,[381,1144,...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|21617|    en|Preparing for the...|(10000,[17,161,36...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|23126|    en|Compass - powerfu...|(10000,[87,246,25...|
|  5|  en|American Counter .

####  5. Косинус близости

In [20]:
def cos_sim(v1, v2):
    return float((v1.dot(v2))/(v1.norm(2)*v2.norm(2)))
cos_wrapper = f.udf(lambda x,y: cos_sim(x,y), FloatType() )

In [21]:
tfidf_joined.rdd.getNumPartitions()

200

In [22]:
tfidf_joined.show()

+---+----+--------------------+--------------------+-----+------+--------------------+--------------------+
| id|lang|                name|               tfidf| id_t|lang_t|              name_t|             tfidf_t|
+---+----+--------------------+--------------------+-----+------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|11556|    es|Aprendizaje Colab...|(10000,[249,522,5...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|13702|    ru|Математическая эк...|(10000,[310,942,2...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|16627|    es|Aprende Excel: Ni...|(10000,[55,76,192...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|16704|    ru|Программирование ...|(10000,[381,1144,...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|21617|    en|Preparing for the...|(10000,[17,161,36...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|23126|    en|Compass - powerfu...|(10000,[87,246,25...|
|  5|  en|American Counter .

In [23]:
tfidf_joined = tfidf_joined.coalesce(10).cache()

In [25]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
tfidf_joined = tfidf_joined.select("*", cos_wrapper(f.col("tfidf"), f.col("tfidf_t")).alias("cos_sim"))
tfidf_joined.show(5)

+---+----+--------------------+--------------------+-----+------+--------------------+--------------------+-----------+
| id|lang|                name|               tfidf| id_t|lang_t|              name_t|             tfidf_t|    cos_sim|
+---+----+--------------------+--------------------+-----+------+--------------------+--------------------+-----------+
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|11556|    es|Aprendizaje Colab...|(10000,[249,522,5...|6.960418E-4|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|13702|    ru|Математическая эк...|(10000,[310,942,2...|        0.0|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|16627|    es|Aprende Excel: Ni...|(10000,[55,76,192...|8.212152E-5|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|16704|    ru|Программирование ...|(10000,[381,1144,...|        0.0|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|21617|    en|Preparing for the...|(10000,[17,161,36...|0.052303135|
+---+----+--------------------+---------

In [26]:
tfidf_joined.count()

168918

In [27]:
pandasDF = tfidf_joined.toPandas()
pandasDF.head(10)

Unnamed: 0,id,lang,name,tfidf,id_t,lang_t,name_t,tfidf_t,cos_sim
0,4,en,Accounting Cycle: The Foundation of Business M...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",11556,es,Aprendizaje Colaborativo by UNID Universidad I...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000696
1,4,en,Accounting Cycle: The Foundation of Business M...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",13702,ru,Математическая экономика,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,4,en,Accounting Cycle: The Foundation of Business M...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16627,es,Aprende Excel: Nivel Intermedio by Alfonso Rin...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.2e-05
3,4,en,Accounting Cycle: The Foundation of Business M...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16704,ru,Программирование на Lazarus,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,4,en,Accounting Cycle: The Foundation of Business M...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",21617,en,Preparing for the AP* Computer Science A Exam ...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.052303
5,4,en,Accounting Cycle: The Foundation of Business M...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",23126,en,Compass - powerful SASS library that makes you...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.008291
6,5,en,American Counter Terrorism Law,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",11556,es,Aprendizaje Colaborativo by UNID Universidad I...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0001
7,5,en,American Counter Terrorism Law,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",13702,ru,Математическая экономика,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
8,5,en,American Counter Terrorism Law,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16627,es,Aprende Excel: Nivel Intermedio by Alfonso Rin...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.003755
9,5,en,American Counter Terrorism Law,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16704,ru,Программирование на Lazarus,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


In [70]:
strings = [str(x) for x in list_yury]
strings

['11556', '13702', '16627', '16704', '21617', '23126']

In [71]:
dct = dict.fromkeys(strings)
dct

{'11556': None,
 '13702': None,
 '16627': None,
 '16704': None,
 '21617': None,
 '23126': None}

In [75]:
for i in list_yury:
    list = pandasDF[(pandasDF["id_t"] == i) & (pandasDF["lang_t"] == pandasDF["lang"])]\
            .sort_values(["cos_sim", "name", "id"], ascending=[False, True, True])['id'].iloc[0:10].tolist()
    dct.update({str(i): list})
print(dct)

{'11556': [11556, 16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 10384], '13702': [864, 13702, 21079, 8313, 1041, 28074, 8300, 13057, 1033, 21025], '16627': [16627, 11431, 12247, 17964, 5687, 11575, 17961, 12660, 25010, 5558], '16704': [16704, 1236, 1247, 1365, 1273, 20288, 1164, 8186, 1233, 8203], '21617': [21609, 21617, 21616, 22298, 21608, 21630, 21628, 21081, 21623, 19417], '23126': [23126, 13665, 14760, 13782, 20638, 24419, 15909, 2724, 25782, 17499]}


In [78]:
import json
with open('lab02.json', 'w') as f:
    json.dump(dct, f)