In [308]:
import os
import sys
# Tell PySpark which interpreter, Spark distribution and submit arguments to use.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's bundled Python sources and the py4j archive at the front of the import path
# (each insert goes to position 0, so the py4j archive ends up first).
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [309]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# SparkConf.set returns the conf object, so the application name can be set inline.
conf = SparkConf().set("spark.app.name", "alexey gurov lab2")

# Build the session from that configuration (getOrCreate reuses a running session if any).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [310]:
MY_COURSES_IDs = [23126, 21617, 16627, 11556, 16704, 13702] # IDs of the courses that need recommendations

In [311]:
# Display the SparkSession object created above.
spark

### Считываем данные из hdfs

In [312]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

In [313]:
# Shell escape: list the input file on HDFS (permissions, owner, size, modification time).
!hdfs dfs -ls /labs/slaba02/DO_record_per_line.json

-rw-r--r--   3 hdfs hdfs   69519728 2021-02-27 21:58 /labs/slaba02/DO_record_per_line.json


In [314]:
# Shell escape: print the beginning of the file to see what a record looks like
# (the output below starts with a JSON object with lang/name/cat/provider/id/desc keys).
!hdfs dfs -head /labs/slaba02/DO_record_per_line.json

{"lang": "en", "name": "Accounting Cycle: The Foundation of Business Measurement and Reporting", "cat": "3/business_management|6/economics_finance", "provider": "Canvas Network", "id": 4, "desc": "This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appro

In [315]:
# First read, without a schema: used only to see which columns and types come out
# (shown by the DataFrame repr in the next cell). `dataset` is re-read with an explicit schema later.
dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [316]:
# Display the DataFrame repr, which lists the column names and their types.
dataset

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string]

In [317]:
# Explicit schema for the course records: every column is a string except the numeric `id`.
# Column order matches the DataFrame printed above.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("cat", StringType()),
        ("desc", StringType()),
        ("id", LongType()),
        ("lang", StringType()),
        ("name", StringType()),
        ("provider", StringType()),
    )
])

In [318]:
# Re-read the same file with the explicit schema; this rebinds `dataset`,
# replacing the DataFrame from the schema-less read.
dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json", schema=schema)

In [319]:
# Drop rows whose `desc` is null: the description is the text the TF-IDF features are built from.
# NOTE(review): this rebinds `dataset` in place, so the unfiltered frame is only
# recoverable by re-running the read cell.
dataset = dataset.na.drop(subset='desc')

In [320]:
# Preview the first 5 rows (long values are truncated in the printed table).
dataset.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [321]:
# Same preview for 2 rows, printed one field per line instead of as a table.
dataset.show(2, vertical=True)

-RECORD 0------------------------
 cat      | 3/business_manage... 
 desc     | This course intro... 
 id       | 4                    
 lang     | en                   
 name     | Accounting Cycle:... 
 provider | Canvas Network       
-RECORD 1------------------------
 cat      | 11/law               
 desc     | This online cours... 
 id       | 5                    
 lang     | en                   
 name     | American Counter ... 
 provider | Canvas Network       
only showing top 2 rows



In [322]:
# Confirm that `dataset` is a Spark DataFrame.
type(dataset)

pyspark.sql.dataframe.DataFrame

##### Убедимся, что курсы, для которых нужно сделать рекомендации, присутствуют в датасете:

In [323]:
# Show the rows of the target courses to confirm they are all present in the dataset.
# Column.isin builds the IN predicate straight from the Python list, so the list's repr
# no longer has to be turned into SQL text by swapping brackets for parentheses.
dataset.filter(dataset['id'].isin(MY_COURSES_IDs)).show()

+--------------------+--------------------+-----+----+--------------------+--------+
|                 cat|                desc|   id|lang|                name|provider|
+--------------------+--------------------+-----+----+--------------------+--------+
|                    | La transformació...|11556|  es|Aprendizaje Colab...|   Udemy|
|6/economics_finan...|Математическая эк...|13702|  ru|Математическая эк...|  Intuit|
|                    | Hazte más emplea...|16627|  es|Aprende Excel: Ni...|   Udemy|
|5/computer_scienc...|В курсе рассматри...|16704|  ru|Программирование ...|  Intuit|
|  5/computer_science|An introduction t...|21617|  en|Preparing for the...|     edX|
|                    | Improve your SAS...|23126|  en|Compass - powerfu...|   Udemy|
+--------------------+--------------------+-----+----+--------------------+--------+



### Посчитаем TF-IDF для курсов (используя атрибут desc)

In [538]:
from pyspark.ml.feature import HashingTF, IDF
import pyspark.sql.functions as f

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, ArrayType, DoubleType

import re

In [492]:
# Compiled once, when the cell runs, instead of on every row. Written as a raw string so
# that `\w` / `\d` are regex classes rather than invalid Python string escapes; the pattern
# text itself is unchanged.
_WORD_RE = re.compile(r'[\w\d]{4,}', re.U)


@udf(returnType=ArrayType(StringType()))
def regexp_func(x):
    """Split a text into lower-cased tokens of at least 4 word characters.

    Args:
        x: the text to tokenize (a string, or None for a null cell).

    Returns:
        List of matched tokens in order of appearance; an empty list when `x` is None,
        so a null cell yields an empty word list instead of raising AttributeError.
    """
    if x is None:
        return []
    return _WORD_RE.findall(x.lower())


@udf(returnType=StringType())
def show_type(x):
    """Debug helper: return the Python type of a column value as text.

    Args:
        x: any column value as seen inside a Python UDF.

    Returns:
        `str(type(x))`, e.g. "<class 'str'>". The UDF is declared as StringType, so the
        type object is converted to a string here instead of being returned as-is.
    """
    return str(type(x))


@udf(returnType=DoubleType())
def cosine(a, b):
    """Cosine similarity of two vectors.

    Args:
        a, b: vector objects exposing `dot` and `norm` (the TF-IDF column values).

    Returns:
        dot(a, b) / (|a| * |b|) as a Python float, or None (a null cell) when either
        input is null or has zero L2 norm, where the ratio is undefined. Previously
        the zero-norm case produced NaN through a 0/0 division.
    """
    if a is None or b is None:
        return None
    norm_a = a.norm(2)
    norm_b = b.norm(2)
    if norm_a == 0 or norm_b == 0:
        return None
    # Same division order as before, so the produced values stay bit-identical.
    return float(a.dot(b) / norm_a / norm_b)

In [326]:
# Tokenize the description into a `words` array column and keep only the columns used later.
words_column = regexp_func(dataset['desc']).alias('words')
df = dataset.select('id', 'lang', 'desc', words_column)

In [327]:
# Confirm that the select above still yields a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [328]:
# Preview the tokenized descriptions (the `words` column).
df.show()

+---+----+--------------------+--------------------+
| id|lang|                desc|               words|
+---+----+--------------------+--------------------+
|  4|  en|This course intro...|[this, course, in...|
|  5|  en|This online cours...|[this, online, co...|
|  6|  fr|This course is ta...|[this, course, ta...|
|  7|  en|We live in a digi...|[live, digitally,...|
|  8|  en|This self-paced c...|[this, self, pace...|
|  9|  en|This game-based c...|[this, game, base...|
| 10|  en|What’s in your di...|[what, your, digi...|
| 11|  en|The goal of the D...|[goal, digital, l...|
| 12|  en|Ready to explore ...|[ready, explore, ...|
| 13|  en|This self-paced c...|[this, self, pace...|
| 14|  en|What is “interpro...|[what, interprofe...|
| 15|  en|This course prese...|[this, course, pr...|
| 16|  en|Chemistry is an i...|[chemistry, integ...|
| 17|  en|Are you consideri...|[considering, car...|
| 18|  en|Princess stories ...|[princess, storie...|
| 19|  en|This first instal...|[this, first, i

In [329]:
# Show the tokenized rows of the target courses.
# Column.isin builds the IN predicate straight from the Python list, so the list's repr
# no longer has to be turned into SQL text by swapping brackets for parentheses.
df.filter(df['id'].isin(MY_COURSES_IDs)).show()

+-----+----+--------------------+--------------------+
|   id|lang|                desc|               words|
+-----+----+--------------------+--------------------+
|11556|  es| La transformació...|[transformación, ...|
|13702|  ru|Математическая эк...|[математическая, ...|
|16627|  es| Hazte más emplea...|[hazte, empleable...|
|16704|  ru|В курсе рассматри...|[курсе, рассматри...|
|21617|  en|An introduction t...|[introduction, co...|
|23126|  en| Improve your SAS...|[improve, your, s...|
+-----+----+--------------------+--------------------+



In [330]:
# Term-frequency transformer: reads the `words` column and writes a `tf` vector column
# with numFeatures=10000 dimensions.
ht = HashingTF(inputCol="words", outputCol="tf", numFeatures=10000)

In [331]:
# Add the `tf` term-frequency vector column to every course and preview the result.
tf = ht.transform(df)
tf.show()

+---+----+--------------------+--------------------+--------------------+
| id|lang|                desc|               words|                  tf|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|This course intro...|[this, course, in...|(10000,[36,63,138...|
|  5|  en|This online cours...|[this, online, co...|(10000,[32,222,36...|
|  6|  fr|This course is ta...|[this, course, ta...|(10000,[30,118,12...|
|  7|  en|We live in a digi...|[live, digitally,...|(10000,[493,721,8...|
|  8|  en|This self-paced c...|[this, self, pace...|(10000,[32,115,13...|
|  9|  en|This game-based c...|[this, game, base...|(10000,[56,300,30...|
| 10|  en|What’s in your di...|[what, your, digi...|(10000,[1045,1263...|
| 11|  en|The goal of the D...|[goal, digital, l...|(10000,[87,157,15...|
| 12|  en|Ready to explore ...|[ready, explore, ...|(10000,[233,461,8...|
| 13|  en|This self-paced c...|[this, self, pace...|(10000,[26,696,10...|
| 14|  en|What is “interpro...|[what, 

In [332]:
# Same preview without truncation. NOTE(review): the untruncated rows are extremely wide;
# consider clearing this output before sharing the notebook.
tf.show(truncate=False)

+---+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [333]:
# Configure the IDF stage over the `tf` column, fit it on the whole corpus, then apply it
# to obtain the `tfidf` column. `idf` ends up bound to the fitted model, as before.
idf_estimator = IDF(minDocFreq=8, inputCol='tf', outputCol='tfidf')
idf = idf_estimator.fit(tf)
tfidf = idf.transform(tf)
tfidf.show()

+---+----+--------------------+--------------------+--------------------+--------------------+
| id|lang|                desc|               words|                  tf|               tfidf|
+---+----+--------------------+--------------------+--------------------+--------------------+
|  4|  en|This course intro...|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|
|  5|  en|This online cours...|[this, online, co...|(10000,[32,222,36...|(10000,[32,222,36...|
|  6|  fr|This course is ta...|[this, course, ta...|(10000,[30,118,12...|(10000,[30,118,12...|
|  7|  en|We live in a digi...|[live, digitally,...|(10000,[493,721,8...|(10000,[493,721,8...|
|  8|  en|This self-paced c...|[this, self, pace...|(10000,[32,115,13...|(10000,[32,115,13...|
|  9|  en|This game-based c...|[this, game, base...|(10000,[56,300,30...|(10000,[56,300,30...|
| 10|  en|What’s in your di...|[what, your, digi...|(10000,[1045,1263...|(10000,[1045,1263...|
| 11|  en|The goal of the D...|[goal, digital, l..

In [334]:
# Untruncated view of the TF-IDF frame. NOTE(review): the untruncated rows are extremely
# wide; consider clearing this output before sharing the notebook.
tfidf.show(truncate=False)

+---+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

##### Получим Sparse-векторы курсов, для которых нужно сделать рекомендации:

In [335]:
# The filtered rows do not come back in MY_COURSES_IDs order (the original note here warned
# about exactly that), so index the collected vectors by course id and rebuild the list:
# my_tfidf[i] is now the TF-IDF vector of MY_COURSES_IDs[i].
_tfidf_by_id = {
    row['id']: row['tfidf']
    for row in tfidf.filter(f.col('id').isin(MY_COURSES_IDs)).select('id', 'tfidf').collect()
}
# Plain indexing (not .get) so that a requested course missing from the data fails loudly.
my_tfidf = [_tfidf_by_id[course_id] for course_id in MY_COURSES_IDs]

##### Считаем косинусную близость (cosine similarity) для первого курса:

In [443]:
# Single-row frame for course 23126 with every column prefixed by `my_`, so that it can be
# joined against the full TF-IDF frame without column-name clashes.
my_tfidf = tfidf.filter('id = 23126')
for old_name, new_name in (
    ('id', 'my_id'),
    ('lang', 'my_lang'),
    ('desc', 'my_desc'),
    ('words', 'my_words'),
    ('tf', 'my_tf'),
    ('tfidf', 'my_tfidf'),
):
    my_tfidf = my_tfidf.withColumnRenamed(old_name, new_name)

In [502]:
# Cosine similarity between course 23126 and every other English course, best match first.
# Undefined similarities are dropped with isNotNull/isnan: the previous `!= NaN` comparison
# used a name `NaN` that no cell of this notebook defines, so it failed on a fresh kernel.
result = (
    tfidf.filter(f.col('id') != 23126).filter(f.col('lang') == 'en')
    # my_tfidf holds the single target row, so this join attaches its vector to every candidate.
    .join(my_tfidf, f.col('lang') == f.col('my_lang'))
    .withColumn('cos', cosine(f.col('tfidf'), f.col('my_tfidf')))
    .filter(f.col('cos').isNotNull() & ~f.isnan(f.col('cos')))
    .sort(f.col('cos').desc())
    .select(f.col('id'), f.col('cos'))
)

In [507]:
# Preview the best matches (show() prints the first 20 rows).
result.show()

+-----+-------------------+
|   id|                cos|
+-----+-------------------+
|14760|  0.628465633964639|
|13665| 0.6256939032764202|
|13782| 0.5798970258819175|
|20638|0.46129654253713603|
|24419|0.44534188070808867|
|15909|0.39933984552713897|
| 2724|0.37392213839850286|
|25782|0.33098963288038774|
|17499| 0.2975971407829536|
|13348|0.29420860124338716|
|19270|0.27535612135710824|
|25071| 0.2746250102802335|
|23756| 0.2678726858832214|
| 7153| 0.2557250001659006|
| 2723|0.24977952097982095|
| 2633|0.24724126159154142|
|17208|0.23926192311171465|
|26507|0.22681041796475807|
|13781|0.22659913261747383|
|17329|0.20216168420388891|
+-----+-------------------+
only showing top 20 rows



In [515]:
# Bring the sorted (id, cos) rows to the driver.
# NOTE(review): this collects every row although only the first 10 are used in the next cell.
result_list = result.collect()

In [518]:
# Top-10 (course id, cosine) pairs for course 23126, as plain tuples.
cosine_list1 = [(row[0], row[1]) for row in result_list[:10]]

In [519]:
# Display the top-10 (id, cosine) pairs for course 23126.
cosine_list1

[(14760, 0.6284656339646387),
 (13665, 0.6256939032764199),
 (13782, 0.5798970258819176),
 (20638, 0.4612965425371362),
 (24419, 0.44534188070808883),
 (15909, 0.3993398455271389),
 (2724, 0.37392213839850286),
 (25782, 0.3309896328803878),
 (17499, 0.2975971407829535),
 (13348, 0.29420860124338705)]

##### Считаем косинусную близость (cosine similarity) для второго курса:

In [528]:
# Single-row frame for course 21617 with every column prefixed by `my_`, so that it can be
# joined against the full TF-IDF frame without column-name clashes.
my_tfidf = tfidf.filter('id = 21617')
for old_name, new_name in (
    ('id', 'my_id'),
    ('lang', 'my_lang'),
    ('desc', 'my_desc'),
    ('words', 'my_words'),
    ('tf', 'my_tf'),
    ('tfidf', 'my_tfidf'),
):
    my_tfidf = my_tfidf.withColumnRenamed(old_name, new_name)

In [529]:
# Cosine similarity between course 21617 and every other English course, best match first.
# Undefined similarities are dropped with isNotNull/isnan: the previous `!= NaN` comparison
# used a name `NaN` that no cell of this notebook defines, so it failed on a fresh kernel.
result = (
    tfidf.filter(f.col('id') != 21617).filter(f.col('lang') == 'en')
    # my_tfidf holds the single target row, so this join attaches its vector to every candidate.
    .join(my_tfidf, f.col('lang') == f.col('my_lang'))
    .withColumn('cos', cosine(f.col('tfidf'), f.col('my_tfidf')))
    .filter(f.col('cos').isNotNull() & ~f.isnan(f.col('cos')))
    .sort(f.col('cos').desc())
    .select(f.col('id'), f.col('cos'))
)

In [530]:
# Bring the sorted (id, cos) rows to the driver.
# NOTE(review): this collects every row although only the first 10 are used in the next cell.
result_list = result.collect()

In [531]:
# Top-10 (course id, cosine) pairs for course 21617, as plain tuples.
cosine_list2 = [(row[0], row[1]) for row in result_list[:10]]

In [532]:
# Display the top-10 (id, cosine) pairs for course 21617.
cosine_list2

[(21609, 1.0),
 (21673, 0.491948868844124),
 (21081, 0.48638944050665694),
 (19417, 0.4831898434234299),
 (380, 0.47970883480484444),
 (22298, 0.4785195848462882),
 (21616, 0.47436194916961966),
 (8110, 0.47085284779865844),
 (336, 0.4705548372106196),
 (16971, 0.4693555050534698)]

##### Считаем косинусную близость (cosine similarity) для третьего курса:

In [280]:
# TF-IDF vector of the target course (16627) and its norm, computed once.
my_tfidf = tfidf.filter('id = 16627').rdd.map(lambda x: x.tfidf).collect()[0]
my_norm = my_tfidf.norm(2)

# Fetch id and vector together in ONE collect. Previously ids and vectors were collected by
# two separate Spark jobs and zipped, which silently relied on both returning rows in the
# same order.
candidates = tfidf.filter('id != 16627 and lang = \'es\'').select('id', 'tfidf').collect()

cosine_list = []
for row in candidates:
    denominator = row.tfidf.norm(2) * my_norm
    if denominator == 0:
        # Cosine is undefined for an all-zero vector; skipping it keeps NaN keys out of
        # the sort below, where they would make the ordering unreliable.
        continue
    cosine_list.append((row.id, row.tfidf.dot(my_tfidf) / denominator))

# Ten most similar Spanish-language courses, best match first.
cosine_list3 = sorted(cosine_list, key=lambda x: x[1], reverse=True)[:10]

In [281]:
# Display the top-10 (id, cosine) pairs for course 16627.
cosine_list3

[(11431, 0.5906017840038786),
 (17964, 0.41297921838796253),
 (12660, 0.41191759369454667),
 (5687, 0.4098397426807551),
 (12247, 0.3938563076277931),
 (16694, 0.37335521556223733),
 (17961, 0.36258396182003444),
 (5558, 0.3490278768361378),
 (11575, 0.33631459306825184),
 (13551, 0.3325585280437034)]

##### Считаем косинусную близость (cosine similarity) для четвертого курса:

In [282]:
# TF-IDF vector of the target course (11556) and its norm, computed once.
my_tfidf = tfidf.filter('id = 11556').rdd.map(lambda x: x.tfidf).collect()[0]
my_norm = my_tfidf.norm(2)

# Fetch id and vector together in ONE collect. Previously ids and vectors were collected by
# two separate Spark jobs and zipped, which silently relied on both returning rows in the
# same order.
candidates = tfidf.filter('id != 11556 and lang = \'es\'').select('id', 'tfidf').collect()

cosine_list = []
for row in candidates:
    denominator = row.tfidf.norm(2) * my_norm
    if denominator == 0:
        # Cosine is undefined for an all-zero vector; skipping it keeps NaN keys out of
        # the sort below, where they would make the ordering unreliable.
        continue
    cosine_list.append((row.id, row.tfidf.dot(my_tfidf) / denominator))

# Ten most similar Spanish-language courses, best match first.
cosine_list4 = sorted(cosine_list, key=lambda x: x[1], reverse=True)[:10]

In [283]:
# Display the top-10 (id, cosine) pairs for course 11556.
cosine_list4

[(16488, 0.5409959207330158),
 (468, 0.36593982792167357),
 (19330, 0.3216175604510969),
 (23357, 0.30961446578069773),
 (13461, 0.2931691761153784),
 (21707, 0.2926537307753657),
 (10447, 0.2899151545507826),
 (22710, 0.2897593753581081),
 (10384, 0.26945562012605506),
 (19279, 0.2127300104264589)]

##### Считаем косинусную близость (cosine similarity) для пятого курса:

In [284]:
# TF-IDF vector of the target course (16704) and its norm, computed once.
my_tfidf = tfidf.filter('id = 16704').rdd.map(lambda x: x.tfidf).collect()[0]
my_norm = my_tfidf.norm(2)

# Fetch id and vector together in ONE collect. Previously ids and vectors were collected by
# two separate Spark jobs and zipped, which silently relied on both returning rows in the
# same order.
candidates = tfidf.filter('id != 16704 and lang = \'ru\'').select('id', 'tfidf').collect()

cosine_list = []
for row in candidates:
    denominator = row.tfidf.norm(2) * my_norm
    if denominator == 0:
        # Cosine is undefined for an all-zero vector; skipping it keeps NaN keys out of
        # the sort below, where they would make the ordering unreliable.
        continue
    cosine_list.append((row.id, row.tfidf.dot(my_tfidf) / denominator))

# Ten most similar Russian-language courses, best match first.
cosine_list5 = sorted(cosine_list, key=lambda x: x[1], reverse=True)[:10]

In [285]:
# Display the top-10 (id, cosine) pairs for course 16704.
cosine_list5

[(1236, 0.2875892477464737),
 (1247, 0.26877899359095647),
 (1164, 0.26475668018202814),
 (1365, 0.24460465093793834),
 (8186, 0.2432547037885661),
 (1273, 0.2375996030603118),
 (20288, 0.2375996030603118),
 (1233, 0.22320015036980265),
 (18331, 0.20748650502797214),
 (8203, 0.2058417303139107)]

##### Считаем косинусную близость (cosine similarity) для шестого курса:

In [286]:
# TF-IDF vector of the target course (13702) and its norm, computed once.
my_tfidf = tfidf.filter('id = 13702').rdd.map(lambda x: x.tfidf).collect()[0]
my_norm = my_tfidf.norm(2)

# Fetch id and vector together in ONE collect. Previously ids and vectors were collected by
# two separate Spark jobs and zipped, which silently relied on both returning rows in the
# same order.
candidates = tfidf.filter('id != 13702 and lang = \'ru\'').select('id', 'tfidf').collect()

cosine_list = []
for row in candidates:
    denominator = row.tfidf.norm(2) * my_norm
    if denominator == 0:
        # Cosine is undefined for an all-zero vector; skipping it keeps NaN keys out of
        # the sort below, where they would make the ordering unreliable.
        continue
    cosine_list.append((row.id, row.tfidf.dot(my_tfidf) / denominator))

# Ten most similar Russian-language courses, best match first.
cosine_list6 = sorted(cosine_list, key=lambda x: x[1], reverse=True)[:10]

In [287]:
# Display the top-10 (id, cosine) pairs for course 13702.
cosine_list6

[(864, 1.0),
 (1111, 0.13669591486580385),
 (13057, 0.13109446777621409),
 (1410, 0.11601344433775057),
 (1033, 0.11135726266571647),
 (22053, 0.10287783580922624),
 (1217, 0.10201734385157518),
 (895, 0.09226029948768745),
 (1216, 0.0922335445703671),
 (8123, 0.08970538903332233)]

### Запишем ответ в json-файл

In [533]:
import json

In [534]:
# Map each target course id (as a string key) to the ids of its ten recommended courses,
# dropping the similarity scores.
answer = {
    course_key: [pair[0] for pair in top_matches]
    for course_key, top_matches in zip(
        ('23126', '21617', '16627', '11556', '16704', '13702'),
        (cosine_list1, cosine_list2, cosine_list3, cosine_list4, cosine_list5, cosine_list6),
    )
}

In [535]:
# Display the final mapping: target course id -> list of recommended course ids.
answer

{'23126': [14760,
  13665,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  17499,
  13348],
 '21617': [21609, 21673, 21081, 19417, 380, 22298, 21616, 8110, 336, 16971],
 '16627': [11431, 17964, 12660, 5687, 12247, 16694, 17961, 5558, 11575, 13551],
 '11556': [16488, 468, 19330, 23357, 13461, 21707, 10447, 22710, 10384, 19279],
 '16704': [1236, 1247, 1164, 1365, 8186, 1273, 20288, 1233, 18331, 8203],
 '13702': [864, 1111, 13057, 1410, 1033, 22053, 1217, 895, 1216, 8123]}

In [536]:
# Write the recommendations to lab02.json.
# The handle is deliberately not named `f`: in this notebook `f` is the alias of
# pyspark.sql.functions, and rebinding it here would break a re-run of the Spark cells.
with open('lab02.json', 'w') as answer_file:
    json.dump(answer, answer_file)

In [537]:
# Read the file back and print it to verify what was written.
# The handle is deliberately not named `f`: in this notebook `f` is the alias of
# pyspark.sql.functions, and rebinding it here would break a re-run of the Spark cells.
with open('lab02.json', 'r') as answer_file:
    print(answer_file.read())

{"23126": [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348], "21617": [21609, 21673, 21081, 19417, 380, 22298, 21616, 8110, 336, 16971], "16627": [11431, 17964, 12660, 5687, 12247, 16694, 17961, 5558, 11575, 13551], "11556": [16488, 468, 19330, 23357, 13461, 21707, 10447, 22710, 10384, 19279], "16704": [1236, 1247, 1164, 1365, 8186, 1273, 20288, 1233, 18331, 8203], "13702": [864, 1111, 13057, 1410, 1033, 22053, 1217, 895, 1216, 8123]}


In [539]:
# Stop the Spark session now that the answer file has been written.
spark.stop()