In [2]:
#! hdfs dfs -ls /labs/slaba02/

In [3]:
#!hdfs dfs -cat /labs/slaba02/DO_record_per_line.json

In [4]:
#! hdfs dfs -copyToLocal /labs/slaba02/DO_record_per_line.json ~/

In [5]:
#! hdfs dfs -copyToLocal /labs/slaba02/DO_record_per_line.json c:/temp/

In [6]:
#!hdfs dfs -cat /labs/laba01/ml-100k/u.data

In [7]:
#! hadoop fs -copyToLocal /labs/laba01/ml-100k/u.data 

In [8]:
#! hadoop fs -copyToLocal /labs/slaba02/DO_record_per_line.json

In [9]:
import glob
import os
import sys

# Environment consumed by PySpark when the session is started in a later cell.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]
spark_python = os.path.join(spark_home, 'python')

# Make the pyspark package that ships with the Spark client importable.
sys.path.insert(0, spark_python)

# The py4j archive name changes with the Spark build, so discover it rather than
# pinning one version; fall back to the previously hard-coded name when the
# directory cannot be listed (keeps the old behaviour on this cluster).
py4j_archives = sorted(glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip')))
py4j_archive = py4j_archives[-1] if py4j_archives else os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')
sys.path.insert(0, py4j_archive)

In [10]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as f_

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.sql.types import FloatType 
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
import json

In [11]:
# HDFS location of the course catalogue (one JSON record per line).
path = '/labs/slaba02/DO_record_per_line.json' # r'DO_record_per_line.json'

# Target courses for which recommendations are built: [id, lang, name].
courses_list = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]
# Only the ids are needed to select the target rows later on.
pc_ids = [course[0] for course in courses_list]

conf = SparkConf()
spark = SparkSession.builder.appName("LABA2").config(conf=conf).getOrCreate()

df = spark.read.json(path)
df.show(10)
# A bare `df.count()` in the middle of a cell runs a full job and throws the
# number away; print it so the scan is not wasted.
print('rows:', df.count())

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [12]:
@f_.udf(FloatType())
def cosine_similarity(v, u):
    """Spark UDF: cosine similarity of two vectors, returned as a float.

    Both arguments must expose ``norm(p)`` and ``dot(other)`` (in this notebook
    they are the TF-IDF vector columns).

    Returns NaN when either vector has zero L2 norm, because the angle is
    undefined. This is the value the plain division used to yield implicitly,
    so downstream NaN filtering keeps working unchanged.
    """
    denominator = v.norm(2) * u.norm(2)
    if denominator == 0:
        # Explicit instead of relying on a 0/0 floating-point division.
        return float('nan')
    return float(v.dot(u) / denominator)

# Column expression: drop every character of `desc` that is not a letter,
# a digit 0-9 or whitespace, then lower-case what is left.
regexp_clear = f_.lower(
    f_.regexp_replace('desc', r'[^\pL0-9\p{Space}]', '')
)

# Union of Spark's built-in stop-word lists for the three languages, de-duplicated.
stop_words = list({
    word
    for language in ("english", "russian", "spanish")
    for word in StopWordsRemover.loadDefaultStopWords(language)
})
len(stop_words)

640

In [13]:
# Load the catalogue and swap the raw `desc` column for its cleaned,
# lower-cased version `desc2` (appended as the last column).
data = spark.read.json(path)
data2 = data.withColumn('desc2', regexp_clear).drop('desc')

In [14]:
# Stage 1: split the cleaned description into tokens.
tokenizer = Tokenizer(
    inputCol="desc2",
    outputCol="words",
)
# Stage 2: drop stop words (combined multi-language list).
stopwordsremover = StopWordsRemover(
    inputCol="words",
    outputCol="words_censored",
    stopWords=stop_words,
)
# Stage 3: hashed term frequencies.
tf = HashingTF(inputCol="words_censored", outputCol="tf")
# Stage 4: IDF re-weighting; the resulting TF-IDF vector lands in column `idf`.
tfidf = IDF(inputCol="tf", outputCol="idf")

stages = [tokenizer, stopwordsremover, tf, tfidf]
pipeline = Pipeline(stages=stages)

# Fit on the full catalogue and transform the same data.
pipeline_fit = pipeline.fit(data2)
tfidf_data = pipeline_fit.transform(data2)
tfidf_data.show(10)

+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 cat| id|lang|                name|      provider|               desc2|               words|      words_censored|                  tf|                 idf|
+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|3/business_manage...|  4|  en|Accounting Cycle:...|Canvas Network|this course intro...|[this, course, in...|[course, introduc...|(262144,[4211,753...|(262144,[4211,753...|
|              11/law|  5|  en|American Counter ...|Canvas Network|this online cours...|[this, online, co...|[online, course, ...|(262144,[1598,172...|(262144,[1598,172...|
|5/computer_scienc...|  6|  fr|Arithmétique: en ...|Canvas Network|this course is ta...|[this, course, is...|[course, taught, ...|(2621

In [16]:
# Pair every course (a) with each target course (b); keep only same-language
# pairs and never pair a target course with itself.
targets = tfidf_data.where(f_.col('id').isin(pc_ids)).alias('b')
cross = (
    tfidf_data.alias('a')
    .crossJoin(targets)
    .select(
        'a.*',
        f_.col('b.id').alias('pc_id'),
        f_.col('b.lang').alias('pc_lang'),
        f_.col('b.name').alias('pc_name'),
        f_.col('b.idf').alias('pc_tfidf'),
    )
    .filter(''' a.lang=pc_lang and pc_id !=id ''')
)
cross.show(5)

# Similarity between the candidate's and the target's TF-IDF vectors;
# pairs whose similarity is NaN are discarded.
cross2 = (
    cross
    .withColumn('cos', cosine_similarity('idf', 'pc_tfidf'))
    .filter(''' cos !='NaN' ''')
)
cross2.show(5)

# Rank candidates per target: highest similarity first, ties broken by name, then id.
rank_window = Window.partitionBy("pc_id").orderBy(
    f_.col('cos').desc(),
    f_.col('name'),
    f_.col('id'),
)
cross3 = (
    cross2
    .withColumn('n', f_.row_number().over(rank_window))
    .filter('n<=10')
)
cross3.show(5)


+--------------------+-----+----+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+-------+--------------------+--------------------+
|                 cat|   id|lang|                name|provider|               desc2|               words|      words_censored|                  tf|                 idf|pc_id|pc_lang|             pc_name|            pc_tfidf|
+--------------------+-----+----+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+-------+--------------------+--------------------+
|3/business_manage...| 9852|  en|How to Make Easy ...|   Udemy| its not luck and...|[, its, not, luck...|[, luck, accident...|(262144,[14,619,1...|(262144,[14,619,1...|21617|     en|Preparing for the...|(262144,[170,1079...|
|3/business_manage...| 9852|  en|How to Make Easy ...|   Udemy| its not luck and...|[, its, not, luc

In [17]:
# collect_list() gives no ordering guarantee after the groupBy shuffle, so the
# rank `n` is carried along with each id, the (n, id) structs are sorted
# (struct ordering compares `n` first), and only then are the ids extracted.
ranked_ids = f_.sort_array(f_.collect_list(f_.struct(f_.col('n'), f_.col('id'))))

recomendation = (
    cross3
    .groupBy(f_.col('pc_id'))
    .agg(ranked_ids.alias('ranked'))
    # Selecting a struct field through an array column yields an array of that field.
    .select('pc_id', f_.col('ranked.id').alias('top_10_ids'))
    .orderBy('pc_id')
    .collect()
)

# Map target course id (as a string, for JSON keys) -> ids ordered by rank.
res = {}
for r in recomendation:
    res[str(r[0])] = r[1]
    print(str(r[0]),':',r[1])

with open('lab02.json', 'w') as f:
    json.dump(res, f, indent=3)


11556 : [16488, 468, 13461, 22710, 23357, 10447, 19330, 21707, 11523, 9465]
13702 : [864, 21079, 8123, 1041, 28074, 13057, 1396, 1052, 1033, 8300]
16627 : [11431, 12247, 12660, 5687, 17964, 16694, 12598, 11575, 12863, 21704]
16704 : [1236, 1247, 1228, 1365, 1164, 1233, 1273, 20288, 8186, 8203]
21617 : [21609, 21608, 21616, 21492, 21624, 21623, 21703, 21630, 21628, 21508]
23126 : [14760, 13665, 13782, 15909, 19270, 25782, 17499, 13348, 7153, 25071]


In [None]:
# Stop the SparkSession (`spark`) created earlier in this notebook once all results are written.
spark.stop()