In [44]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [45]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).appName("Sabilov lab2").getOrCreate()

In [46]:
spark

In [47]:
data = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [48]:
data.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [49]:
my_courses = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
              [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
              [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
              [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
              [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
              [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [50]:
data.rdd.getNumPartitions()

3

In [51]:
data.where('id = 16704').show()

+--------------------+--------------------+-----+----+--------------------+--------+
|                 cat|                desc|   id|lang|                name|provider|
+--------------------+--------------------+-----+----+--------------------+--------+
|5/computer_scienc...|В курсе рассматри...|16704|  ru|Программирование ...|  Intuit|
+--------------------+--------------------+-----+----+--------------------+--------+



In [52]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql import functions as f 

langs = list(set([x[1] for x in my_courses]))
dfs = {}
for lng in langs:
    dfs[lng + '_data'] = data.filter(data.lang == lng)
    dfs[lng + '_data'] = dfs[lng + '_data'].withColumn('splited', f.split('desc', '[,. ]'))
    
    ht = HashingTF(inputCol='splited', outputCol='tf', numFeatures=10000)
    dfs[lng + '_data'] = ht.transform(dfs[lng + '_data'])
    
    idf = IDF(inputCol='tf', outputCol='tfidf').fit(dfs[lng + '_data'])
    dfs[lng + '_data'] = idf.transform(dfs[lng + '_data'])

dfs['en_data'].show(1, vertical=True, truncate=False)

for lng in langs:
    dfs[lng + '_data'].coalesce(3)
    print(dfs[lng + '_data'].rdd.getNumPartitions())
    dfs[lng + '_data'].cache()

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

3
3


In [53]:
for x in my_courses:
    lng = x[1]
    cj_df = dfs[lng + '_data'].where('id = ' + str(x[0])).select(f.col('tfidf').alias(str(x[0])))
    dfs[lng + '_data'] = dfs[lng + '_data'].crossJoin(cj_df)
dfs['en_data'].show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [54]:
from pyspark.sql.types import DoubleType

dotProductUdf = f.udf(lambda v, u: float(v.dot(u) / (v.norm(2) * u.norm(2))), DoubleType())
for x in my_courses:
    dfs[x[1] + '_data'] = dfs[x[1] + '_data'].withColumn('dist_to_' + str(x[0]), dotProductUdf('tfidf', str(x[0])))
dfs['en_data'].show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [55]:
dfs['en_data'].printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- splited: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- 23126: vector (nullable = true)
 |-- 21617: vector (nullable = true)
 |-- dist_to_23126: double (nullable = true)
 |-- dist_to_21617: double (nullable = true)



In [56]:
dfs['en_data'].orderBy(['dist_to_21617', 'name', 'id'], ascending=[1, 1, 1]).show(5, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [57]:
langs = list(set([x[1] for x in my_courses]))
for lng in langs:
    dfs[lng + '_data'].coalesce(3)
    print(dfs[lng + '_data'].rdd.getNumPartitions())
    dfs[lng + '_data'].cache()

27
9
9


In [58]:
res = {}
for x in sorted(my_courses, key=lambda x: (x[0], x[2])):
    res[x[0]] = dfs[x[1] + '_data'].orderBy(['dist_to_' + str(x[0]), 
                                             'name',
                                             'id'], 
                                           ascending=[0, 1, 1]).select('id').rdd.flatMap(lambda x: x).collect()[:11]
    if x[0] in res[x[0]]:
        res[x[0]].remove(x[0])
    res[x[0]] = res[x[0]][:10]
print(res)

{11556: [16488, 9465, 22710, 21611, 19330, 17910, 12679, 13290, 23441, 22284], 13702: [864, 21079, 1150, 792, 1410, 1111, 8123, 8083, 1266, 8082], 16627: [11431, 17964, 12660, 17961, 16694, 5687, 12247, 5558, 12863, 9563], 16704: [26980, 823, 1219, 1365, 8186, 913, 20095, 1247, 927, 20096], 21617: [21609, 21616, 21608, 21492, 21508, 21624, 21854, 21676, 21623, 21630], 23126: [14760, 11978, 13782, 14380, 26864, 3819, 25782, 23257, 6206, 3919]}


In [59]:
import json
with open('/data/home/farhad.sabilov/lab02.json', 'w') as fout:
    json.dump(res, fout)

In [60]:
spark.stop()