In [1]:
import os
import sys


os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("lab 2")
         .getOrCreate())

In [3]:
sc = spark.sparkContext

In [94]:
df = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [95]:
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [96]:
df.show()

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [97]:
cources_for_res = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
                   [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
                   [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
                   [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
                   [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
                   [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']
                  ]

In [98]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import udf

In [99]:
import re

regex = re.compile(u'[\w\d]{2,}', re.U)
list(regex.findall(u'\u041c\u0430\u0442fghfghf ydsrfgy 4r5 5, gh -- r'.lower()))

['матfghfghf', 'ydsrfgy', '4r5', 'gh']

In [100]:
get_words = udf(lambda x: list(regex.findall(x.lower())), ArrayType(StringType()))

In [101]:
df = df.withColumn('words', get_words(F.col('desc')))

In [102]:
remover = StopWordsRemover(inputCol="words", outputCol="words_filtered")
df = remover.transform(df)

In [103]:
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- words_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [104]:
df.show()

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|      words_filtered|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|[course, introduc...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|[online, course, ...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|[course, taught, ...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, di...|[live, digitally,...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Net

In [105]:
hashingTF = HashingTF(inputCol="words_filtered", outputCol="tf")
tf = hashingTF.transform(df)

tf.cache()
idf = IDF(inputCol="tf", outputCol="idf").fit(tf)
tfidf = idf.transform(tf)

In [106]:
tfidf.cache()

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>, words_filtered: array<string>, tf: vector, idf: vector]

In [107]:
res_ids = []
 
for x in cources_for_res:
    id = x[0]
    idf_v = tfidf.filter(f'id = {id}').select('idf').collect()[0][0]
    res_ids.append((id, idf_v))

In [108]:
for id, idf_v in res_ids:
    if idf_v.norm(2) == 0:
        print(id)
    cos = udf(lambda x: float(idf_v.dot(x) / (idf_v.norm(2) * x.norm(2))), FloatType())
    tfidf = tfidf.withColumn(f'cos_{id}', cos(F.col('idf')))

In [109]:
tfidf.show(5, vertical=True)

-RECORD 0------------------------------
 cat            | 3/business_manage... 
 desc           | This course intro... 
 id             | 4                    
 lang           | en                   
 name           | Accounting Cycle:... 
 provider       | Canvas Network       
 words          | [this, course, in... 
 words_filtered | [course, introduc... 
 tf             | (262144,[4211,753... 
 idf            | (262144,[4211,753... 
 cos_23126      | 0.004825294          
 cos_21617      | 0.03899095           
 cos_16627      | 6.1973136E-5         
 cos_11556      | 6.356649E-5          
 cos_16704      | 0.0                  
 cos_13702      | 0.0                  
-RECORD 1------------------------------
 cat            | 11/law               
 desc           | This online cours... 
 id             | 5                    
 lang           | en                   
 name           | American Counter ... 
 provider       | Canvas Network       
 words          | [this, online, co... 


In [110]:
norm2 = udf(lambda x: float(x.norm(2)), FloatType())

tfidf = tfidf.withColumn('norm2', norm2(F.col('idf')))

In [111]:
tfidf = tfidf.filter(F.col('norm2') != 0)

In [112]:
res = dict()
for id, _ in res_ids:
    top11 = tfidf.orderBy(f'cos_{id}', ascending=False).limit(11).select('id', f'cos_{id}').cache()
    top11.show()
    top11_data = top11.collect()
    top11.unpersist()
    top10 = [x.id for x in top11_data if x.id != id]
    print(top10)
    res[str(id)] = top10

+-----+----------+
|   id| cos_23126|
+-----+----------+
|23126|       1.0|
|14760| 0.6735877|
|13665| 0.6418198|
|13782|0.63236856|
|15909|0.45687088|
|25782|0.31653267|
|17499| 0.2987613|
|19270|   0.28898|
|13348|0.28562516|
|25071| 0.2473614|
| 7153|0.23640837|
+-----+----------+

[14760, 13665, 13782, 15909, 25782, 17499, 19270, 13348, 25071, 7153]
+-----+----------+
|   id| cos_21617|
+-----+----------+
|21609|       1.0|
|21617|       1.0|
|21616|0.49029008|
|22298| 0.4733144|
|21608|0.47302905|
|21628|0.45244095|
|21630|0.45244095|
|21623| 0.4457215|
|21081|0.44166374|
|19417|0.43981743|
|21624|0.43226188|
+-----+----------+

[21609, 21616, 22298, 21608, 21628, 21630, 21623, 21081, 19417, 21624]
+-----+----------+
|   id| cos_16627|
+-----+----------+
|16627|       1.0|
|11431|  0.655696|
|12247| 0.5208319|
|17964| 0.5011261|
|11575|0.49228275|
|12660|0.48544678|
| 5687| 0.4791413|
|25010|0.47385496|
| 5558| 0.4738044|
|10738| 0.4723437|
|17961| 0.4721328|
+-----+----------+

[

In [113]:
import json


with open('lab02.json', 'w') as f:
    json.dump(res, f)

In [None]:
# time: 2022-26-10 22:16
# file lab02.json exists in your dir: True
# file has required number of the fields: True
# top10 courses are correct: [[11556, 0.9], [13702, 0.6], [16627, 0.8], [16704, 0.7], [21617, 1.0], [23126, 0.3]]
# lab is correct: True

In [38]:
tf.unpersist()
tfidf.unpersist()

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>, words_filtered: array<string>, tf: vector, idf: vector, cos_23126: float, cos_21617: float, cos_16627: float, cos_11556: float, cos_16704: float, cos_13702: float, norm2: float]

In [42]:
langs = ['en', 'es', 'ru']

In [50]:
df = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [51]:
df = df.withColumn('words', get_words(F.col('desc')))
remover = StopWordsRemover(inputCol="words", outputCol="words_filtered")
df = remover.transform(df)

In [52]:
df_en = df.filter(F.col('lang') == 'en')
df_ru = df.filter(F.col('lang') == 'ru')
df_es = df.filter(F.col('lang') == 'es')

In [53]:
df_en.show()

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|      words_filtered|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|[course, introduc...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|[online, course, ...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, di...|[live, digitally,...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|[this, self, pace...|[self, paced, cou...|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Net

In [54]:
hashingTF = HashingTF(inputCol="words_filtered", outputCol="tf")
tf_en = hashingTF.transform(df_en)

tf_en.cache()
idf_en = IDF(inputCol="tf", outputCol="idf").fit(tf_en)
tfidf_en = idf.transform(tf_en)
tfidf_en.cache()

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>, words_filtered: array<string>, tf: vector, idf: vector]

In [59]:
hashingTF = HashingTF(inputCol="words_filtered", outputCol="tf")
tf_es = hashingTF.transform(df_es)

tf_es.cache()
idf_es = IDF(inputCol="tf", outputCol="idf").fit(tf_es)
tfidf_es = idf.transform(tf_es)
tfidf_es.cache()

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>, words_filtered: array<string>, tf: vector, idf: vector]

In [60]:
hashingTF = HashingTF(inputCol="words_filtered", outputCol="tf")
tf_ru = hashingTF.transform(df_ru)

tf_ru.cache()
idf_ru = IDF(inputCol="tf", outputCol="idf").fit(tf_ru)
tfidf_ru = idf.transform(tf_ru)
tfidf_ru.cache()

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>, words_filtered: array<string>, tf: vector, idf: vector]

In [61]:
res_ids_en = []
res_ids_es = []
res_ids_ru = []
 
for x in cources_for_res:
    id = x[0]
    if x[1] == 'en':
        idf_v = tfidf_en.filter(f'id = {id}').select('idf').collect()[0][0]
        res_ids_en.append((id, idf_v))
    elif x[1] == 'es':
        idf_v = tfidf_es.filter(f'id = {id}').select('idf').collect()[0][0]
        res_ids_es.append((id, idf_v))
    else:
        idf_v = tfidf_ru.filter(f'id = {id}').select('idf').collect()[0][0]
        res_ids_ru.append((id, idf_v))

In [62]:
for id, idf_v in res_ids_en:
    if idf_v.norm(2) == 0:
        print(id)
    cos = udf(lambda x: float(idf_v.dot(x) / (idf_v.norm(2) * x.norm(2))), FloatType())
    tfidf_en = tfidf_en.withColumn(f'cos_{id}', cos(F.col('idf')))

In [64]:
for id, idf_v in res_ids_es:
    if idf_v.norm(2) == 0:
        print(id)
    cos = udf(lambda x: float(idf_v.dot(x) / (idf_v.norm(2) * x.norm(2))), FloatType())
    tfidf_es = tfidf_es.withColumn(f'cos_{id}', cos(F.col('idf')))

In [66]:
for id, idf_v in res_ids_ru:
    if idf_v.norm(2) == 0:
        print(id)
    cos = udf(lambda x: float(idf_v.dot(x) / (idf_v.norm(2) * x.norm(2))), FloatType())
    tfidf_ru = tfidf_ru.withColumn(f'cos_{id}', cos(F.col('idf')))

In [67]:
tfidf_en = tfidf_en.withColumn('norm2', norm2(F.col('idf')))
tfidf_en = tfidf_en.filter(F.col('norm2') != 0)

tfidf_es = tfidf_es.withColumn('norm2', norm2(F.col('idf')))
tfidf_es = tfidf_es.filter(F.col('norm2') != 0)

tfidf_ru = tfidf_ru.withColumn('norm2', norm2(F.col('idf')))
tfidf_ru = tfidf_ru.filter(F.col('norm2') != 0)

In [70]:
res = dict()

In [71]:

for id, _ in res_ids_en:
    top11 = tfidf_en.orderBy(f'cos_{id}', ascending=False).limit(11).select('id', f'cos_{id}').cache()
    top11.show()
    top11_data = top11.collect()
    top11.unpersist()
    top10 = [x.id for x in top11_data if x.id != id]
    print(top10)
    res[str(id)] = top10

+-----+----------+
|   id| cos_23126|
+-----+----------+
|23126|       1.0|
|14760| 0.6735877|
|13665| 0.6418198|
|13782|0.63236856|
|15909|0.45687088|
|25782|0.31653267|
|17499| 0.2987613|
|19270|   0.28898|
|13348|0.28562516|
|25071| 0.2473614|
| 7153|0.23640837|
+-----+----------+

[14760, 13665, 13782, 15909, 25782, 17499, 19270, 13348, 25071, 7153]
+-----+----------+
|   id| cos_21617|
+-----+----------+
|21609|       1.0|
|21617|       1.0|
|21616|0.49029008|
|22298| 0.4733144|
|21608|0.47302905|
|21628|0.45244095|
|21630|0.45244095|
|21623| 0.4457215|
|21081|0.44166374|
|19417|0.43981743|
|21624|0.43226188|
+-----+----------+

[21609, 21616, 22298, 21608, 21628, 21630, 21623, 21081, 19417, 21624]


In [72]:

for id, _ in res_ids_es:
    top11 = tfidf_es.orderBy(f'cos_{id}', ascending=False).limit(11).select('id', f'cos_{id}').cache()
    top11.show()
    top11_data = top11.collect()
    top11.unpersist()
    top10 = [x.id for x in top11_data if x.id != id]
    print(top10)
    res[str(id)] = top10

+-----+----------+
|   id| cos_16627|
+-----+----------+
|16627|       1.0|
|11431|  0.655696|
|12247| 0.5208319|
|17964| 0.5011261|
|11575|0.49228275|
|12660|0.48544678|
| 5687| 0.4791413|
|25010|0.47385496|
| 5558| 0.4738044|
|10738| 0.4723437|
|17961| 0.4721328|
+-----+----------+

[11431, 12247, 17964, 11575, 12660, 5687, 25010, 5558, 10738, 17961]
+-----+----------+
|   id| cos_11556|
+-----+----------+
|11556|       1.0|
|16488|0.48849323|
|13461| 0.4322609|
|  468|0.43003628|
|23357|0.42081407|
|19330|0.38868964|
| 7833| 0.3867411|
| 9289|0.38331467|
|16929|0.37522885|
|22710|0.37061185|
|10447| 0.3696478|
+-----+----------+

[16488, 13461, 468, 23357, 19330, 7833, 9289, 16929, 22710, 10447]


In [73]:

for id, _ in res_ids_ru:
    top11 = tfidf_ru.orderBy(f'cos_{id}', ascending=False).limit(11).select('id', f'cos_{id}').cache()
    top11.show()
    top11_data = top11.collect()
    top11.unpersist()
    top10 = [x.id for x in top11_data if x.id != id]
    print(top10)
    res[str(id)] = top10

+-----+----------+
|   id| cos_16704|
+-----+----------+
|16704|       1.0|
| 1247|0.18616317|
| 1236|0.18573165|
| 1228|0.16142292|
| 1365|0.15735969|
| 1164|0.15335916|
| 1273|0.14504014|
|20288|0.14504014|
| 1233|0.14204966|
| 8203|0.13480455|
| 8186|0.13441549|
+-----+----------+

[1247, 1236, 1228, 1365, 1164, 1273, 20288, 1233, 8203, 8186]
+-----+-----------+
|   id|  cos_13702|
+-----+-----------+
|  864|        1.0|
|13702|        1.0|
|21079| 0.13550484|
| 8313|0.105693325|
| 8123| 0.09601235|
| 1041| 0.08766277|
|28074| 0.08516081|
|13057| 0.08026643|
|21987|0.077369004|
| 1033|0.068252556|
|17076| 0.06614203|
+-----+-----------+

[864, 21079, 8313, 8123, 1041, 28074, 13057, 21987, 1033, 17076]


In [74]:
import json


with open('lab02.json', 'w') as f:
    json.dump(res, f)

In [62]:

spark.stop()