In [1]:
!hdfs dfs -ls /labs/slaba02/

Found 1 items
-rw-r--r--   3 hdfs hdfs   69519728 2021-02-27 21:58 /labs/slaba02/DO_record_per_line.json


In [2]:
import os
import sys


os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [3]:
from pyspark import SparkContext, SparkConf

conf = SparkConf()
conf.set("spark.app.name", "lab2") 

sc = SparkContext(conf=conf)

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.config(conf=conf).appName("natasha pritykovskaya Spark Dataframe app").getOrCreate()

In [5]:
df = spark.read\
          .format("json")\
          .load("/labs/slaba02/DO_record_per_line.json")

In [6]:
df.show(10)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [18]:
task_arr = [23126, 21617, 16627, 11556, 16704, 13702]

In [8]:
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [9]:
import re

regex = re.compile(u'[\w\d]{2,}', re.U)

df2 = df\
        .rdd\
        .map(lambda x : (x.id, x.lang, x.name, regex.findall(x.desc.lower())))\
        .toDF()\
        .withColumnRenamed("_1","id")\
        .withColumnRenamed("_2","lang")\
        .withColumnRenamed("_3","name")\
        .withColumnRenamed("_4","desc")

In [10]:
df2.show()

+---+----+--------------------+--------------------+
| id|lang|                name|                desc|
+---+----+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[this, course, in...|
|  5|  en|American Counter ...|[this, online, co...|
|  6|  fr|Arithmétique: en ...|[this, course, is...|
|  7|  en|Becoming a Dynami...|[we, live, in, di...|
|  8|  en|           Bioethics|[this, self, pace...|
|  9|  en|College Foundatio...|[this, game, base...|
| 10|  en|Digital Literacies I|[what, in, your, ...|
| 11|  en|Digital Literacie...|[the, goal, of, t...|
| 12|  en|Digital Tools for...|[ready, to, explo...|
| 13|  en|Discover Your Val...|[this, self, pace...|
| 14|  en|Enhancing Patient...|[what, is, interp...|
| 15|  en|Ethics and Values...|[this, course, pr...|
| 16|  en| Exploring Chemistry|[chemistry, is, a...|
| 17|  en|Exploring Enginee...|[are, you, consid...|
| 18|  en|Fairy Tales: Orig...|[princess, storie...|
| 19|  en|First Peoples to ...|[this, first, i

In [11]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF
from pyspark.sql.types import DoubleType

htf = HashingTF(inputCol="desc", outputCol="tf", numFeatures=10000)
tf = htf.transform(df2)

In [12]:
tf.show()

+---+----+--------------------+--------------------+--------------------+
| id|lang|                name|                desc|                  tf|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[this, course, in...|(10000,[36,63,138...|
|  5|  en|American Counter ...|[this, online, co...|(10000,[32,222,36...|
|  6|  fr|Arithmétique: en ...|[this, course, is...|(10000,[30,118,12...|
|  7|  en|Becoming a Dynami...|[we, live, in, di...|(10000,[493,572,7...|
|  8|  en|           Bioethics|[this, self, pace...|(10000,[32,115,13...|
|  9|  en|College Foundatio...|[this, game, base...|(10000,[56,91,300...|
| 10|  en|Digital Literacies I|[what, in, your, ...|(10000,[1045,1263...|
| 11|  en|Digital Literacie...|[the, goal, of, t...|(10000,[87,157,57...|
| 12|  en|Digital Tools for...|[ready, to, explo...|(10000,[233,461,8...|
| 13|  en|Discover Your Val...|[this, self, pace...|(10000,[26,696,10...|
| 14|  en|Enhancing Patient...|[what, 

In [13]:
idf = IDF(inputCol="tf", outputCol="idf")
tfidf = idf.fit(tf).transform(tf)

In [14]:
tfidf.show()

+---+----+--------------------+--------------------+--------------------+--------------------+
| id|lang|                name|                desc|                  tf|                 idf|
+---+----+--------------------+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|
|  5|  en|American Counter ...|[this, online, co...|(10000,[32,222,36...|(10000,[32,222,36...|
|  6|  fr|Arithmétique: en ...|[this, course, is...|(10000,[30,118,12...|(10000,[30,118,12...|
|  7|  en|Becoming a Dynami...|[we, live, in, di...|(10000,[493,572,7...|(10000,[493,572,7...|
|  8|  en|           Bioethics|[this, self, pace...|(10000,[32,115,13...|(10000,[32,115,13...|
|  9|  en|College Foundatio...|[this, game, base...|(10000,[56,91,300...|(10000,[56,91,300...|
| 10|  en|Digital Literacies I|[what, in, your, ...|(10000,[1045,1263...|(10000,[1045,1263...|
| 11|  en|Digital Literacie...|[the, goal, of, t..

In [17]:
from pyspark.ml.feature import Normalizer

normalizer = Normalizer(inputCol="idf", outputCol="norm")
df3 = normalizer.transform(tfidf)

In [22]:
df3.show()

+---+----+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|lang|                name|                desc|                  tf|                 idf|                norm|
+---+----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|(10000,[36,63,138...|
|  5|  en|American Counter ...|[this, online, co...|(10000,[32,222,36...|(10000,[32,222,36...|(10000,[32,222,36...|
|  6|  fr|Arithmétique: en ...|[this, course, is...|(10000,[30,118,12...|(10000,[30,118,12...|(10000,[30,118,12...|
|  7|  en|Becoming a Dynami...|[we, live, in, di...|(10000,[493,572,7...|(10000,[493,572,7...|(10000,[493,572,7...|
|  8|  en|           Bioethics|[this, self, pace...|(10000,[32,115,13...|(10000,[32,115,13...|(10000,[32,115,13...|
|  9|  en|College Foundatio...|[this, game, base...|(10000,[56,91,300...

In [20]:
import pyspark.sql.functions as f

cos_udf = f.udf(lambda x,y: float(x.dot(y)), DoubleType())

In [32]:
def calc_for_one(task):
    task_df = df3.filter(f.col('id') == task).select(f.col('idf').alias('norm2'))
    df4 = df3.filter(f.col('id') != task).select('id', 'idf').crossJoin(task_df)
    df5 = df4.select(cos_udf('idf', 'norm2').alias("cos"), 'id')
    lst = df5.sort(f.desc('cos')).select('id').take(10)
    res = []
    for x in lst:
        res.append(x.id)
    return res

In [33]:
result = {}

for task in task_arr:
    result[task] = calc_for_one(task)

In [34]:
result

{23126: [13665, 24419, 25782, 11063, 6938, 23756, 22652, 12465, 11616, 25845],
 21617: [16971, 22366, 17221, 6776, 17101, 380, 7597, 22297, 19848, 20592],
 16627: [26336, 26670, 7944, 17839, 10749, 21053, 13275, 23303, 18979, 11575],
 11556: [26336, 26670, 7944, 13275, 16929, 17839, 21053, 10749, 18979, 8098],
 16704: [23407, 3864, 25723, 25726, 23864, 18023, 25627, 25724, 22666, 20933],
 13702: [25502, 28074, 23769, 18215, 5399, 864, 11319, 10926, 467, 25846]}