In [1]:
import os
import sys

# Bootstrap PySpark inside a plain Jupyter kernel: export the environment
# variables Spark reads, make the bundled Python packages importable, then run
# Spark's own shell start-up script in this namespace.
home = os.environ["HOME"]
spark_home = '/usr/hdp/current/spark2-client'

# Reuse spark_home so the installation path is defined in exactly one place.
os.environ["SPARK_HOME"] = spark_home
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
# spark-submit stops parsing options once it reaches the primary resource, so
# every option has to come BEFORE the trailing `pyspark-shell` token; anything
# placed after it is passed along as an application argument instead.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Read the script through a context manager so the handle is closed, and
# compile it with its real path so tracebacks point at shell.py.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

In [2]:
from pyspark import SparkConf, SparkContext, HiveContext
from pyspark.sql.types import DoubleType, StringType, ArrayType
from pyspark.sql.functions import udf, lit, col, row_number, collect_list, concat, concat_ws
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer

# Configuration requested for this lab: YARN master, two executors, and
# cartesian (cross) joins permitted in Spark SQL.
conf = (
    SparkConf()
    .setAppName("lab02")
    .setMaster("yarn")
    .set("spark.executor.instances", 2)
    .set("spark.sql.crossJoin.enabled", True)
)

# NOTE(review): getOrCreate returns an already-running context when one
# exists; if an earlier cell started one, `conf` may not be applied — confirm.
sc = SparkContext.getOrCreate(conf)
spark = HiveContext(sc)

# Last expression of the cell: display the YARN application id.
sc.getConf().get("spark.app.id")

'application_1665753715559_2711'

In [3]:
import re
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable 

class CustomTokenizer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Pipeline stage that lower-cases a text column and splits it into tokens.

    A token is any run of two or more Unicode word characters (letters,
    digits, underscore); everything else acts as a separator. The result is
    written to ``outputCol`` as an array of strings. A null input value
    produces an empty token list instead of failing the job.

    Parameters
    ----------
    inputCol : str, optional
        Name of the string column to tokenize.
    outputCol : str, optional
        Name of the array<string> column to create.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(CustomTokenizer, self).__init__()
        # keyword_only stores the explicitly passed keyword arguments here.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set the stage parameters from keyword arguments; returns ``self._set(...)``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the token array added as the output column."""
        # Compile once per transform call rather than once per row. The raw
        # string spells the same pattern without invalid escape sequences.
        token_pattern = re.compile(r'[\w\d]{2,}', re.U)

        def tokenize(text):
            # Rows with a null input value have nothing to tokenize.
            if text is None:
                return []
            return token_pattern.findall(text.lower())

        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(tokenize, t)(in_col))

def _cosine_similarity(i, j):
    """Cosine similarity of two vectors: dot(i, j) / (||i||_2 * ||j||_2)."""
    norms_product = i.norm(2) * j.norm(2)
    return float(i.dot(j) / norms_product)


# Column-level wrapper returning the similarity as a double.
cosine_udf = udf(_cosine_similarity, DoubleType())

In [4]:
# Courses assigned to this lab variant, one [id, language, name] entry each.
courses_variant = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]
# Only the ids are needed to pick these courses out of the catalogue.
courses_ids = [course_id for course_id, _lang, _name in courses_variant]
courses_ids

[23126, 21617, 16627, 11556, 16704, 13702]

In [5]:
# import pandas as pd
# courses_variant_df = pd.DataFrame(courses_variant, columns=['id', 'lang','name'])
# courses_variant_sdf = spark.createDataFrame(data=courses_variant_df)
# courses_ids = courses_variant_sdf.select('id').rdd.flatMap(lambda x: x).collect()
# courses_ids

[23126, 21617, 16627, 11556, 16704, 13702]

In [6]:
# Read the course catalogue from its JSON file and preview the first rows.
courses_path = "/labs/slaba02/DO_record_per_line.json"
courses_raw = spark.read.json(courses_path)
courses_raw.show(n=20)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [7]:
# Partition the catalogue by language: Russian, English, and everything else.
lang_col = col("lang")
courses_ru = courses_raw.where(lang_col == "ru")
courses_en = courses_raw.where(lang_col == "en")
courses_other = courses_raw.where(~lang_col.isin("ru", "en"))
courses_langs = [courses_ru, courses_en, courses_other]

In [8]:
# tokenizer = Tokenizer(inputCol="desc", outputCol="words")
# words = tokenizer.transform(courses_raw)
# stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="words_filter")
# words_filter = stopWordsRemover.transform(words)
# hashingTF = HashingTF(inputCol="words_filter", outputCol="tf")
# tf = hashingTF.transform(words_filter)
# idf = IDF(inputCol="tf", outputCol="features").fit(tf)
# tfidf = idf.transform(tf)
# normalizer = Normalizer(inputCol="features", outputCol="norm")
# data = normalizer.transform(tfidf)
# data.select("id", "words", "words_filter", "features").show()

+---+--------------------+--------------------+--------------------+
| id|               words|        words_filter|            features|
+---+--------------------+--------------------+--------------------+
|  4|[this, course, in...|[course, introduc...|(262144,[7532,891...|
|  5|[this, online, co...|[online, course, ...|(262144,[1598,172...|
|  6|[this, course, is...|[course, taught, ...|(262144,[1244,106...|
|  7|[we, live, in, a,...|[live, digitally,...|(262144,[836,4525...|
|  8|[this, self-paced...|[self-paced, cour...|(262144,[619,3535...|
|  9|[this, game-based...|[game-based, cour...|(262144,[1889,286...|
| 10|[what’s, in, your...|[what’s, digital,...|(262144,[12250,27...|
| 11|[the, goal, of, t...|[goal, digital, l...|(262144,[7416,183...|
| 12|[ready, to, explo...|[ready, explore, ...|(262144,[20457,20...|
| 13|[this, self-paced...|[self-paced, cour...|(262144,[37750,43...|
+---+--------------------+--------------------+--------------------+
only showing top 10 rows



In [9]:
# Feature pipeline fitted on the whole catalogue:
# desc -> words -> words_filter -> tf -> features -> norm
tokenizer = CustomTokenizer(outputCol="words", inputCol="desc")
stopWordsRemover = StopWordsRemover(outputCol="words_filter", inputCol="words")
hashingTF = HashingTF(outputCol="tf", inputCol="words_filter")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=1)
normalizer = Normalizer(outputCol="norm", inputCol="features")

preProcStages = [
    tokenizer,
    stopWordsRemover,
    hashingTF,
    idf,
    normalizer,
]
pipeline = Pipeline(stages=preProcStages)

# Fit and apply on the same frame, then preview a few of the derived columns.
model = pipeline.fit(courses_raw)
data = model.transform(courses_raw)
preview_columns = ["id", "words", "words_filter", "features"]
data.select(*preview_columns).show(10)

+---+--------------------+--------------------+
| id|            features|                norm|
+---+--------------------+--------------------+
|  4|(262144,[4211,753...|(262144,[4211,753...|
|  5|(262144,[1598,172...|(262144,[1598,172...|
|  6|(262144,[7601,106...|(262144,[7601,106...|
|  7|(262144,[836,4525...|(262144,[836,4525...|
|  8|(262144,[619,2042...|(262144,[619,2042...|
|  9|(262144,[1889,402...|(262144,[1889,402...|
| 10|(262144,[12250,12...|(262144,[12250,12...|
| 11|(262144,[7416,183...|(262144,[7416,183...|
| 12|(262144,[16121,17...|(262144,[16121,17...|
| 13|(262144,[43996,48...|(262144,[43996,48...|
+---+--------------------+--------------------+
only showing top 10 rows



In [12]:
from functools import reduce
from pyspark.sql import DataFrame

# Feature pipeline: desc -> words -> words_filter -> tf -> features -> norm
tokenizer = CustomTokenizer(inputCol="desc", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="words_filter")
hashingTF = HashingTF(inputCol="words_filter", outputCol="tf")
idf = IDF(minDocFreq=1, inputCol="tf", outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="norm")

preProcStages = [tokenizer, stopWordsRemover, hashingTF, idf, normalizer]
pipeline = Pipeline(stages=preProcStages)

# Fit the pipeline separately on every language subset, so the IDF stage is
# estimated from that language's documents only.
courses_list = []
for courses_lang in courses_langs:
    model = pipeline.fit(courses_lang)
    data = model.transform(courses_lang)
    courses_list.append(data)

# Stitch the per-language frames back together. `union` replaces the
# deprecated `unionAll`; it matches columns by position, which is safe here
# because every frame comes out of the same pipeline.
courses = reduce(DataFrame.union, courses_list)

In [13]:
# Pair every course of this variant ("i") with every other course ("j") and
# score each pair with the cosine similarity of their `norm` vectors.
variant_courses = courses.filter(col("id").isin(courses_ids)).alias("i")
candidate_courses = courses.filter(~col("id").isin(courses_ids)).alias("j")

courses_cosine = (
    # An explicit crossJoin states the intended cartesian product and does not
    # depend on spark.sql.crossJoin.enabled being set on the running session.
    variant_courses.crossJoin(candidate_courses)
    .select(
        col("i.id").alias("i"),
        col("j.id").alias("j"),
        cosine_udf("i.norm", "j.norm").alias("cosine"),
    )
    # Discard pairs with a missing value in any of the three columns.
    .na.drop("any")
    .sort("i", "j")
    .cache()
)
courses_cosine.show(10, False)

+-----+---+---------------------+
|i    |j  |cosine               |
+-----+---+---------------------+
|11556|4  |1.1588395991968858E-4|
|11556|5  |1.2107104601292938E-4|
|11556|6  |0.0056017904338261735|
|11556|7  |0.006421368255308077 |
|11556|8  |2.165499882280121E-4 |
|11556|9  |0.004655588580904408 |
|11556|10 |1.4134647607303173E-4|
|11556|11 |1.5108864886008257E-4|
|11556|12 |0.001955294352430839 |
|11556|13 |0.00665510819623961  |
|11556|14 |7.251027573754172E-4 |
|11556|15 |1.8650100219093103E-4|
|11556|16 |1.4047445252391848E-4|
|11556|17 |1.2472397625644356E-4|
|11556|18 |0.0                  |
|11556|19 |1.1835813111225394E-4|
|11556|20 |0.001938116219564554 |
|11556|21 |5.644162541220802E-5 |
|11556|22 |5.186429992949809E-4 |
|11556|23 |0.007832670387017144 |
+-----+---+---------------------+
only showing top 20 rows



In [14]:
from pyspark.sql.functions import sort_array, struct

# Number of recommendations kept per course.
top_n = 10

# Rank candidates per course by similarity, best first. The secondary key on
# `j` makes the ranking deterministic when several candidates tie on cosine.
ranking = Window.partitionBy("i").orderBy(col("cosine").desc(), col("j").asc())

result_df = (
    courses_cosine
    .withColumn("row_number", row_number().over(ranking))
    .filter(col("row_number") <= top_n)
    .groupBy("i")
    # collect_list alone gives no ordering guarantee after the aggregation, so
    # carry the rank along in a struct and sort the collected array by it.
    .agg(sort_array(collect_list(struct("row_number", "j"))).alias("ranked"))
    # Selecting a field of an array of structs yields the array of that field.
    .select("i", col("ranked.j").alias("j"))
    .orderBy(col("i"))
    .cache()
)
result_df.show()

+-----+--------------------+
|    i|                   j|
+-----+--------------------+
|11556|[16488, 468, 1346...|
|13702|[864, 21079, 1594...|
|16627|[11431, 12247, 17...|
|16704|[4592, 1327, 2036...|
|21617|[21609, 21616, 22...|
|23126|[14760, 13665, 13...|
+-----+--------------------+



In [15]:
import json
# Bring the result to the driver as plain dicts by serialising every row to
# JSON and parsing it back, then key the recommendation lists by course id.
result_list = result_df.toJSON().map(json.loads).collect()
json_dict = dict((record["i"], record["j"]) for record in result_list)
json_dict

{11556: [16488, 468, 13461, 11523, 22710, 23357, 19330, 10447, 21707, 9465],
 13702: [864, 21079, 15946, 8617, 8123, 1396, 1041, 1033, 22053, 8313],
 16627: [11431, 12247, 17964, 17961, 5687, 5558, 16694, 12660, 27487, 27879],
 16704: [4592, 1327, 20362, 1228, 1236, 1247, 1365, 26980, 8186, 875],
 21617: [21609, 21616, 22298, 21608, 21628, 21630, 21081, 21623, 19417, 21624],
 23126: [14760, 13665, 13782, 15909, 17864, 25782, 17499, 13348, 19270, 25071]}

In [16]:
# import pprint
# json_pformat = pprint.pformat(json_dict, indent=2).replace("'", '"')
# print(json_pformat)

{ 11556: [16488, 468, 13461, 11523, 22710, 23357, 19330, 10447, 21707, 9465],
  13702: [864, 21079, 15946, 8617, 8123, 1396, 1041, 1033, 22053, 8313],
  16627: [11431, 12247, 17964, 17961, 5687, 5558, 16694, 12660, 27487, 27879],
  16704: [4592, 1327, 20362, 1228, 1236, 1247, 1365, 26980, 8186, 875],
  21617: [21609, 21616, 22298, 21608, 21628, 21630, 21081, 21623, 19417, 21624],
  23126: [14760, 13665, 13782, 15909, 17864, 25782, 17499, 13348, 19270, 25071]}


In [17]:
# Persist the recommendations as lab02.json in the user's home directory.
output_path = os.path.join(home, "lab02.json")
with open(output_path, "w") as out_file:
    payload = json.dumps(json_dict)
    out_file.write(payload)

In [18]:
# result_str = result_df.select("i", concat(lit("["), concat_ws(",", col("j")), lit("]")).alias("j"))
# result_str.show()

+-----+--------------------+
|    i|                   j|
+-----+--------------------+
|11556|[16488,468,13461,...|
|13702|[864,21079,15946,...|
|16627|[11431,12247,1796...|
|16704|[4592,1327,20362,...|
|21617|[21609,21616,2229...|
|23126|[14760,13665,1378...|
+-----+--------------------+



In [19]:
# result_str.coalesce(1).orderBy("i").write.mode("overwrite").option("header", False).csv("lab02.json")

In [20]:
# ! rm -f lab02.json
# ! hdfs dfs -get lab02.json/part-* lab02.json

In [21]:
# Final cell: stop the SparkContext now that the results have been written.
sc.stop()