In [3]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("bea")
         .getOrCreate())

In [3]:
spark

In [5]:
df = spark.read\
          .format("json")\
          .load("/labs/slaba02/DO_record_per_line.json")

In [6]:
df.show(2)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 2 rows



In [33]:
my_en = [23126, 16627, 21617]
my_es = [16627, 11556]
my_ru = [16704, 13702]
my = my_en + my_es + my_ru

In [34]:
my

[23126, 16627, 21617, 16627, 11556, 16704, 13702]

+---+----+--------------------+--------------------+
| id|lang|                desc|      split(desc,  )|
+---+----+--------------------+--------------------+
|  4|  en|This course intro...|[This, course, in...|
|  5|  en|This online cours...|[This, online, co...|
+---+----+--------------------+--------------------+
only showing top 2 rows



In [135]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import col, udf
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql.window import Window


In [8]:
regex = RegexTokenizer (inputCol = 'desc', outputCol = 'words', pattern=r"[,\s]")

In [9]:
tokenized = regex.transform(df)

In [10]:
tokenized.select ("id", "lang", "desc", "words").show (5)

+---+----+--------------------+--------------------+
| id|lang|                desc|               words|
+---+----+--------------------+--------------------+
|  4|  en|This course intro...|[this, course, in...|
|  5|  en|This online cours...|[this, online, co...|
|  6|  fr|This course is ta...|[this, course, is...|
|  7|  en|We live in a digi...|[we, live, in, a,...|
|  8|  en|This self-paced c...|[this, self-paced...|
+---+----+--------------------+--------------------+
only showing top 5 rows



In [140]:
stop_word = StopWordsRemover.loadDefaultStopWords ("english") +  StopWordsRemover.loadDefaultStopWords ("russian") + StopWordsRemover.loadDefaultStopWords ("spanish")

In [143]:
st = StopWordsRemover(inputCol="words", outputCol="words1")
st1 = st.transform(tokenized)

In [146]:
tokenized = st1

In [147]:
ht = HashingTF(inputCol="words1", outputCol="features", numFeatures=10000)

In [148]:
result = ht.transform(tokenized)

In [149]:
idf = IDF(inputCol="features", outputCol="features2").fit(result)

In [150]:
result_tfidf = idf.transform(result)

In [79]:
result_tfidf.select("id", "lang", "desc", "words", "features2").show(2)

+---+----+--------------------+--------------------+--------------------+
| id|lang|                desc|               words|           features2|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|This course intro...|[this, course, in...|(10000,[36,63,138...|
|  5|  en|This online cours...|[this, online, co...|(10000,[32,222,36...|
+---+----+--------------------+--------------------+--------------------+
only showing top 2 rows



In [151]:
result_my = result_tfidf.where("id in (23126, 16627, 21617, 16627, 11556, 16704, 13702)").\
select(col("id").alias("id_my"), col("features2").alias("features_my"), col("lang"))

In [152]:
result1 = result_tfidf.join(result_my, ["lang"], "inner").where ("id <> id_my")

In [153]:
result2 = result1.select("lang", "id", "id_my", "features2", "features_my", cos_udf ("features2", "features_my").\
                         alias("cos")).where("cos <> 'NaN'").orderBy ("lang","id_my", col("cos").desc())

In [83]:
@f.udf (FloatType ())
def cos_udf (v1, v2):
    return float (v1.dot(v2) / (v1.norm(2) * v2.norm(2)))

In [84]:
result2.show (20)

+----+-----+-----+--------------------+--------------------+----------+
|lang|   id|id_my|           features2|         features_my|       cos|
+----+-----+-----+--------------------+--------------------+----------+
|  en|21609|21617|(10000,[161,213,3...|(10000,[161,213,3...| 0.9907613|
|  en|21608|21617|(10000,[161,173,2...|(10000,[161,213,3...|0.45537543|
|  en|21616|21617|(10000,[161,173,2...|(10000,[161,213,3...|0.45081937|
|  en|21492|21617|(10000,[128,161,4...|(10000,[161,213,3...|0.40419912|
|  en|21624|21617|(10000,[9,20,32,1...|(10000,[161,213,3...|0.34354025|
|  en|21623|21617|(10000,[9,20,32,1...|(10000,[161,213,3...| 0.3395119|
|  en|21630|21617|(10000,[9,20,32,1...|(10000,[161,213,3...|0.33804896|
|  en|21628|21617|(10000,[9,20,32,1...|(10000,[161,213,3...|0.33794484|
|  en|21857|21617|(10000,[157,161,3...|(10000,[161,213,3...|0.33376485|
|  en|21506|21617|(10000,[156,157,1...|(10000,[161,213,3...|0.32997456|
|  en|21675|21617|(10000,[157,161,3...|(10000,[161,213,3...|0.32

In [154]:
windowSpec = Window.partitionBy("id_my").orderBy(col("cos").desc())
final = result2.select ("lang", "id", "id_my", "features2", "features_my", "cos", f.row_number().\
                over(windowSpec).alias("row_num")).where("row_num <= 10").cache()

In [111]:
final.show()

+----+-----+-----+--------------------+--------------------+----------+-------+
|lang|   id|id_my|           features2|         features_my|       cos|row_num|
+----+-----+-----+--------------------+--------------------+----------+-------+
|  en|13665|23126|(10000,[51,93,128...|(10000,[87,91,96,...|0.47085336|      1|
|  en|13782|23126|(10000,[1263,1470...|(10000,[87,91,96,...| 0.4523038|      2|
|  en|24419|23126|(10000,[1,50,77,8...|(10000,[87,91,96,...| 0.4126953|      3|
|  en|20638|23126|(10000,[34,3775,4...|(10000,[87,91,96,...|0.40465704|      4|
|  en| 2724|23126|(10000,[26,173,36...|(10000,[87,91,96,...|0.36599666|      5|
|  en|25782|23126|(10000,[15,24,91,...|(10000,[87,91,96,...| 0.3037805|      6|
|  en|15909|23126|(10000,[476,982,1...|(10000,[87,91,96,...|0.26506677|      7|
|  en|23756|23126|(10000,[12,40,45,...|(10000,[87,91,96,...|0.26087302|      8|
|  en|13348|23126|(10000,[32,242,46...|(10000,[87,91,96,...| 0.2598476|      9|
|  en|17499|23126|(10000,[32,242,61...|(

In [155]:
final1 = final.groupBy("id_my").agg(f.collect_list("id")).collect()

In [156]:
final_export = {str(x[0]): x[1] for x in final1}




In [130]:
final_export

{'23126': [13665,
  13782,
  24419,
  20638,
  2724,
  25782,
  15909,
  23756,
  13348,
  17499],
 '16627': [11431, 12247, 11575, 13021, 5687, 25010, 9598, 12660, 17964, 12863],
 '13702': [864, 8082, 8313, 1216, 19613, 1052, 7173, 17017, 21017, 915],
 '16704': [1365, 1236, 1426, 20105, 20645, 1164, 8217, 1119, 1247, 1219],
 '11556': [16488, 13461, 468, 23357, 9289, 7833, 10447, 22710, 19330, 10384],
 '21617': [21609,
  21608,
  21616,
  21492,
  21624,
  21623,
  21630,
  21628,
  21857,
  21506]}

In [157]:
import json


with open('lab02.json', 'w') as f:
    json.dump(final_export, f, indent=2)
    print("New json file is created")

New json file is created


In [68]:
sc.stop()