In [1]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark installation,
# and request three executors for the shell that backs this notebook.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's bundled py4j archive and its Python sources to the import
# path; the py4j archive ends up first, the Spark sources second.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkContext, SparkConf
import json
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Application name under which this notebook's Spark session is configured.
conf = SparkConf().set("spark.app.name", "Lab2")

# Reuse an already running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)


In [26]:
import pyspark.sql.functions as F
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType, FloatType
    
# Course ids for which the most similar courses must be recommended.
targetIds = [23126, 21617, 16627, 11556, 16704, 13702]
# Maximum number of recommendations kept per target course.
TOP_N = 10

df = spark.read.json("/labs/slaba02/DO_record_per_line.json")

# Use the course description as the text to vectorise and strip every
# character that is not a Latin/Cyrillic letter or a space.
# NOTE(review): the range А-Яа-я does not contain "ё"/"Ё", so those letters are
# removed as well - confirm this is intended before changing the pattern.
sentenceData = df.withColumnRenamed("desc", "sentence")
sentenceData = sentenceData.select("sentence", "id", "lang")
sentenceData = sentenceData.withColumn("sentence", F.regexp_replace(F.col("sentence"), "[^A-Za-zА-Яа-я ]", ""))

# Text -> tokens -> tokens without stop words (default stop-word list, no
# `stopWords` argument is passed) -> hashed term frequencies -> TF-IDF.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words_first")
wordsData = tokenizer.transform(sentenceData)

remover = StopWordsRemover(inputCol="words_first", outputCol="words")
wordsData = remover.transform(wordsData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)

rescaledData = idfModel.transform(featurizedData)
res = rescaledData.select("features", "id", "lang")

# `res` is scanned several times per target id below, so keep it cached.
res.cache()


def _cosine_similarity(v1, v2):
    """Return the cosine similarity of two ML vectors as a Python float.

    Returns 0.0 when either vector is missing or has zero length, i.e. in the
    cases where the similarity is undefined, instead of hiding every possible
    error behind a bare ``except``.
    """
    if v1 is None or v2 is None:
        return 0.0
    denominator = float(v1.norm(2) * v2.norm(2))
    if denominator == 0.0:
        return 0.0
    return float(v1.dot(v2)) / denominator


# Defined once (not on every loop iteration) and with an explicit numeric
# return type, so no string -> float cast of the result is needed afterwards.
sim_cos = udf(_cosine_similarity, DoubleType())

recommendations = {}
for targetId in targetIds:
    targetDf = res.filter(F.col("id") == targetId)
    targetRow = targetDf.select(F.col("lang")).first()
    if targetRow is None:
        raise ValueError("target id {} is not present in the dataset".format(targetId))
    lang = targetRow[0]

    # Single-row frame holding the target's TF-IDF vector; cross-joining it
    # puts that vector next to every candidate course.
    targetDf = targetDf.select(F.col("features").alias("features2"))

    # Candidates: courses in the same language, excluding the target itself.
    targetRes = (
        res.filter(F.col("lang") == lang)
        .filter(F.col("id") != targetId)
        .crossJoin(targetDf)
        .withColumn("cos", sim_cos(F.col("features"), F.col("features2")))
        .filter(F.col("cos").isNotNull())
        .select(F.col("id"), F.col("cos"), F.col("lang"))
        .orderBy(F.col("cos").desc())
        .limit(TOP_N)
    )

    predicts = targetRes.select(F.col("id")).collect()
    # Works for fewer than TOP_N candidates too (no hard-coded indexing).
    recommendations[targetId] = [row[0] for row in predicts]

resJson = json.dumps(recommendations)

print(resJson)
# Write (not append): re-running the cell must leave exactly one JSON object
# in the file, otherwise the file is no longer valid JSON.
with open("lab02.json", "w") as f:
    f.write(resJson)

{"23126": [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 13348, 17499], "21617": [21609, 21608, 21616, 21492, 21703, 21624, 21700, 21623, 21508, 21506], "16627": [11431, 12247, 5687, 17964, 11575, 13021, 17961, 25010, 16694, 12660], "11556": [16488, 13461, 468, 23357, 7833, 9289, 19330, 16929, 22710, 5750], "16704": [1247, 20288, 1273, 8203, 1365, 1236, 1233, 20645, 1164, 1426], "13702": [864, 1052, 8082, 1216, 8313, 17017, 13057, 19613, 21079, 20105]}
