In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-cores 3 --executor-memory 3g --conf spark.locality.wait=0s --driver-memory 10g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector, VectorUDT, Vectors

conf = SparkConf()
conf.set("spark.app.name", "Maksim Yudin lab02") 

spark = SparkSession.builder.config(conf=conf).appName("Maksim Yudin lab02").getOrCreate()

In [3]:
def getAllCoursesDf(spark: SparkSession):
    courses = spark.read.json("hdfs:///labs/slaba02/DO_record_per_line.json") \
        .select(col("id"), col("lang"), col('name'), col("desc"))
    
    return courses

In [4]:
featureCount = 10000

allCoursesDf = getAllCoursesDf(spark)


In [5]:
def getPredictCoursesDf(spark: SparkSession):
    sourceData = spark.createDataFrame([
        (23126, "en", "Compass - powerful SASS library that makes your life easier"),
        (21617, "en", "Preparing for the AP* Computer Science A Exam \u2014 Part 2"),
        (16627, "es", "Aprende Excel: Nivel Intermedio by Alfonso Rinsche"),
        (11556, "es", "Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo"),
        (16704, "ru",
        "\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus"),
        (13702, "ru",
        "\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430")
    ], ["id", "lang", 'desc'])

    return sourceData

In [6]:
predictDataDf = getPredictCoursesDf(spark)
predictCoursesDf = predictDataDf.alias("predictDataDf").join(allCoursesDf.alias("allCoursesDf"), predictDataDf.id == allCoursesDf.id, "left")\
.select(col("predictDataDf.id"), col("predictDataDf.lang"), col("predictDataDf.desc").alias('name'), col("allCoursesDf.desc").alias('desc'))
predictCoursesDf.printSchema()

root
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- desc: string (nullable = true)



In [7]:
def fitDataframe(allCourses):
    tokenizer = Tokenizer(inputCol="desc", outputCol="words")
    wordsData = tokenizer.transform(allCourses)

    remover = StopWordsRemover(inputCol="words", outputCol="cleanWords")
    cleanWordsData = remover.transform(wordsData) \
        .select(col("id"), col("lang"),\
                col('name'),\
                col('cleanWords').alias("words"))

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=featureCount)
    featurizedData = hashingTF.transform(cleanWordsData)
    
    #featurizedData.show(1, truncate=0, vertical=True)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    #rescaledData = rescaledData.drop("words", "rawFeatures") \
        #.drop("words", "rawFeatures")
    #rescaledData.show(1, truncate=0, vertical=True)

    return rescaledData

In [8]:
def denseVectorColumn(x):
    return udf(lambda: x, VectorUDT())()

def to_dense(sparse_vector):
    return Vectors.dense(sparse_vector)

def to_sparse(dense_vector):
    size = len(dense_vector)
    pairs = [(i, v) for i, v in enumerate(dense_vector.values.tolist()) if v != 0]
    return Vectors.sparse(size, pairs)

def calcCos(v, u):
    try:
        return float(v.dot(u) / (v.norm(2) * u.norm(2)))
    except Exception as e:
        return 0.0

In [9]:
sparseToDenseUdf = udf(to_dense, VectorUDT())
calcCosUdf = udf(lambda v, u: calcCos(v, u), FloatType())

In [17]:
dfLanguages = predictCoursesDf.select(col('lang')).distinct()
dfLangArray = dfLanguages.collect()
featuresByLang = {}
for l in dfLangArray:
    dataByLangDf = allCoursesDf.where(col('lang') == l[0])
    #dfByLang.printSchema()
    fitDataDf = fitDataframe(dataByLangDf)
    
    resultDf = fitDataDf.withColumn('featureAsDenseVector', sparseToDenseUdf(fitDataDf['features']))\
        .drop('words', 'rawFeatures', 'features')
    
    resultDf.repartition(1)
    resultDf.cache()
    resultDf.count()
    
    
    featuresByLang[l[0]] = resultDf

In [18]:
#allCoursesRescaledData = fitDataframe(allCourses)

fitPredictDataDf = fitDataframe(predictCoursesDf)
#fitPredictDataDf.show(1, truncate=0, vertical=True)

In [19]:
fitPredictDataDf = fitPredictDataDf\
        .withColumn('featureAsDenseVector', sparseToDenseUdf(fitPredictDataDf['features']))\
        .drop('words', 'rawFeatures', 'features')

#fitPredictDataDf.where(col('id') == 23126).show(1, truncate=0, vertical=True)

In [20]:
def vector_column(x): 
    return udf(lambda: x, VectorUDT())()

fitPredictDataDfArray = fitPredictDataDf.where(col('id') == 23126).collect()
fitPredictDataDf.show(1)

rowIndex = 0
strJson = '{\n'

for course in fitPredictDataDfArray:
    courseId = course[0]
    courseLang = course[1]
    print(courseId)
    print(courseLang)
    
    allFeatureLangDF = featuresByLang[courseLang]
    coursesFeaturesById = fitPredictDataDf.where(col("id") == courseId)
    #allFeatureLangDF.show()
    #coursesFeaturesById.show()
    
    
    vec = DenseVector(course[3])
    
    df = allFeatureLangDF.where(col("lang") == courseLang)\
        .withColumn("oneCourseFeatureAsDenseVector", vector_column(vec))
    #df.show(10)
    #df.printSchema()
    resultFinal = df\
    .withColumn("cosSimilarity", calcCosUdf(df["oneCourseFeatureAsDenseVector"], df["featureAsDenseVector"]))\
    #.select(lit(courseId), col("id"), col("name"), col("cosSimilarity"))\
    
    res = resultFinal.orderBy(col("cosSimilarity").desc(), col("name").asc(), col("id").asc())\
    .limit(10)
    
    res.show()
    
    courses = res.collect()
    strJson += '"' + str(row[0]) + '": ['
    courseIndex = 0
    for course in courses:
        courseIndex += 1
        strJson += str(course[1])
        if (courseIndex < len(courses)):
            strJson += ', '
        else:
            strJson += ']'

    if (rowIndex < len(rescaledSourceDataCollected)):
        strJson += ', \n'
        
strJson += '\n}'
print(strJson)    
    

+-----+----+--------------------+--------------------+
|   id|lang|                name|featureAsDenseVector|
+-----+----+--------------------+--------------------+
|23126|  en|Compass - powerfu...|[0.0,0.0,0.0,0.0,...|
+-----+----+--------------------+--------------------+
only showing top 1 row

23126
en
+-----+----+--------------------+--------------------+-----------------------------+-------------+
|   id|lang|                name|featureAsDenseVector|oneCourseFeatureAsDenseVector|cosSimilarity|
+-----+----+--------------------+--------------------+-----------------------------+-------------+
| 8571|  en|Adjust the Bust: ...|[0.0,0.0,0.0,0.0,...|         [0.0,0.0,0.0,0.0,...|          NaN|
| 8624|  en|Amigurumi: Design...|[0.0,0.0,0.0,0.0,...|         [0.0,0.0,0.0,0.0,...|          NaN|
| 8835|  en| Artistic Digitizing|[0.0,0.0,0.0,0.0,...|         [0.0,0.0,0.0,0.0,...|          NaN|
|  574|  en|Business Analysis...|[0.0,0.0,0.0,0.0,...|         [0.0,0.0,0.0,0.0,...|          NaN|

NameError: name 'row' is not defined