In [None]:
import os
import sys
# Environment variables set before PySpark is imported: the Python interpreter,
# the Spark installation directory and the submit arguments (executor count,
# executor/driver memory, cores per executor).
spark_env = {
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell',
}
for variable, value in spark_env.items():
    os.environ[variable] = value

spark_home = os.environ.get('SPARK_HOME')

# Make the Spark-bundled Python packages importable. Each entry is inserted at
# position 0, so the py4j archive ends up ahead of the `python` directory.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Session configuration: only the application name is set here; resources come
# from PYSPARK_SUBMIT_ARGS.
conf = SparkConf().set("spark.app.name", "RIK_lab2")

# Build the session from that configuration (an already running session is reused).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [None]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [None]:
from pyspark import keyword_only

from pyspark.ml import Transformer, Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters

from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, ArrayType, StringType, IntegerType

from pyspark.sql.window import Window
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower, pandas_udf, row_number, explode

import json
import re

In [None]:
# Courses assigned for this lab variant, one [id, lang, name] entry per course.
given_courses = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# Only the ids are needed to select these courses from the full dataset.
id_given_courses = [course_id for course_id, _lang, _name in given_courses]

# Optional inspection of the list as a Spark DataFrame (disabled):
# given_courses = spark.createDataFrame(data=given_courses, schema = ["id","lang", "name"])
# given_courses.printSchema()
# given_courses.show(truncate=False)

In [None]:
# Load the course catalogue (one JSON record per line).
raw_courses = spark.read.json('/labs/slaba02/DO_record_per_line.json')

# Keep only the three languages used by the given courses, then spread the rows
# over 10 partitions.
supported_lang = raw_courses.lang.isin('en', 'es', 'ru')
data = raw_courses.filter(supported_lang).repartition(10)

In [None]:
class ClearStringTransformer(Transformer):
  """Rewrite the ``desc`` column, keeping only runs of three or more word characters.

  The matched runs are joined with single spaces, so punctuation and tokens
  shorter than three characters disappear from the text. A null ``desc`` is
  mapped to an empty string instead of failing inside the UDF.
  """

  def _transform(self, df: DataFrame) -> DataFrame:
    """Return ``df`` with ``desc`` replaced by its cleaned version."""
    # Raw literal: in a plain string '\w' and '\d' are invalid escape sequences,
    # which newer Python versions warn about.
    regex = re.compile(r'[\w\d]{3,}', re.U)

    def clean(text):
      # re.findall raises TypeError on None, which would abort the whole job.
      if text is None:
        return ''
      return ' '.join(regex.findall(text))

    # The closure captures only `regex`, not `self`.
    transform_udf = udf(clean, StringType())
    return df.withColumn('desc', transform_udf('desc'))

class CatToParamTransformer(Transformer):
  """Add a ``cat_param`` column: every digit run in ``cat`` that is directly followed by ``/``.

  The values are kept as strings in an array column. A null ``cat`` yields an
  empty array instead of failing inside the UDF.
  """

  def _transform(self, df: DataFrame) -> DataFrame:
    """Return ``df`` with the extra ``cat_param`` array column."""
    # Raw literal: in a plain string '\d' is an invalid escape sequence, which
    # newer Python versions warn about.
    regex = re.compile(r'(\d+)/', re.U)

    def extract_ids(cat):
      # re.findall raises TypeError on None, which would abort the whole job.
      if cat is None:
        return []
      return regex.findall(cat)

    # The closure captures only `regex`, not `self`.
    transform_udf = udf(extract_ids, ArrayType(StringType()))
    return df.withColumn('cat_param', transform_udf('cat'))

In [None]:
# --- Category branch: `cat` -> list of ids (`cat_param`) -> binary count vector.
CTPT = CatToParamTransformer()
cat_count_vectorizer = CountVectorizer(inputCol='cat_param', outputCol="cat_vector", binary=True)

# --- Description branch: clean text -> tokens -> stop-word filter -> TF -> IDF -> normalise.
CST = ClearStringTransformer()
tokenizer = Tokenizer(inputCol="desc", outputCol="word")

# Default stop words of the three dataset languages, concatenated in this order.
all_stop_words = [
    stop_word
    for language in ('english', 'russian', 'spanish')
    for stop_word in StopWordsRemover.loadDefaultStopWords(language)
]
swf = StopWordsRemover(stopWords=all_stop_words, inputCol="word", outputCol='word_swf')

hasher = HashingTF(numFeatures=100000, binary=False, inputCol='word_swf', outputCol="tf")
idf = IDF(inputCol="tf", outputCol="idf_feature")
normalizer = Normalizer(inputCol="idf_feature", outputCol="norm")

# --- Final feature vector: category vector followed by the normalised TF-IDF vector.
assembler = VectorAssembler(inputCols=["cat_vector", "norm"], outputCol="features")

pipeline_stages = [
    CTPT,
    cat_count_vectorizer,
    CST,
    tokenizer,
    swf,
    hasher,
    idf,
    normalizer,
    assembler,
]
pipeline = Pipeline(stages=pipeline_stages)

pipeline_model = pipeline.fit(data)

# Source and intermediate columns are dropped after the transform, leaving
# `features` and the remaining input columns.
columns_to_drop = [
    'desc', 'word', 'word_swf', 'word_vector',
    'cat', 'cat_param', 'provider', 'tf', 'idf_feature',
    'cat_vector', 'norm',
]
data_feature = pipeline_model.transform(data).drop(*columns_to_drop)

In [None]:
# Feature rows of the given courses.
is_given_course = col('id').isin(id_given_courses)
given_courses_feature = data_feature.filter(is_given_course)

# Every other course: an anti-join on `id` removes the given ones. The result is
# cached because it is the candidate pool for the similarity join.
candidate_pool = data_feature.join(given_courses_feature, on="id", how="leftanti")
all_minus_given_courses = candidate_pool.coalesce(10).cache()

In [None]:
# Similarity score: dot product of the two `features` vectors
# (this equals cosine similarity only when both vectors have unit length).
def _dot(left, right):
    return float(left.dot(right))

dot_udf = udf(_dot, DoubleType())

# Pair every given course ("i") with every candidate ("j") of the same language.
given_side = given_courses_feature.alias("i")
candidate_side = all_minus_given_courses.alias("j")
same_language = col("i.lang") == col("j.lang")

scored_pairs = given_side.join(candidate_side, same_language).select(
    col("i.id").alias("id"),
    col("j.id").alias("recomended_id"),
    col("j.name").alias("name"),
    dot_udf("i.features", "j.features").alias("dot"),
)

# Best score first within each given course; name and id break ties.
result = (
    scored_pairs
    .sort(col("id"), col("dot").desc(), col("name"), col("recomended_id"))
    .coalesce(10)
    .cache()
)
result.show(10)

In [None]:
# Rank the candidates of each given course (best score first; name and id break
# ties) and keep the ten top-ranked rows per course.
windowSpec = (
    Window
    .partitionBy("id")
    .orderBy(col("id"), col("dot").desc(), col("name"), col("recomended_id"))
)
ranked = result.withColumn("row_number", row_number().over(windowSpec))
result = ranked.filter(col("row_number") <= 10).coalesce(10).cache()

In [None]:
# Bring the (course id, recommended id) pairs to the driver.
# Sort explicitly by rank first: without a final ordering the row order returned
# by collect() is not guaranteed, and the per-course lists built from these rows
# must keep the best-first ranking.
result_driver = (
    result
    .orderBy(col("id"), col("row_number"))
    .select(col("id"), col("recomended_id"))
    .collect()
)

In [None]:
# Group the recommended ids by given course id, keeping the order of the collected rows.
result_driver_json = {}
for row in result_driver:
    result_driver_json.setdefault(row.id, []).append(row.recomended_id)
result_driver_json

In [None]:
import json

# Write the recommendations to lab02.json in the working directory.
with open('lab02.json', 'w', encoding='utf-8') as output_file:
    json.dump(result_driver_json, output_file)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()