In [None]:
# Start Spark: point this kernel at the Spark client install and run the
# PySpark shell bootstrap script inside the notebook namespace.
import glob
import os
import sys

os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Find the bundled py4j archive by pattern rather than pinning one version,
# so the cell keeps working after the Spark client is upgraded.
py4j_archives = sorted(glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')))
if not py4j_archives:
    raise ValueError('No py4j source archive found under %s/python/lib' % spark_home)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, py4j_archives[-1])

# Execute the shell script in this namespace. The file is closed
# deterministically, and compile() keeps the real file name in tracebacks.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField, FloatType, ArrayType, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, VectorUDT

import json
import re
import pyspark.sql.functions as f


# Spark configuration for this notebook: only the application name is set.
conf = SparkConf().set("spark.app.name", "lab2")

# Obtain a session built from `conf` (getOrCreate returns an existing
# session when one is already active).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Location of the course catalogue; it is loaded as JSON by Spark further down.
# NOTE(review): the file name suggests one JSON record per line — confirm.
courses_data_path = "/labs/slaba02/DO_record_per_line.json"
# Path to an example submission file; not referenced by any other visible cell.
submission_example_path = "/share/submission-files/slaba02/lab02.json"

In [None]:
# Target courses for which recommendations are produced, one [id, lang, text] row each.
# Only the id and lang fields are read by the recommendation loop below.
# The two 'ru' texts are \u-escaped Cyrillic; in English they read
# "Programming in Lazarus" and "Mathematical economics".
target_recomendation = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
                          [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
                          [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
                          [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
                          [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
                          [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [None]:
# Schema of the target-course table: integer id, language code, text.
# All three fields use the default nullability.
target_schema = (
    StructType()
    .add("id", IntegerType())
    .add("lang", StringType())
    .add("desc", StringType())
)

In [None]:
# Turn the hard-coded target rows into a DataFrame with the explicit schema.
target_df = spark.createDataFrame(
    data=target_recomendation,
    schema=target_schema,
)

In [None]:
# Load the full course catalogue; the schema is inferred by the JSON reader.
data = spark.read.format("json").load(courses_data_path)

In [None]:
def clear_string(series):
    """Tokenize a pandas Series of texts into lists of lower-cased words.

    Every value is lower-cased and split into runs of two or more Unicode
    word characters (letters, digits, underscore). Punctuation and
    single-character tokens are dropped.

    Args:
        series: pandas Series of strings. Missing values are allowed.

    Returns:
        pandas Series with the same index whose values are lists of tokens.
        A missing or empty text yields an empty list, so downstream stages
        always receive an array rather than NaN.
    """
    # Raw string: \w and \d are regex classes, not Python string escapes.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    # Treat missing texts as empty so every row produces a (possibly empty) list.
    lowered = series.fillna(u'').str.lower()
    return lowered.str.findall(regex)

In [None]:
# Build content-based recommendations: for every target course, the ten most
# similar courses in the same language by cosine similarity of TF-IDF vectors.
# The result is collected into json_data as {"<target id>": [recommended ids]}.

# Spark's bundled stop-word lists are selected by language name. The `locale`
# parameter alone does not switch the word list (it defaults to English).
STOP_WORDS_LANGUAGE = {"en": "english", "es": "spanish", "ru": "russian"}
TOP_N = 10  # number of recommendations kept per target course


def cosine_similarity(v, u):
    """Return the cosine similarity of two Spark ML vectors as a float.

    Returns 0.0 when either vector has zero norm (for example, a course whose
    description produced no tokens) instead of dividing by zero.
    """
    norm_product = v.norm(2) * u.norm(2)
    if norm_product == 0:
        return 0.0
    return float(v.dot(u) / norm_product)


# Neither UDF depends on the language, so they are built once outside the loop.
# `f.udf` is used explicitly: no bare `udf` name is imported in this notebook.
sim_cos = f.udf(cosine_similarity, DoubleType())
tokenizer_udf = pandas_udf(clear_string, ArrayType(StringType()))

json_data = {}
for loc in ["en", "es", "ru"]:
    df = data.filter(data.lang == loc)
    df = df.withColumn("tokenized", tokenizer_udf("desc"))

    remover = StopWordsRemover(
        inputCol="tokenized",
        outputCol="filtered",
        stopWords=StopWordsRemover.loadDefaultStopWords(STOP_WORDS_LANGUAGE[loc]),
        locale=loc,
    )
    ht = HashingTF(inputCol=remover.getOutputCol(), outputCol="tf", numFeatures=10000, binary=True)
    idf = IDF(inputCol=ht.getOutputCol(), outputCol="features")

    pipeline = Pipeline(stages=[remover, ht, idf])
    pipe = pipeline.fit(df)
    df = pipe.transform(df)

    ids = [row["id"] for row in target_df.filter(f.col("lang") == loc).collect()]
    df_target = df.filter(df.id.isin(ids))

    # df_target is derived from df, so df.id and df_target.id refer to the same
    # underlying column. Address each side through its alias to keep the
    # condition unambiguous.
    merged_df = df.alias("i").join(
        f.broadcast(df_target.alias("t")),
        (f.col("i.lang") == f.col("t.lang")) & (f.col("i.id") != f.col("t.id")),
    )

    merged_df = merged_df \
        .withColumn("cos", sim_cos(f.col("t.features"), f.col("i.features"))) \
        .select(f.col("t.id").alias("target"),
                f.col("i.id").alias("recommend"),
                f.col("cos"))

    # Safety net: any remaining null/NaN similarity is ranked as 0.
    merged_df = merged_df.fillna(0, subset=["cos"])

    # Break similarity ties by course id so the ranking is deterministic.
    windowSpec = Window.partitionBy("target").orderBy(
        f.col("cos").desc_nulls_last(),
        f.col("recommend").asc(),
    )

    # Sort before collecting: the order of rows returned by collect() is only
    # guaranteed after an explicit orderBy, and the lists must be in rank order.
    result = merged_df.withColumn("rnk", row_number().over(windowSpec)) \
        .filter(f.col("rnk") <= TOP_N) \
        .orderBy("target", "rnk") \
        .collect()

    for row in result:
        json_data.setdefault(str(row["target"]), []).append(row["recommend"])

In [None]:
# Save the recommendations as indented JSON in the working directory.
with open("lab02.json", "w", encoding="utf-8") as out_file:
    json.dump(json_data, out_file, indent=4)

In [None]:
# Read the saved file back to confirm it parses as JSON.
with open("lab02.json", "r", encoding="utf-8") as saved_file:
    res = json.loads(saved_file.read())

In [None]:
!hdfs dfs -put lab02.json /user/alexander.zhukov/

In [None]:
# Stop the SparkSession held in `spark` now that the results are written.
spark.stop()