# Лабораторная 2. Кобылкин Константин. Вариант 10

In [3]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# which spark-submit arguments (executor count, memory, cores) to launch with.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 --executor-memory 4g --executor-cores 1 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put the Spark-bundled Python packages on the import path. Every entry is
# pushed to the front, so the py4j archive ends up ahead of the pyspark dir.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration for this notebook; SparkConf.set returns the conf itself.
conf = SparkConf().set("spark.app.name", "Recommendation system")

# NOTE(review): appName() is applied after the conf, so "RecSys" presumably
# overrides the application name set on the conf above -- confirm which of
# the two names is intended.
session_builder = SparkSession.builder.config(conf=conf).appName("RecSys")
spark = session_builder.getOrCreate()

# Загрузка данных

In [5]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, DoubleType

In [6]:
# Explicit schema for the input records: "id" is an integer, every other
# attribute is read as a string.
schema = (
    StructType()
    .add("lang", StringType())
    .add("name", StringType())
    .add("cat", StringType())
    .add("provider", StringType())
    .add("id", IntegerType())
    .add("desc", StringType())
)

In [7]:
# Load the JSON file with the explicit schema; multiline parsing is disabled.
df = (
    spark.read
    .format("json")
    .schema(schema)
    .option("multiline","false")
    .load("/labs/slaba02/DO_record_per_line.json")
)

In [8]:
!hdfs dfs -cat /labs/slaba02/DO_record_per_line.json | wc -l 

28153


In [9]:
# Number of rows in the loaded DataFrame.
df.count()

28153

# Выделение подмножества курсов на тех же языках, что и запросы

In [10]:
# Keep only the columns the rest of the notebook works with.
df = df.select("id", "lang", "desc", "name")

In [11]:
# Ids of the courses for which recommendations are requested.
course_ids = {23126, 21617, 16627, 11556, 16704, 13702}
query_course_df = df.where(df.id.isin(course_ids))

# Distinct languages of the query courses, brought to the driver as a set.
query_languages_pdf = query_course_df.select(['lang']).distinct().toPandas()
query_course_languages = set(query_languages_pdf.lang.unique())

In [12]:
from pyspark.sql.functions import udf, desc, col

In [13]:
# Keep only the rows written in one of the query courses' languages.
df = df.where(df.lang.isin(query_course_languages))

# Вычисление TF-IDF-весов

In [14]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
import re

In [15]:
# Tokens are runs of at least two word characters. The backslashes are
# doubled so the (non-raw) literal holds the same pattern without relying on
# invalid string escapes; the pattern is compiled once instead of per call.
_TOKEN_PATTERN = re.compile(u'[\\w\\d]{2,}', re.U)


@udf(returnType=ArrayType(StringType()))
def customTokenizer(description):
    """Split a description into lower-cased word tokens.

    Args:
        description: Description text, or None when the value is missing.

    Returns:
        List of tokens, each at least two word characters long. An empty
        list is returned for a missing description.
    """
    if description is None:
        # A null value would raise AttributeError on .lower() inside the UDF
        # and fail the whole job; treat it as a description without words.
        return []
    return _TOKEN_PATTERN.findall(description.lower())

In [16]:
# Tokenize the descriptions with the custom UDF (used here in place of the
# built-in Tokenizer stage).
df = df.withColumn("desc_words", customTokenizer("desc"))

# Feature stages, constructed here and applied later:
# stop-word removal -> hashed term frequencies -> IDF weighting.
stop_words_remover = StopWordsRemover(
    inputCol="desc_words",
    outputCol="filtered_desc_words",
)
hashing_TF_transformer = HashingTF(
    inputCol="filtered_desc_words",
    outputCol="TF",
    numFeatures=10000,
)
IDF_transformer = IDF(inputCol="TF", outputCol="TF-IDF")

In [18]:
# Stateless stages that can be applied to the whole DataFrame at once.
tf_transform_pipeline = [stop_words_remover, hashing_TF_transformer]

# Apply the stages in order, feeding each one the output of the previous.
for stage in tf_transform_pipeline:
    df = stage.transform(df)

In [19]:
# Per-language DataFrames with a "TF-IDF" column; the IDF model is fitted
# separately on the rows of each language.
lang_dfs = {}

for lang in query_course_languages:
    same_language_rows = df.where(df.lang == lang)
    idf_model = IDF_transformer.fit(same_language_rows)
    lang_dfs[lang] = idf_model.transform(same_language_rows)

# Вычисление мер сходства с запросами и формирование результата

In [20]:
# Dictionary mapping each query course id to its language, built on the driver.
query_id_lang_pdf = query_course_df.select(['id', 'lang']).toPandas()
query_course_id_langs_map = query_id_lang_pdf.set_index('id').lang.to_dict()

In [21]:
def cosineSimilarityEstimatorGenerator(given_course_tf_idf):
    """Build a function that scores vectors by cosine similarity to a fixed one.

    Args:
        given_course_tf_idf: Reference vector. It must provide ``norm(p)``
            and be accepted by the ``dot`` method of the scored vectors.

    Returns:
        A one-argument function that maps a vector to its cosine similarity
        with the reference vector, as a Python float. The result is 0.0 when
        either of the two vectors has zero norm.
    """
    # The reference norm is identical for every scored vector: compute it once.
    given_norm = given_course_tf_idf.norm(2)

    def cosineSimilarityEstimator(course_tf_idf):
        course_norm = course_tf_idf.norm(2)
        # Cosine similarity is undefined for a zero vector. Report "no
        # similarity" rather than dividing by zero (which would raise or
        # produce NaN, depending on the float type).
        if course_norm == 0.0 or given_norm == 0.0:
            return 0.0
        return float(course_tf_idf.dot(given_course_tf_idf) / (course_norm * given_norm))

    return cosineSimilarityEstimator

In [22]:
# Number of recommendations produced for each query course.
RECOMMENDATIONS_PER_COURSE = 10

# Maps str(query course id) -> ids of the most similar courses, best first.
result = {}

for query_course_id, query_course_lang in query_course_id_langs_map.items():
    query_course_lang_df = lang_dfs[query_course_lang]

    # TF-IDF vector of the query course itself.
    query_course_row = (query_course_lang_df
                        .filter(query_course_lang_df.id == query_course_id)
                        .select('TF-IDF')
                        .first())
    if query_course_row is None:
        raise KeyError("query course %s not found among courses in language %r"
                       % (query_course_id, query_course_lang))
    query_course_tf_idf = query_course_row['TF-IDF']

    # Score every course of the same language against the query course.
    similarity_column = "course_" + str(query_course_id)
    query_course_cosineSimilarityEstimator = udf(
        cosineSimilarityEstimatorGenerator(query_course_tf_idf),
        DoubleType())
    query_course_lang_df = query_course_lang_df.withColumn(
        similarity_column,
        query_course_cosineSimilarityEstimator("TF-IDF"))

    # Drop the query course BEFORE taking the top rows. Taking one extra row
    # and removing the query course afterwards returns one id too many
    # whenever the query course is not itself among the top rows (e.g. many
    # courses tie with it, or its own similarity is 0). Rows with a null id
    # are dropped by this comparison as well. Only the id column is brought
    # to the driver.
    best_matches = (query_course_lang_df
                    .filter(col("id") != query_course_id)
                    .orderBy(col(similarity_column).desc(),
                             col("name").asc(),
                             col("id").asc())
                    .limit(RECOMMENDATIONS_PER_COURSE)
                    .select("id")
                    .toPandas())
    result[str(query_course_id)] = best_matches['id'].tolist()

# Запись в файл

In [23]:
import json

# Serialize the recommendations to a JSON file in the working directory.
with open('lab02.json', 'w') as output_file:
    output_file.write(json.dumps(result))

In [25]:
# Shut down the Spark session started at the top of the notebook.
spark.stop()