In [1]:
import glob
import os
import sys

# Point PySpark at the cluster's Python interpreter and Spark installation,
# and request four executors, before anything from pyspark is loaded.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 4 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Locate the bundled py4j archive by pattern instead of hard-coding its
# version, so the cell keeps working when the Spark build ships another py4j.
py4j_archives = sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')))
if not py4j_archives:
    raise ValueError('py4j source archive not found under ' + os.path.join(spark_home, 'python', 'lib'))

# The py4j archive is inserted last so that it ends up first on sys.path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, py4j_archives[-1])

# Run PySpark's interactive shell bootstrap in this namespace. The file is
# read inside a `with` block so the handle is closed, and compiled with its
# real path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [7]:
import json
import re

import pyspark.sql.functions as f
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover, Tokenizer
from pyspark.sql.functions import pandas_udf, lower, col, udf, isnan, isnull, broadcast, desc, concat
from pyspark.sql.types import ArrayType, StringType, FloatType

In [3]:
# Default Spark configuration; no options are overridden here.
conf = SparkConf()

# Assemble the session builder step by step, then obtain the session.
session_builder = SparkSession.builder
session_builder = session_builder.config(conf=conf)
session_builder = session_builder.appName("test")
spark = session_builder.getOrCreate()

In [5]:
# Courses to produce recommendations for: course id -> course language.
# Insertion order is kept, so it also fixes the order of the output keys.
result_courses = dict([
    (23126, 'en'),
    (21617, 'en'),
    (16627, 'es'),
    (11556, 'es'),
    (16704, 'ru'),
    (13702, 'ru'),
])

In [8]:
# Read the one-record-per-line JSON dump and drop the rows whose description
# is exactly a single space.
data = (
    spark.read
    .json("/labs/slaba02/DO_record_per_line.json")
    .where(f.col("desc") != ' ')
)

In [15]:
# (course id, language) pairs, in the order the courses are declared.
langs = [(course_id, lang) for course_id, lang in result_courses.items()]

In [16]:
# Lower-case the description column in place.
data = data.withColumn("desc", f.lower(data["desc"]))

In [17]:
def clear_string(series):
    """Split every string of a pandas Series into word tokens.

    A token is a run of two or more Unicode word characters (letters,
    digits or underscore); shorter runs and all other characters are dropped.

    Args:
        series: pandas Series of strings.

    Returns:
        pandas Series of the same length whose elements are the lists of
        tokens found in the corresponding input string.
    """
    # Raw string literal: '\w' and '\d' are not valid Python string escapes,
    # so the non-raw form emits an invalid-escape warning at compile time.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    return series.str.findall(regex)

# pandas UDF wrapper around clear_string, declared to return an array of strings.
tokenizer_udf = pandas_udf(f=clear_string, returnType=ArrayType(StringType()))

In [18]:
# Add a "token" column holding the tokens extracted from each description.
pre_tokenized_data = data.withColumn("token", tokenizer_udf(data["desc"]))

In [19]:
# Spark's built-in stop-word lists for the three languages used below.
stop_words_en, stop_words_rus, stop_words_es = (
    StopWordsRemover.loadDefaultStopWords(language)
    for language in ("english", "russian", "spanish")
)

# One combined list: English first, then Russian, then Spanish.
stop_words = [*stop_words_en, *stop_words_rus, *stop_words_es]

In [20]:
# Transformer that copies "token" into "no_stop_words" minus the combined stop words.
remover = StopWordsRemover(
    stopWords=stop_words,
    inputCol="token",
    outputCol="no_stop_words",
)

In [22]:
# Strip stop words from the token lists.
tokenized_data = remover.transform(pre_tokenized_data)

# One frame per course language, selected on the "lang" column.
lang_column = tokenized_data["lang"]
tokenized_en = tokenized_data.where(lang_column == "en")
tokenized_es = tokenized_data.where(lang_column == "es")
tokenized_ru = tokenized_data.where(lang_column == "ru")

In [23]:
# Feature stages shared by every language: hashed term frequencies, IDF
# weighting, and L2 normalisation of the weighted vectors.
tf = HashingTF(inputCol="no_stop_words", outputCol="tf_f", numFeatures=10000, binary=False)
idf = IDF(inputCol="tf_f", outputCol="idf_f")
normalizer = Normalizer(inputCol="idf_f", outputCol="norm_idf_f", p=2.0)


def tfidf_normalize(tokenized):
    """Compute L2-normalised TF-IDF vectors for one language's courses.

    The IDF model is fitted on the frame passed in, so document frequencies
    are computed per language rather than over the whole corpus.

    Args:
        tokenized: DataFrame with a "no_stop_words" column of token arrays.

    Returns:
        The input frame with "tf_f", "idf_f" and "norm_idf_f" columns added.
    """
    hashed = tf.transform(tokenized)
    idf_model = idf.fit(hashed)
    return normalizer.transform(idf_model.transform(hashed))


normalized_en = tfidf_normalize(tokenized_en)
normalized_es = tfidf_normalize(tokenized_es)
normalized_ru = tfidf_normalize(tokenized_ru)

In [24]:
# Stack the per-language frames back into one. `union` replaces `unionAll`,
# which is deprecated in Spark 2.x; both concatenate rows by column position
# without de-duplicating.
# The frame is cached because the recommendation loop runs several actions
# against it; without the cache each action would recompute it from the raw
# JSON.
normalized_data = normalized_en.union(normalized_es).union(normalized_ru).cache()

In [25]:
# For every target course, rank the other courses of the same language by the
# dot product of their normalised TF-IDF vectors with the target's vector and
# keep the ids of the ten best matches.
result = {}
for course_id, course_lang in langs:
    # Fetch only the vector column of the target course; `first()` returns
    # None when the id is absent instead of failing with a bare IndexError.
    target_row = (normalized_data
                  .where(normalized_data.id == course_id)
                  .select('norm_idf_f')
                  .first())
    if target_row is None:
        raise ValueError('course id {} not found in normalized_data'.format(course_id))
    target_vector = target_row['norm_idf_f'].toArray()

    # FloatType is kept as in the original so the ranking (including ties)
    # is unchanged.
    cosine_udf = f.udf(lambda vec: float(vec.dot(target_vector)), FloatType())

    top_rows = (normalized_data
                .where((normalized_data.id != course_id) & (normalized_data.lang == course_lang))
                .withColumn('tf_idf_cos', cosine_udf(normalized_data['norm_idf_f']))
                # Ties on similarity are broken by name, then by id.
                .orderBy(f.desc('tf_idf_cos'), f.asc('name'), f.asc('id'))
                # Only the ids are needed on the driver.
                .select('id')
                .head(10))
    result[str(course_id)] = [row['id'] for row in top_rows]

In [26]:
# Write the recommendations to lab02.json as a single JSON object.
with open('lab02.json', 'w') as out_file:
    json.dump(result, out_file)

In [27]:
# Stop the Spark session; `spark` cannot be used by any cell run after this one.
spark.stop()