In [1]:
import os
import sys
# Environment consumed by PySpark at start-up: the Python interpreter to use,
# the Spark installation directory, and the spark-submit flags.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell'

# Guard kept so that blanking the assignment above fails loudly here rather
# than with an obscure path error further down.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell bootstrap inside this kernel's namespace.
# The file is opened in a `with` block so the handle is closed deterministically,
# and the source is compiled with its real path so tracebacks name shell.py.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as f
import re
from pyspark.sql.functions import udf

# Default Spark configuration; nothing is overridden on it here.
conf = SparkConf()

# Obtain a session named "test" built from `conf` (getOrCreate may hand back
# a session that already exists in this kernel).
spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [3]:
@f.pandas_udf(ArrayType(StringType()))
def reg(object):
    """Tokenise a batch of strings into lower-cased word tokens.

    Each value is lower-cased and every run of two or more word characters
    is extracted with ``re.findall``.

    Args:
        object: batch of values handed over by the pandas UDF machinery; it
            must support ``.apply`` (presumably a pandas Series of str --
            confirm against the caller). The name shadows the builtin and is
            kept only so the signature stays unchanged.

    Returns:
        The result of ``object.apply``: one list of tokens per input value.
        Values that are not ``str`` (e.g. a null description) produce an
        empty list instead of raising ``AttributeError`` on ``.lower()``.
    """
    # Raw string: in a plain literal '\w' and '\d' are invalid escape
    # sequences (DeprecationWarning on Python 3.6+). The pattern text itself
    # is unchanged.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    return object.apply(
        lambda x: regex.findall(x.lower()) if isinstance(x, str) else []
    )

def cos_(x, y):
    """Cosine similarity between two vectors.

    Args:
        x, y: objects exposing ``norm(p)`` and ``dot(other)``, or ``None``.

    Returns:
        float: ``nan`` when either argument is ``None``; ``-1.0`` when the
        product of the two L2 norms is zero (the ratio is undefined);
        otherwise ``x.dot(y) / (||x|| * ||y||)``.

    NOTE(review): Spark orders NaN above every other double, so a NaN
    similarity would come first in a descending sort -- confirm that is
    acceptable if null vectors can ever reach this function.
    """
    if x is None or y is None:
        # The original returned np.nan, but numpy is never imported in this
        # notebook, so this branch raised NameError instead of yielding NaN.
        return float('nan')

    # Compute the norm product once instead of twice.
    norm_product = float(x.norm(2) * y.norm(2))
    if norm_product == 0:
        return float(-1)
    return float(x.dot(y) / norm_product)
        
# Column-level wrapper around cos_ that returns a DoubleType column.
cosinus_ = udf(f=lambda left, right: cos_(left, right), returnType=DoubleType())

In [4]:
# Courses to produce recommendations for. Each entry is
# [course id, language code, title]; only the id (index 0) is read later in
# this notebook.
courses = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [5]:
%%time
# Load the course catalogue (the file name suggests one JSON record per line).
df = spark.read\
          .format("json")\
          .load("/labs/slaba02/DO_record_per_line.json")

# Tokenise every description once, before the per-language split.
df2 = df.withColumn("words", reg("desc"))

hasher_freq = HashingTF(numFeatures=10000, binary=False, inputCol='words', outputCol="word_vector_freq")
IDF_func = IDF(inputCol='word_vector_freq', outputCol="IDF_col")

# Fit a separate IDF model per language so document frequencies come only
# from the same-language corpus, then stack the results in en, es, ru order.
# One loop replaces three copy-pasted pipelines.
dataset_result_all = None
for lang_code in ('en', 'es', 'ru'):
    freq_lang = hasher_freq.transform(df2.filter(F.col('lang') == lang_code))
    tfidf_lang = IDF_func.fit(freq_lang).transform(freq_lang)
    if dataset_result_all is None:
        dataset_result_all = tfidf_lang
    else:
        dataset_result_all = dataset_result_all.union(tfidf_lang)
# Cached because every target course below scans it again.
dataset_result_all = dataset_result_all.cache()

result = {}
for course in courses:
    course_id = course[0]

    # The target course, with its columns renamed so they cannot clash with
    # the catalogue's columns in the join. Filtering by column *name* avoids
    # borrowing a column object from the English-only frame, which resolved
    # against the union only by accident of shared lineage.
    kurs = dataset_result_all.filter(F.col('id') == course_id)\
        .select(F.col('id').alias('id2'),
                F.col('lang').alias('lang2'),
                F.col('IDF_col').alias('IDF_col2'))

    # Candidates: every other course in the same language. 'lang2' exists only
    # on the left side and 'lang' only on the right, so the condition is
    # unambiguous (the original compared two references to the same column).
    # An inner join matches the original left-outer join followed by the
    # id2 != id filter, which already discarded unmatched rows.
    # Not cached: the frame is consumed exactly once per iteration.
    pull_kurs = kurs.join(dataset_result_all, F.col('lang2') == F.col('lang'), 'inner')\
        .filter(F.col('id2') != F.col('id'))\
        .withColumn('similarity', cosinus_('IDF_col', 'IDF_col2'))

    # Top 10 by similarity; ties are broken by name, then id.
    top_rows = pull_kurs\
        .orderBy(F.col('similarity').desc(), F.col('name'), F.col('id'))\
        .select('id').limit(10).collect()
    result[course_id] = [row['id'] for row in top_rows]

CPU times: user 183 ms, sys: 76.6 ms, total: 260 ms
Wall time: 1min 9s


In [6]:
import json
# Serialise the recommendations to lab02.json in the working directory.
with open('lab02.json', 'w') as outfile:
    outfile.write(json.dumps(result))

In [7]:
# Shut down the Spark session now that the results have been written.
spark.stop()