In [1]:
import os
import sys
# Environment bootstrap: tell PySpark which Python interpreter and Spark
# installation to use, and which arguments to launch the shell with.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

# Read the value back so the path entries below follow whatever SPARK_HOME holds.
spark_home = os.environ.get('SPARK_HOME')

# Prepend Spark's bundled Python sources to the import path. The py4j archive
# ends up first, immediately followed by the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkContext, SparkConf

# Build the application's Spark configuration.
conf = SparkConf()
conf.set("spark.app.name", "DI Spark Dataframe app")
# NOTE(review): PYSPARK_SUBMIT_ARGS in the first cell asks for 3 executors while
# this asks for 10 — confirm which value is intended.
conf.set('spark.executor.instances', '10')

# getOrCreate() instead of the bare constructor: re-running this cell in a live
# kernel reuses the running context rather than raising
# "Cannot run multiple SparkContexts at once". On a fresh kernel it creates the
# context from `conf` exactly as the constructor did.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Pulls numpy and matplotlib names into the notebook's global namespace (as the
# output message below states); later cells may rely on those bare names.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Rebind `conf` to a fresh, empty SparkConf and obtain the SparkSession; the
# builder hands back an existing session if there is one, otherwise creates it.
conf = SparkConf()

spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [5]:
#sc.getConf().getAll()

In [5]:
from pyspark import SQLContext

# SQLContext bound to the existing SparkContext `sc`.
# NOTE(review): `sqlContext` is not referenced by any other cell visible here —
# confirm it is still needed alongside the SparkSession.
sqlContext = SQLContext(sc)

In [6]:
# sqlContext

In [7]:
# Load the input dataset into a DataFrame.
# Presumably one JSON record per line, judging by the file name — TODO confirm.
df = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [8]:
import re, string
# Token pattern: runs of at least four word characters (letters, digits,
# underscore), Unicode-aware. Written as a raw string so the regex escapes are
# not parsed as (invalid) Python string escapes, which newer Python versions
# warn about; the compiled pattern itself is unchanged.
regex = re.compile(r'[\w\d]{4,}', re.U)

In [9]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.sql import Column
from pyspark.sql.types import IntegerType, StringType, ArrayType, DoubleType
import pyspark.sql.functions as f

In [10]:
# Discard rows whose `desc` is null so the tokenizer never receives None.
df = df.dropna(subset=['desc'])

In [11]:
def _tokenize(text):
    """Lower-case `text` and return every token matched by `regex`."""
    return regex.findall(text.lower())


# Spark UDF wrapping the tokenizer; declared to return an array of strings.
desc_arr = f.udf(_tokenize, ArrayType(StringType()))

In [12]:
# Keep the identifying columns and replace the raw description by its token list.
df_featured = df.select(
    'id',
    'lang',
    'name',
    desc_arr('desc').alias('desc_arr'),
)

In [13]:
# Term frequencies: hash each token list into a 10000-dimensional vector.
hashingTF = HashingTF(inputCol='desc_arr', outputCol='tf', numFeatures=10000)
df_tf = hashingTF.transform(df_featured)

# Inverse document frequency, fitted on the same data it is then applied to;
# minDocFreq=8 is the document-count cut-off passed to the estimator.
idf = IDF(inputCol='tf', outputCol='tfidf', minDocFreq=8)
idf_model = idf.fit(df_tf)
df_tfidf = idf_model.transform(df_tf)

In [14]:
def _cosine_similarity(test, other):
    """Return the dot product of two vectors divided by both L2 norms, as a float."""
    scaled = test.dot(other) / test.norm(2)
    return float(scaled / other.norm(2))


# Spark UDF producing a double column from two vector columns.
cosine = f.udf(_cosine_similarity, DoubleType())

In [15]:
# Reference records to find neighbours for; each row is [id, language code, name].
test = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# Ids of the reference records, in the same order as `test`. A comprehension
# replaces the manual append loop and avoids leaving a stray loop variable in
# the notebook namespace.
check_id = [record[0] for record in test]

In [16]:
# Rows for the reference ids only, with columns renamed with a "1" suffix so
# they remain distinguishable from the columns of `df_tfidf`.
ids = df_tfidf.filter(f.col('id').isin(check_id)).select(
    f.col('id').alias('id1'),
    f.col('lang').alias('lang1'),
    f.col('tfidf').alias('tfidf1'),
)

# Bring the (id, language) pairs of the reference rows to the driver.
lid = ids.select('id1', 'lang1').collect()

In [17]:
THRESHOLD = 10  # number of nearest neighbours kept per reference id
out = {}  # reference id -> list of the most similar ids, best first

for curr_id, curr_lang in lid:
    # Candidates: every other row in the same language, without the bulky
    # intermediate feature columns.
    candidates = (
        df_tfidf
        .filter(f.col('lang') == curr_lang)
        .filter(f.col('id') != curr_id)
        .drop('desc_arr', 'tf')
    )
    # The single reference row the candidates are compared against.
    reference = ids.filter(f.col('id1') == curr_id)

    out_arr = (
        candidates
        # Inner join: when a language has no other row, a right outer join
        # would emit one row with a null `tfidf`, and the Python UDF would
        # fail on it. With at least one candidate both joins give the same rows.
        .join(reference, f.col('lang') == f.col('lang1'), 'inner')
        .withColumn('cos', cosine(f.col('tfidf'), f.col('tfidf1')))
        # Drop undefined similarities (zero-norm vectors). isnan() replaces the
        # bare `NaN` name, which only existed because %pylab star-imports numpy.
        .filter(~f.isnan(f.col('cos')))
        # Best match first; name and id break ties deterministically.
        .sort(f.col('cos').desc(), f.col('name').asc(), f.col('id').asc())
    )

    # limit() before collecting so only THRESHOLD rows reach the driver instead
    # of the whole sorted candidate set.
    top_rows = out_arr.select('id').limit(THRESHOLD).collect()
    res = [row['id'] for row in top_rows]
    out[curr_id] = res

In [18]:
import json

# Persist the results as pretty-printed JSON. The context manager closes the
# file even if serialisation or the write raises.
with open('lab02.json', mode='w') as fd:
    fd.write(json.dumps(out, indent=4))

In [19]:
# Stop the Spark session at the end of the notebook.
spark.stop()