In [1]:
import os
import sys
# Configure the environment PySpark reads at start-up: the Python interpreter
# used by the workers, the Spark installation, and the spark-submit arguments.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell start-up script in this namespace.
# The file is opened in a context manager so the handle is closed promptly, and
# compiled with its real path so tracebacks point at shell.py instead of "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Build (or fetch) the session from a default SparkConf under the app name "test".
# NOTE(review): the bootstrap cell already reports a SparkSession as available,
# so getOrCreate() presumably returns that existing session -- confirm.
conf = SparkConf()
spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [3]:
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType
import numpy as np

# Чтение файлов из HDFS и формирование 'target'

In [4]:
# Seed rows, one [id, lang, name] triple per course, two per language.
target = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    # Russian title: "Programming in Lazarus"
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    # Russian title: "Mathematical economics"
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [5]:
# Distribute the seed rows and name the resulting columns.
rdd = spark.sparkContext.parallelize(target)
df = spark.createDataFrame(rdd).toDF('id', 'lang', 'name')

In [6]:
# Load the course catalogue from JSON into a DataFrame.
courses = spark.read.json(path='/labs/slaba02/DO_record_per_line.json')

In [7]:
# Keep only the catalogue rows that match a seed row on every seed column.
# Note: this rebinds `target` from the seed list to a DataFrame.
target = df.join(courses, on=df.columns, how='inner')

# TF-IDF

In [8]:
@F.udf(FloatType())
def cos_udf(v1, v2):
    """Return the cosine similarity of two ML vectors as a float.

    Args:
        v1: first vector; must provide ``dot`` and ``norm``.
        v2: second vector; must provide ``dot`` and ``norm``.

    Returns:
        ``v1 . v2 / (|v1| * |v2|)`` using L2 norms, or NaN when either
        vector has zero norm and the similarity is undefined.
    """
    denominator = v1.norm(2) * v2.norm(2)
    # Be explicit about the undefined case instead of relying on an implicit
    # 0/0 division (and the runtime warning it triggers) to produce NaN.
    if denominator == 0:
        return float('nan')
    return float(v1.dot(v2) / denominator)

In [9]:
# Default stop-word lists for the three course languages, then their concatenation.
en, ru, es = (
    StopWordsRemover.loadDefaultStopWords(language)
    for language in ("english", "russian", "spanish")
)
stop = en + ru + es

In [10]:
def tfidfcos_cut(df, target, stop, num_features=10000, top_n=10):
    """Recommend the most similar same-language courses for each target course.

    Course descriptions are tokenized, stripped of stop words, hashed into
    term-frequency vectors and rescaled with IDF fitted on ``df``. Every target
    course is then compared, by cosine similarity, with every other course of
    the same language.

    Args:
        df: DataFrame of all courses; the columns ``id``, ``lang`` and ``desc``
            are used.
        target: DataFrame of the courses to recommend for; the columns ``id``
            and ``lang`` are used.
        stop: list of stop words removed from the tokenized descriptions.
        num_features: number of hashing buckets for the TF vectors.
        top_n: number of recommendations kept per target course.

    Returns:
        A list of collected rows, one per target course: position 0 holds the
        target id, position 1 the list of recommended ids ordered from most to
        least similar.
    """
    tokenizer = RegexTokenizer(inputCol='desc', outputCol='words', pattern=r'[1234567890?!.)(,\s]')
    tokenized = tokenizer.transform(df)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop)
    filtered = remover.transform(tokenized)

    hashing_tf = HashingTF(inputCol='filtered', outputCol='rawFeatures', numFeatures=num_features)
    featurized = hashing_tf.transform(filtered)
    idf_model = IDF(inputCol='rawFeatures', outputCol='features').fit(featurized)
    rescaled = idf_model.transform(featurized)

    # TF-IDF vectors of the target courses, renamed so they can sit next to the
    # candidate's own columns after the join below.
    target_feats = (
        target.select('id', 'lang')
        .join(rescaled.select('id', 'lang', 'features'), ['id', 'lang'], 'inner')
        .select(F.col('id').alias('id_target'),
                F.col('features').alias('features_target'),
                'lang')
    )

    # Pair every course with every target of the same language, excluding the
    # target itself.
    candidates = (
        rescaled.join(target_feats, ['lang'], 'inner')
        .filter(F.col('id') != F.col('id_target'))
    )

    # Keep only the columns needed downstream so the feature vectors are not
    # carried through the window shuffle; drop undefined similarities.
    scored = (
        candidates
        .select('id', 'id_target', cos_udf('features', 'features_target').alias('cos'))
        .filter(F.col('cos').isNotNull() & ~F.isnan('cos'))
    )

    # Ties on similarity are broken by id so the ranking is reproducible.
    by_similarity = Window.partitionBy('id_target').orderBy(F.col('cos').desc(), F.col('id').asc())
    ranked = (
        scored
        .withColumn('rank', F.row_number().over(by_similarity))
        .filter(F.col('rank') <= top_n)
    )

    # collect_list gives no ordering guarantee, so collect (rank, id) pairs,
    # sort them by rank and then project the ids. The output column keeps the
    # name the plain collect_list('id') aggregate would have produced.
    result = (
        ranked.groupBy('id_target')
        .agg(F.sort_array(F.collect_list(F.struct('rank', 'id'))).alias('ranked'))
        .select('id_target', F.col('ranked.id').alias('collect_list(id)'))
        .collect()
    )

    return result

In [11]:
# Run the TF-IDF / cosine pipeline over the full catalogue for the target courses.
result = tfidfcos_cut(df=courses, target=target, stop=stop)

In [12]:
# Map each target id (as a string, for use as a JSON key) to its recommended ids.
dicts = dict((str(row[0]), row[1]) for row in result)

In [14]:
# Serialize the recommendations and write them to the output file.
with open("lab02.json", "w") as fp:
    fp.write(json.dumps(dicts))

In [15]:
# Stop the Spark session now that the results have been written out.
spark.stop()