In [1]:
import os
import sys

# Environment consumed by PySpark at start-up: Python interpreter path,
# Spark installation directory and the spark-submit arguments.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Make the pyspark and py4j sources shipped with SPARK_HOME importable.
# Each entry is pushed to the front of sys.path, so the last one wins priority.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

# These imports must come after the sys.path changes above.
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().set("spark.app.name", "yuri_severyukhin")

spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [2]:
#from pyspark.mllib.linalg import DenseVector
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover,VectorAssembler
import pyspark.sql.functions as f_
from pyspark.sql.types import FloatType
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
import json

### Параметры

In [4]:
# Courses for which recommendations are requested: [id, language, title].
parameter = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]
# Ids only, in the same order as `parameter`.
l_parm_id = [course_id for course_id, _lang, _title in parameter]

In [5]:
# Languages of the parameter courses: English, Spanish, Russian.
print([lang for _course_id, lang, _title in parameter])

['en', 'en', 'es', 'es', 'ru', 'ru']


### В PySpark есть специальный список стоп-слов, доступный через loadDefaultStopWords(language)

In [6]:
# Combined stop-word list for the three languages present in `parameter`,
# concatenated in the order english, russian, spanish.
stopwords = [
    word
    for language in ("english", "russian", "spanish")
    for word in StopWordsRemover.loadDefaultStopWords(language)
]

In [7]:
# Check the container type of the combined stop-word collection.
type(stopwords)

list

In [7]:
# Display the combined stop-word list.
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

### Обработка источника
{
- "lang": "en",
- "name": "Accounting Cycle: The Foundation of Business Measurement and Reporting",
- "cat": "3/business_management|6/economics_finance",
- "provider": "Canvas Network",
- "id": 4,
- "desc": "This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appropriate. They\u2019ll also understand the accounting equation, how to journalize and post transactions, how to adjust and close accounts, and how to prepare key financial reports. All material for this class is written and delivered by the professor, and can be previewed here. Students must have access to a spreadsheet program to participate."  
}  

Интересует id, lang и desc

\p{Prop}  Match a character that has the given Unicode property.  
\p{L} matches a single code point in the category "letter".

In [8]:
# Load the course catalogue from the JSON source.
source = spark.read.json('/labs/slaba02/DO_record_per_line.json')
# Clean the description: drop every character that is not a Unicode letter,
# an ASCII digit or whitespace, then lower-case the result.
# The class used to be written as [^\pL{0-9}\p{Space}]; inside a character
# class the braces around 0-9 are literal members, so "{" and "}" were never
# stripped. \p{L}0-9 expresses the intended "letters and digits" set.
courses = source.select(
    "id",
    "lang",
    f_.lower(f_.regexp_replace('desc', r'[^\p{L}0-9\p{Space}]', '')).alias('document'),
)

### PIPELINE
- разделить на слова
- убрать стоп слова
- посчитать TF — term frequency: по сути, сколько раз слово встречается в этом документе.
- посчитать DF – document frequency: по сути, число документов, в которых есть вхождение этого слова. Мы хотим "штрафовать" слово за частое появление в документах, поэтому делаем инверсию этой величины – буква I в TFIDF. 

In [9]:
# Stage 1: split the cleaned document into words.
tokenize = Tokenizer(inputCol="document", outputCol="words")
# Stage 2: drop stop words; each stage reads the previous stage's output column.
stopwordsremove = StopWordsRemover(
    inputCol=tokenize.getOutputCol(),
    outputCol="words_removed",
    stopWords=stopwords,
)
# Stage 3: term frequencies via feature hashing.
tf_metric = HashingTF(inputCol=stopwordsremove.getOutputCol(), outputCol="tf")
# Stage 4: rescale term frequencies by inverse document frequency.
tfidf_metric = IDF(inputCol=tf_metric.getOutputCol(), outputCol="tfidf")

pipeline = Pipeline(stages=[tokenize, stopwordsremove, tf_metric, tfidf_metric])

In [10]:
# Fit the pipeline on the course documents, then apply the fitted model to the
# same DataFrame to append the words / words_removed / tf / tfidf columns.
fitted = pipeline.fit(courses)
tfidf_return = fitted.transform(courses)

In [11]:
# Inspect the column names and types produced by the pipeline.
tfidf_return.dtypes

[('id', 'bigint'),
 ('lang', 'string'),
 ('document', 'string'),
 ('words', 'array<string>'),
 ('words_removed', 'array<string>'),
 ('tf', 'vector'),
 ('tfidf', 'vector')]

### crossJoin  параметра и всех описаний
Присоединяем параметр к документам с учетом языка, при этом исключаем сами параметры

In [11]:
# TF-IDF vectors of the parameter courses only, with id and vector renamed
# so they do not clash with the catalogue columns after the join.
par_tfidf = (
    tfidf_return
    .filter(f_.col('id').isin(l_parm_id))
    .select(
        f_.col('id').alias('par_id'),
        'lang',
        f_.col('tfidf').alias('par_tfidf'),
    )
)

In [12]:
# Pair every course with each parameter course of the same language,
# excluding the pairing of a parameter course with itself.
cross_matrix = (
    tfidf_return
    .join(par_tfidf, on="lang", how="inner")
    .where("par_id !=id ")
)

In [14]:
# Count the candidate (course, parameter course) pairs.
cross_matrix.count()

54310

In [15]:
# Show the schema summary of the joined DataFrame.
cross_matrix

DataFrame[lang: string, id: bigint, document: string, words: array<string>, words_removed: array<string>, tf: vector, tfidf: vector, par_id: bigint, par_tfidf: vector]

### Косинус векторов 
Умножаем один вектор на другой и делим на произведение их длин. 
- Скалярное произведение векторов: метод dot
- Норма (длина) вектора: корень из суммы квадратов компонент (norm с параметром 2)  

Чем больше косинус, тем лучше  
И нам нужно только 10 курсов

In [13]:
# UDF for cosine similarity
@f_.udf(FloatType())
def cos_sim(vector_a,vector_b):
    """Return the cosine of the angle between two vectors as a float.

    The dot product of the vectors is divided by the product of their
    L2 norms.

    NOTE(review): the cell that builds `cosine_matrix` filters out NaN
    results; presumably those arise when one vector has zero norm - confirm.
    """
    numerator = vector_a.dot(vector_b)
    denominator = vector_a.norm(2) * vector_b.norm(2)
    return float(numerator / denominator)

In [14]:
# Rank candidates within each parameter course: highest cosine first,
# ties broken by the smaller course id.
rank_window = Window.partitionBy("par_id").orderBy(f_.col('v_cos').desc(), f_.col('id'))

# Score every pair, drop pairs whose cosine is NaN, keep the 10 best per parameter.
cosine_matrix = (
    cross_matrix
    .select('*', cos_sim('tfidf', 'par_tfidf').alias('v_cos'))
    .where("v_cos !='NaN'")
    .select('*', f_.row_number().over(rank_window).alias('rn'))
    .where("rn <= 10")
)

### Рекомендация

In [15]:
# Collect the top-10 course ids for every parameter course.
# collect_list gives no guarantee about element order after the groupBy
# shuffle, so the rank `rn` travels with each id inside a struct and the
# array is sorted by it before the ids are extracted.
recomended = (
    cosine_matrix
    .groupBy('par_id')
    .agg(f_.sort_array(f_.collect_list(f_.struct('rn', 'id'))).alias('ranked'))
    .select('par_id', f_.col('ranked.id').alias('array_10'))
    .orderBy('par_id')
    .collect()
)
# Keys are converted to str because the result is later dumped as a JSON object.
recomended_json = {str(row['par_id']): row['array_10'] for row in recomended}

In [19]:
# Display the (parameter id, recommended ids) pairs.
recomended_json.items()

dict_items([('11556', [16488, 468, 13461, 22710, 23357, 10447, 19330, 21707, 11523, 9465]), ('13702', [864, 21079, 8123, 1041, 28074, 13057, 1396, 1052, 1033, 8300]), ('16627', [11431, 12247, 12660, 5687, 17964, 16694, 12598, 11575, 12863, 21704]), ('16704', [1236, 1247, 1228, 1365, 1164, 1233, 1273, 20288, 8186, 8203]), ('21617', [21609, 21608, 21616, 21492, 21624, 21623, 21703, 21630, 21628, 21508]), ('23126', [14760, 13665, 13782, 15909, 19270, 25782, 17499, 13348, 7153, 25071])])

In [20]:
# Print one (parameter id, recommended ids) pair per line. A plain loop is
# used instead of a list comprehension so the cell does not build and echo
# a throwaway list of None values.
for item in recomended_json.items():
    print(item)

('11556', [16488, 468, 13461, 22710, 23357, 10447, 19330, 21707, 11523, 9465])
('13702', [864, 21079, 8123, 1041, 28074, 13057, 1396, 1052, 1033, 8300])
('16627', [11431, 12247, 12660, 5687, 17964, 16694, 12598, 11575, 12863, 21704])
('16704', [1236, 1247, 1228, 1365, 1164, 1233, 1273, 20288, 8186, 8203])
('21617', [21609, 21608, 21616, 21492, 21624, 21623, 21703, 21630, 21628, 21508])
('23126', [14760, 13665, 13782, 15909, 19270, 25782, 17499, 13348, 7153, 25071])


[None, None, None, None, None, None]

In [16]:
# Serialise the recommendations and write them to the result file.
with open('lab02.json', 'w') as fp:
    fp.write(json.dumps(recomended_json))

In [None]:
# Shut down the Spark session once the results have been written.
spark.stop()