In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("Tatiana Gavrikova")
         .getOrCreate())

In [3]:
spark

In [4]:
sc = spark.sparkContext
sc

In [5]:
need_to_predict_for = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']
]

In [6]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [7]:
schema = StructType(fields=[
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("cat", StringType()),
    StructField("lang", StringType()),
    StructField("provider", StringType()),
    StructField("desc", StringType())
])

In [8]:
courses = spark.read\
          .schema(schema)\
          .format("json")\
          .load("/labs/slaba02/DO_record_per_line.json")\
          .cache()

In [9]:
courses.rdd.getNumPartitions()

2

In [10]:
courses = courses.repartition(6)

In [11]:
courses = courses[[courses.id, courses.lang, courses.desc]]

Разделяем по языкам (на самом деле слишком рано, потом уже поняла, можно было чуть позже ветвление сделать)

In [12]:
dfs = {}
for lang in set([row[1] for row in need_to_predict_for]):
    dfs[lang] = courses.filter(courses.lang == lang)
dfs

{'es': DataFrame[id: int, lang: string, desc: string],
 'en': DataFrame[id: int, lang: string, desc: string],
 'ru': DataFrame[id: int, lang: string, desc: string]}

## Шаг 1: разбиаваем описание на слова

In [13]:
from pyspark.ml.feature import Tokenizer

In [14]:
tokenizer = Tokenizer(inputCol="desc", outputCol="words")

In [15]:
dfs_1 = {}
for lang in dfs:
    dfs_1[lang] = tokenizer.transform(dfs[lang])
dfs_1

{'es': DataFrame[id: int, lang: string, desc: string, words: array<string>],
 'en': DataFrame[id: int, lang: string, desc: string, words: array<string>],
 'ru': DataFrame[id: int, lang: string, desc: string, words: array<string>]}

## Шаг 2: Чистим данные

Количество курсов без описания

In [16]:
for lang in dfs:
    _df = dfs_1[lang]
    print(lang, _df.filter(F.size(_df.words) == 0).count())

es 0
en 27
ru 0


In [17]:
_df = dfs_1['en']
ids = [row[0] for row in need_to_predict_for if row[1] == 'en']
if _df.filter(_df.id.isin(ids)).filter(F.size(_df.words) == 0).count() > 0:
    print('В интересущих нас курсах нет описания')

Можно удалить те, что без описания, среди них нет пересечения с интересущими нас

In [18]:
dfs_2 = {}
for lang in dfs_1:
    _df = dfs_1[lang]
    dfs_2[lang] = _df.filter(F.size(_df.words) != 0)
dfs_2

{'es': DataFrame[id: int, lang: string, desc: string, words: array<string>],
 'en': DataFrame[id: int, lang: string, desc: string, words: array<string>],
 'ru': DataFrame[id: int, lang: string, desc: string, words: array<string>]}

## Шаг 3: Убираем стоп слова

In [19]:
from pyspark.ml.feature import StopWordsRemover

In [20]:
langs_dict = {'en': 'english', 'ru': 'russian', 'es': 'spanish'}

In [21]:
dfs_3 = {}
for lang in dfs_2:
    _df = dfs_2[lang]
    stopWords = StopWordsRemover.loadDefaultStopWords(langs_dict[lang])
    swr = StopWordsRemover(
        inputCol='words',
        outputCol='words_filtered',
        stopWords=stopWords
    )
    dfs_3[lang] = swr.transform(_df)
dfs_3

{'es': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>],
 'en': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>],
 'ru': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>]}

## Шаг 4: Векторизуем описание

In [22]:
from pyspark.ml.feature import CountVectorizer

In [23]:
dfs_4 = {}
for lang in dfs_3:
    _df = dfs_3[lang]
    countVectorizer = CountVectorizer(inputCol='words_filtered', outputCol="word_vector", vocabSize=10000)
    countVectorizerModel = countVectorizer.fit(_df)
    dfs_4[lang] = countVectorizerModel.transform(_df)
dfs_4

{'es': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>, word_vector: vector],
 'en': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>, word_vector: vector],
 'ru': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>, word_vector: vector]}

## Шаг 5: Обратная частота слов

In [24]:
from pyspark.ml.feature import IDF

In [32]:
dfs_5 = {}
for lang in dfs_4:
    _df = dfs_4[lang]
    idf = IDF(inputCol="word_vector", outputCol="features", minDocFreq=1)
    idfModel = idf.fit(_df)
    dfs_5[lang] = idfModel.transform(_df).cache()
dfs_5

{'es': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>, word_vector: vector, features: vector],
 'en': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>, word_vector: vector, features: vector],
 'ru': DataFrame[id: int, lang: string, desc: string, words: array<string>, words_filtered: array<string>, word_vector: vector, features: vector]}

Подготовка данных закончена

# Вычисление косинусовой близости векторов (похожесть описания)

Вектора искомых курсов

In [38]:
vectors = {}
for _id, lang, _ in need_to_predict_for:
    _df = dfs_5[lang]
    vectors[_id] = [lang, _df.filter(_df.id == _id).select('features').take(1)[0].features]

In [33]:
def cosine_similarity_for_spec_vec(a):
    return F.udf(lambda b: float(a.dot(b) / (a.norm(2) * b.norm(2))), FloatType())

In [43]:
result = {}
for _id in vectors:
    lang, vector = vectors[_id]
    _df = dfs_5[lang]
    rows_id = _df.filter(_df.id != _id)\
        .withColumn('similarity', cosine_similarity_for_spec_vec(vector)(_df.features))\
        .orderBy('similarity', ascending=False)\
        .select(_df.id)\
        .take(10)
    result[_id] = [row.id for row in rows_id] 

In [44]:
result

{23126: [22037, 8010, 12081, 13782, 13665, 15909, 13348, 25782, 17329, 17499],
 21617: [12081, 22037, 8010, 21609, 21608, 21616, 21492, 21624, 21630, 21628],
 16627: [11431, 12247, 5687, 12660, 17964, 9598, 5558, 9563, 16694, 4188],
 11556: [16488, 13461, 22710, 468, 10447, 12679, 19330, 23357, 11529, 13776],
 16704: [1228, 1327, 20362, 18331, 1365, 8186, 26980, 20645, 1236, 1374],
 13702: [864, 21079, 8123, 1041, 1052, 22053, 1396, 8313, 1033, 1217]}

In [45]:
import json
with open("lab02.json", "w") as outfile:
    json.dump(result, outfile)

In [46]:
spark.stop()