In [1]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# pass the launch arguments (two executors) for the PySpark shell.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute the PySpark shell bootstrap script in this namespace. The script is
# read inside a `with` block so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Start from a SparkConf with no explicit overrides and obtain a session for
# the application named "test" via getOrCreate().
conf = SparkConf()

session_builder = SparkSession.builder.config(conf=conf).appName("test")
spark = session_builder.getOrCreate()

In [3]:
# Show the active SparkSession in the cell output.
spark

In [4]:
# spark.stop()

# Вариант 10

In [5]:
# Courses assigned to variant 10: one [course id, language code, course name]
# entry per course. Non-ASCII names are kept as \u escapes.
variant_10 = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [6]:
# Display the variant's course list.
variant_10

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [7]:
# Ids of the variant's courses (first field of each entry), in list order.
variant_10_id = [course_id for course_id, _lang, _name in variant_10]
variant_10_id

[23126, 21617, 16627, 11556, 16704, 13702]

In [8]:
# Distinct language codes of the variant's courses. A set removes the
# duplicates; sorting makes the order deterministic (plain set iteration
# order for strings can differ between interpreter runs).
variant_10_lang = sorted({course[1] for course in variant_10})
variant_10_lang

['en', 'es', 'ru']

# Читаем данные

In [9]:
# Load the course records from JSON, drop exact duplicate rows and cache the
# frame because the following cells reuse it.
df = (
    spark.read
    .json("/labs/slaba02/DO_record_per_line.json")
    .distinct()
    .cache()
)

In [10]:
# Print the schema Spark inferred for the loaded records.
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [11]:
# Lower-case the description and category columns, overwriting them in place.
for column_name in ("desc", "cat"):
    df = df.withColumn(column_name, F.lower(F.col(column_name)))

In [12]:
# Build the cleaned description column "desc_r" in three passes:
#   1. replace every character that is not a digit, a Latin or Cyrillic
#      lower-case letter, a regex word character or an accented Spanish
#      letter with a space;
#   2. collapse each run of whitespace into a single space;
#   3. strip leading and trailing whitespace.
# Only the first "^" inside a character class negates it; a "^" repeated
# between the ranges is a literal caret and would make "^" survive the
# cleanup, so the class lists the ranges without separators.
# Raw strings keep "\w" and "\s" from being treated as Python escapes.
df = (
    df
    .withColumn("desc_r", F.regexp_replace(F.col("desc"), r'[^0-9а-яёa-z\wáéíóúñ]', ' '))
    .withColumn("desc_r", F.regexp_replace(F.col("desc_r"), r"(\s+)", " "))
    .withColumn("desc_r", F.regexp_replace(F.col("desc_r"), r"^\s+|\s+$", ""))
)

In [13]:
# Number of courses per language, largest groups first (top 10 shown).
lang_counts = df.groupBy("lang").count()
lang_counts.orderBy(F.desc("count")).show(10)

+----+-----+
|lang|count|
+----+-----+
|  en|24553|
|  es| 1374|
|  ru| 1231|
|  pt|  187|
|  zh|  169|
|  de|  166|
|  tr|  120|
|  fr|  104|
|  ja|   77|
|  it|   62|
+----+-----+
only showing top 10 rows



Языки всех необходимых нам курсов вошли в ТОП-3.

In [14]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover

In [15]:
# sentenceData = sentenceData.withColumn("desc_f", F.concat_ws(",", "desc_f_"))

In [16]:
# --- Stage definitions -------------------------------------------------------
# Split the cleaned description into tokens.
tokenizer = Tokenizer(inputCol="desc_r", outputCol="words_")
# Drop stop words (StopWordsRemover defaults; no custom list is supplied).
remover = StopWordsRemover(inputCol="words_", outputCol="words")
# Hash the tokens into a 10000-dimensional term-frequency vector.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
# Re-weight the term frequencies by inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol="features")
# Scale each TF-IDF vector using the Normalizer's default settings.
normalizer = Normalizer(inputCol="features", outputCol="norm")

# --- Apply the stages in order ------------------------------------------------
wordsData = tokenizer.transform(df)
removerData = remover.transform(wordsData)
featurizedData = hashingTF.transform(removerData)
idfModel = idf.fit(featurizedData)  # IDF weights are fitted on the full frame
tfidf = idfModel.transform(featurizedData)
data = normalizer.transform(tfidf)

In [17]:
# Show a single row with all columns added by the stages above.
data.show(1)

+--------------------+--------------------+---+----+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|provider|              desc_r|              words_|               words|         rawFeatures|            features|                norm|
+--------------------+--------------------+---+----+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|3/business_manage...|курс посвящен ист...|854|  ru|Экономическая ист...|  Intuit|курс посвящен ист...|[курс, посвящен, ...|[курс, посвящен, ...|(10000,[430,2428,...|(10000,[430,2428,...|(10000,[430,2428,...|
+--------------------+--------------------+---+----+--------------------+--------+--------------------+--------------------+--------------------+-------------------

## Оставляем 2 DF: с нашими курсами и весь DF без наших курсов, с языковым соответствием.

In [18]:
# Rows for the variant's own courses; cached because they are reused below.
is_variant_course = F.col("id").isin(variant_10_id)
df_target = data.where(is_variant_course).cache()

In [19]:
# Count the target courses per language.
df_target.groupBy("lang").count().show()

+----+-----+
|lang|count|
+----+-----+
|  en|    2|
|  es|    2|
|  ru|    2|
+----+-----+



In [20]:
# Candidate pool: every course that is not one of the variant's courses and
# whose language is one of the variant's languages.
not_variant_course = ~F.col("id").isin(variant_10_id)
in_variant_language = F.col("lang").isin(variant_10_lang)
df_train = data.where(not_variant_course & in_variant_language).cache()

In [21]:
# Count the candidate courses per language.
df_train.groupBy("lang").count().show()

+----+-----+
|lang|count|
+----+-----+
|  en|24551|
|  es| 1372|
|  ru| 1229|
+----+-----+



In [22]:
def _vector_dot(left, right):
    """Return the dot product of two vectors as a Python float.

    Returns None (a SQL null) when either vector is missing, which can
    happen for unmatched rows of an outer join, instead of raising inside
    the executor.
    """
    if left is None or right is None:
        return None
    return float(left.dot(right))


# Spark UDF wrapper; the result column is typed as double.
dot_udf = F.udf(_vector_dot, DoubleType())

In [58]:
# Pair every target course ("i") with the candidate courses ("j") of the same
# language, using a left join so each target row is kept.
pairs = df_target.alias("i").join(
    df_train.alias("j"),
    F.col("i.lang") == F.col("j.lang"),
    "left",
)

# Score each pair with the dot product of the "norm" vectors, then rank by
# score (highest first) with ties broken by the candidate id (ascending).
result = (
    pairs
    .select(
        F.col("i.id").alias("id_target"),
        F.col("j.id").alias("id_train"),
        F.col("i.lang").alias("lang_target"),
        F.col("j.lang").alias("lang_train"),
        F.col("i.desc").alias("desc_target"),
        F.col("j.desc").alias("desc_train"),
        dot_udf("i.norm", "j.norm").alias("sim_cosine"),
    )
    .orderBy(F.col("sim_cosine").desc(), F.col("id_train").asc())
    .cache()
)

In [59]:
# Mapping filled in below: target course id (as a string) -> candidate ids.
lab2 = dict()

In [60]:
# For every target course store the ids of its first 10 candidates.
# `result` is already ordered by sim_cosine (descending) and then id_train,
# so take(10) on the filtered rows is used as the top 10 for that course.
# Keys are strings because the mapping is written out as JSON.
for id_ in variant_10_id:
    top_rows = (
        result
        .filter(F.col("id_target") == F.lit(id_))
        .select("id_train")
        .take(10)
    )
    # Assign the key directly instead of rebuilding the whole dict each pass.
    lab2[str(id_)] = [row["id_train"] for row in top_rows]

In [61]:
# Display the collected mapping of target course id -> candidate ids.
lab2

{'23126': [13665,
  14760,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  17499,
  13348],
 '21617': [21609,
  21616,
  22298,
  21608,
  21630,
  21628,
  21081,
  19417,
  21623,
  21508],
 '16627': [11431, 12247, 5687, 11575, 17964, 17961, 12660, 25010, 5558, 12863],
 '11556': [16488, 468, 23357, 13461, 19330, 7833, 9289, 10447, 16929, 10384],
 '16704': [1273, 20288, 1247, 1236, 8203, 1365, 1233, 1164, 20645, 1426],
 '13702': [864, 1052, 8082, 1216, 8313, 19613, 7173, 17017, 21079, 21017]}

In [62]:
# Write the mapping to lab02.json in the current working directory.
with open("lab02.json", "w") as out_file:
    json.dump(lab2, out_file)