In [1]:
import os
import sys

# Point PySpark at the Python interpreter and Spark installation used by this notebook.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 4 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell bootstrap in this namespace (it prints the banner and
# makes the session available as `spark`). The script is read inside a `with`
# block so the file handle is closed, and compiled with its real path so that
# tracebacks point at shell.py instead of "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
# Target courses, one entry per course: [course id, language code, title].
my_courses = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]
my_courses

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [3]:
import json
import re

import pyspark.sql.functions as f
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.sql.functions import pandas_udf, lower, col, udf, isnan, isnull, broadcast, desc, concat
from pyspark.sql.types import ArrayType, StringType, FloatType

In [4]:
# Read the course catalogue and discard rows whose description is a single space.
data = (
    spark.read
    .json("/labs/slaba02/DO_record_per_line.json")
    .filter(f.col("desc") != ' ')
)

In [5]:
# Number of courses per language, showing the three largest groups.
(
    data
    .groupby("lang")
    .count()
    .orderBy(f.desc("count"))
    .show(3)
)

+----+-----+
|lang|count|
+----+-----+
|  en|24532|
|  es| 1374|
|  ru| 1231|
+----+-----+
only showing top 3 rows



In [6]:
# Descriptions that occur more than once within a language, most repeated first
# (capped at 1000 rows).
(
    data
    .groupby("desc", "lang")
    .count()
    .orderBy(f.desc("count"))
    .filter(f.col("count") > 1)
    .take(1000)
)

[Row(desc=' The complete content is developed in High Quality 3D Animation, that will help you to understand the critical concepts. ', lang='en', count=15),
 Row(desc='\nDreamweaver Dinamik Tasarım Temelleri Kursuna Gelişmiş Düzey Konularla devam ediyoruz.\nKursun amacı PHP MySQL Öğretmektir.\nKursa Dreamweaver Öğrenenler Katılabilir.\nKurs için Dreamweaver sürümlerinden herhangi birisi yeterlidir. Category:\nTechnology ', lang='tr', count=6),
 Row(desc='\nCustomer Service professionals need to know a whole lot more about the various ways of delivering services to the customers. It is no longer sufficient just to know the basics.\nThere have been so many changes in customer expectations and with service delivery, which is why we have created a completely new series of qualifications and certifications. The Customer Service Certification Pathway has been created to support students in their aim to gain relevant qualifications that practically apply to the ever-changing world of customer

In [7]:
# (course id, language code) pairs, in the same order as `my_courses`.
langs = [(course_id, language) for course_id, language, *_ in my_courses]
langs

[(23126, 'en'),
 (21617, 'en'),
 (16627, 'es'),
 (11556, 'es'),
 (16704, 'ru'),
 (13702, 'ru')]

In [8]:
# Total number of rows in `data`.
data.count()

28131

In [9]:
# Number of distinct descriptions (compare with the total row count to gauge duplication).
data.select("desc").distinct().count()

27809

In [10]:
# Lower-case every description before tokenisation.
data = data.withColumn("desc", f.lower(f.col("desc")))

In [11]:
def clear_string(series):
    """Split every string of a pandas Series into word tokens.

    A token is a run of two or more word characters (letters, digits or
    underscore, Unicode-aware); any other character acts as a separator and
    single-character runs are discarded.

    Args:
        series: pandas Series of strings.

    Returns:
        pandas Series of the same length whose values are lists of tokens.
    """
    # Raw string literal: in a plain literal `\w` and `\d` are invalid string
    # escapes, which Python reports with a warning and plans to reject.
    token_pattern = re.compile(r'[\w\d]{2,}', re.U)
    return series.str.findall(token_pattern)

# pandas UDF around `clear_string`; each row yields an array of string tokens.
tokenizer_udf = pandas_udf(clear_string, returnType=ArrayType(StringType()))

In [12]:
# def clear_article(series):
#     prog = re.compile(u'[\w\d]{2,}', re.U)
#     for i in series:
#         print(i[2])
#         words = i[2].str.findall(regex)
#         print(words)
# #     prog = re.compile(u'[\w\d]{2,}', re.U)
# #     words = prog.match(series)
# #     return words
        # re.compile(u'[\w\d,\s]{2,}', re.U)
# []

In [13]:
# Add a `token` column with the tokens of each description, then print one
# record vertically without truncation.
pre_tokenized_data = data.withColumn("token", tokenizer_udf(f.col("desc")))
pre_tokenized_data.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Built-in stop-word lists for the three languages, plus their concatenation
# (English, then Russian, then Spanish).
stop_words_en, stop_words_rus, stop_words_es = (
    StopWordsRemover.loadDefaultStopWords(language)
    for language in ("english", "russian", "spanish")
)
stop_words = [*stop_words_en, *stop_words_rus, *stop_words_es]


In [15]:
# Transformer that reads `token` and writes the tokens left after removing the
# combined stop-word list to `no_stop_words`.
remover = StopWordsRemover(
    stopWords=stop_words,
    inputCol="token",
    outputCol="no_stop_words",
)

In [16]:
# Apply the stop-word remover; the filtered tokens are written to `no_stop_words`.
tokenized_data = remover.transform(pre_tokenized_data)
# Alternative that skips stop-word removal and copies the tokens through unchanged:
# tokenized_data = pre_tokenized_data.withColumn("no_stop_words", col("token"))

In [17]:
# Spot-check two English rows, printed vertically without truncation. The
# `lang` filter runs before the projection because the selected columns do not
# include `lang`.
(
    tokenized_data
    .filter(f.col("lang") == "en")
    .select("desc", "token", "no_stop_words")
    .show(2, False, True)
)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [18]:
# One DataFrame per language of interest.
tokenized_en, tokenized_es, tokenized_ru = (
    tokenized_data.filter(f.col("lang") == language_code)
    for language_code in ("en", "es", "ru")
)

In [19]:
# Feature stages reused for every language.
NUM_HASH_FEATURES = 10000  # length of the hashed term-frequency vector

# Raw (non-binary) term counts hashed into a fixed-size vector.
tf = HashingTF(inputCol="no_stop_words", outputCol="tf_f", numFeatures=NUM_HASH_FEATURES, binary=False)
# IDF re-weighting; minDocFreq is left at its default.
idf = IDF(inputCol="tf_f", outputCol="idf_f")
# Normalises the TF-IDF vectors (default Normalizer settings) into `norm_idf_f`.
t = Normalizer(inputCol="idf_f", outputCol="norm_idf_f")

In [20]:
# Run the TF-IDF stages on the English texts.
# Hash the stop-word-free tokens into term-frequency vectors.
hashed_data_en = tf.transform(tokenized_en)
# Fit the IDF weights on the English rows only, then apply them.
idfModel_en = idf.fit(hashed_data_en)
idfed_data_en = idfModel_en.transform(hashed_data_en)
# Normalise the TF-IDF vectors into `norm_idf_f`.
normalized_en = t.transform(idfed_data_en)

In [21]:
# Run the TF-IDF stages on the Spanish texts.
# Hash the stop-word-free tokens into term-frequency vectors.
hashed_data_es = tf.transform(tokenized_es)
# Fit the IDF weights on the Spanish rows only, then apply them.
idfModel_es = idf.fit(hashed_data_es)
idfed_data_es = idfModel_es.transform(hashed_data_es)
# Normalise the TF-IDF vectors into `norm_idf_f`.
normalized_es = t.transform(idfed_data_es)

In [22]:
# Run the TF-IDF stages on the Russian texts.
# Hash the stop-word-free tokens into term-frequency vectors.
hashed_data_ru = tf.transform(tokenized_ru)
# Fit the IDF weights on the Russian rows only, then apply them.
idfModel_ru = idf.fit(hashed_data_ru)
idfed_data_ru = idfModel_ru.transform(hashed_data_ru)
# Normalise the TF-IDF vectors into `norm_idf_f`.
normalized_ru = t.transform(idfed_data_ru)

In [23]:
# Stack the three per-language frames into one DataFrame. `union` is used
# instead of `unionAll`, which Spark 2.x marks as deprecated; the two are
# equivalent.
normalized_data = normalized_en.union(normalized_es).union(normalized_ru)
# Sanity check: row counts per language after the union.
normalized_data.groupby("lang").count().sort("count", ascending=False).show(3)

+----+-----+
|lang|count|
+----+-----+
|  en|24532|
|  es| 1374|
|  ru| 1231|
+----+-----+



In [24]:
# For every target course, score all other courses of the same language by the
# dot product of their normalised TF-IDF vectors and keep the ids of the best
# TOP_N. Ties are broken by name, then by id.
TOP_N = 10

# Only these columns are used below. Cache them so the upstream tokenise /
# TF-IDF lineage is not recomputed for each Spark action inside the loop.
course_vectors = normalized_data.select("id", "lang", "name", "norm_idf_f").cache()

result = {}
try:
    for id_cor, lng in langs:
        # Fetch just the target course's vector instead of collecting whole rows.
        target_row = (
            course_vectors
            .filter(f.col("id") == int(id_cor))
            .select("norm_idf_f")
            .first()
        )
        if target_row is None:
            raise ValueError("course id {} not found in normalized_data".format(id_cor))
        vector_cor = target_row["norm_idf_f"].toArray()

        # UDF scoring one course vector against the current target vector.
        tf_idf_cos = f.udf(lambda x: float(x.dot(vector_cor)), FloatType())
        predictions = (
            course_vectors
            .where((f.col("id") != int(id_cor)) & (f.col("lang") == lng))
            .withColumn("tf_idf_cos", tf_idf_cos(f.col("norm_idf_f")))
            .orderBy(f.desc("tf_idf_cos"), f.asc("name"), f.asc("id"))
            .head(TOP_N)
        )
        result[str(id_cor)] = [row["id"] for row in predictions]
finally:
    # The rankings are already materialised as Python lists; release the cache.
    course_vectors.unpersist()

In [25]:
# Display the mapping: target course id (as a string) -> list of recommended course ids.
result

{'23126': [13665,
  14760,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  17499,
  13348],
 '21617': [21609,
  21616,
  22298,
  21608,
  21630,
  21628,
  21081,
  21623,
  19417,
  21508],
 '16627': [11431, 17961, 5687, 17964, 12247, 16694, 5558, 12660, 11575, 9563],
 '11556': [10384, 16488, 468, 22710, 13461, 21707, 23357, 19330, 10447, 9465],
 '16704': [1219, 1327, 20362, 1228, 55, 1236, 1247, 1365, 913, 20095],
 '13702': [864, 21079, 1111, 792, 1410, 8123, 1041, 1396, 1033, 22053]}

In [26]:
# Save the recommendations as JSON in the current working directory.
with open('lab02.json', 'w') as output_file:
    json.dump(result, output_file)

In [27]:
# Stop the Spark session; this is the last step of the notebook.
spark.stop()