## Лабораторная работа 2

**Импорт библиотек**

In [1]:
import os
import sys

# Point PySpark at the interpreter and Spark installation to use, and pass
# submit arguments (three executors) before the shell bootstrap below runs.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

# SPARK_HOME was set just above, so direct indexing cannot fail here.
spark_home = os.environ["SPARK_HOME"]

# Make the bundled PySpark sources and py4j importable.
# Insert order matters: the py4j archive ends up first on sys.path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell bootstrap in this namespace (its banner reports
# "SparkSession available as 'spark'"). The file is read inside a context
# manager so the handle is closed instead of being left to the garbage collector.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.functions import col, pandas_udf, split, lower, udf
from pyspark.sql.types import LongType, StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover

# Configuration object carrying the application name.
# NOTE(review): `conf` is not passed to any context in the visible notebook,
# so it presumably has no effect on the already-running session - confirm.
conf = SparkConf().setAppName("Spark Lab 2")
conf

<pyspark.conf.SparkConf at 0x7fccf01bc9e8>

In [3]:
# Echo `sc` as the cell output (presumably the SparkContext created by the shell bootstrap).
sc

**Обработка данных**

In [4]:
# Explicit schema for the course records: every field is a string except the
# integer `id`. All fields keep the default nullability.
schema = (
    StructType()
    .add("cat", StringType())
    .add("desc", StringType())
    .add("id", IntegerType())
    .add("lang", StringType())
    .add("name", StringType())
    .add("provider", StringType())
)

In [5]:
# Load the one-record-per-line JSON file with the explicit schema and keep it
# cached, because several later cells read it.
df = (
    spark.read
    .schema(schema)
    .json("/labs/slaba02/DO_record_per_line.json")
    .cache()
)

In [6]:
# Preview the first five loaded records.
df.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



Токенизируем предложения

In [7]:
# Split each course description (`desc`) into a list of tokens (`tokens`).
tokenizer = Tokenizer().setInputCol("desc").setOutputCol("tokens")
df = tokenizer.transform(df)

Убираем стоп-слова, которые не несут дополнительного смысла

In [8]:
# Drop stop words from `tokens`, writing the remaining words to `words`.
# The remover's default stop-word list is used.
remover = StopWordsRemover().setInputCol("tokens").setOutputCol("words")
df = remover.transform(df)

In [9]:
# Keep only the columns needed downstream: id, name, lang, words.
df = df.select('id', 'name', 'lang', 'words')

In [10]:
# Preview the first five rows after tokenisation and stop-word removal.
df.show(5)

+---+--------------------+----+--------------------+
| id|                name|lang|               words|
+---+--------------------+----+--------------------+
|  4|Accounting Cycle:...|  en|[course, introduc...|
|  5|American Counter ...|  en|[online, course, ...|
|  6|Arithmétique: en ...|  fr|[course, taught, ...|
|  7|Becoming a Dynami...|  en|[live, digitally,...|
|  8|           Bioethics|  en|[self-paced, cour...|
+---+--------------------+----+--------------------+
only showing top 5 rows



Применяем TF-IDF, чтобы оценить значимость каждого слова в описании курса

In [None]:
# Term frequencies: hash every word into one of 10000 buckets.
hashingTF = (
    HashingTF()
    .setInputCol("words")
    .setOutputCol("raw_features")
    .setNumFeatures(10000)
)
tf = hashingTF.transform(df)

# Inverse document frequencies are fitted on the whole corpus, then used to
# reweight the raw term-frequency vectors.
idf = (
    IDF()
    .setInputCol("raw_features")
    .setOutputCol("tf_idf_features")
    .fit(tf)
)
tf_idf = idf.transform(tf)

In [None]:
# Preview the first five rows with the TF and TF-IDF vector columns.
tf_idf.show(5)

Нормируем векторы

In [13]:
# Scale every TF-IDF vector to unit L2 norm (p=2.0 is the Normalizer default,
# spelled out here because the cosine computation below relies on it).
normalizer = Normalizer(p=2.0, inputCol="tf_idf_features", outputCol="features")
data = normalizer.transform(tf_idf)

In [14]:
# Preview the normalised feature vectors for the first five courses.
data.select('id', 'features').show(5)

+---+--------------------+
| id|            features|
+---+--------------------+
|  4|(10000,[36,42,63,...|
|  5|(10000,[32,222,29...|
|  6|(10000,[30,41,246...|
|  7|(10000,[493,721,8...|
|  8|(10000,[32,65,115...|
+---+--------------------+
only showing top 5 rows



Теперь описание каждого курса представлено в виде вектора.

Если мы сравним разные векторы, рассчитав меру близости между ними, то узнаем, насколько они похожи (то есть насколько похожи описания курсов).

В качестве метрики для ранжирования курсов будем использовать косинус угла между TF-IDF-векторами. Так как векторы нормированы, косинус равен их скалярному произведению.


In [15]:
# Cosine of the angle between two normalised vectors is just their dot product.
# The return type is declared explicitly: a bare @udf defaults to StringType,
# which would send the number through a string and force callers to cast it back.
@udf(DoubleType())
def sim_cos(v1, v2):
    """Return the dot product of two ML vectors as a Python float.

    Both arguments are expected to be unit-length vectors (the `features`
    column produced by the Normalizer), so the dot product equals their
    cosine similarity.
    """
    return float(v1.dot(v2))

In [16]:
# Target courses for which recommendations are built: [course id, language, title].
courses_list = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    # "Programming in Lazarus"
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    # "Mathematical Economics"
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [17]:
# Ids of the target courses, in the same order as courses_list.
courses_id = [course_id for course_id, _lang, _title in courses_list]
courses_id

[23126, 21617, 16627, 11556, 16704, 13702]

In [18]:
# Preview two full rows, including all intermediate vector columns.
data.show(2)

+---+--------------------+----+--------------------+--------------------+--------------------+--------------------+
| id|                name|lang|               words|        raw_features|     tf_idf_features|            features|
+---+--------------------+----+--------------------+--------------------+--------------------+--------------------+
|  4|Accounting Cycle:...|  en|[course, introduc...|(10000,[36,42,63,...|(10000,[36,42,63,...|(10000,[36,42,63,...|
|  5|American Counter ...|  en|[online, course, ...|(10000,[32,222,29...|(10000,[32,222,29...|(10000,[32,222,29...|
+---+--------------------+----+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [28]:
# Pairwise dot products between each target course (v1) and every other course (v2).
# The left side is restricted to the target courses before the non-equi self-join,
# so the restriction is explicit rather than dependent on optimizer pushdown.
targets = data.filter(F.col("id").isin(courses_id)).alias("v1")
candidates = data.alias("v2")

# The result is reused by several later actions (the ranking preview and one query
# per target course); caching avoids re-running the join and the Python UDF each time.
sim_cos_mat = targets.join(candidates, F.col("v1.id") != F.col("v2.id"))\
    .select(
        sim_cos("v1.features", "v2.features").cast('float').alias("sim_cosine"),
        F.col("v1.name").alias("v1_name"),
        F.col("v2.name").alias("v2_name"),
        F.col("v1.lang").alias("v1_lang"),
        F.col("v2.lang").alias("v2_lang"),
        F.col("v1.id").alias("v1"),
        F.col("v2.id").alias("v2"))\
    .sort("v1", "v2")\
    .cache()

sim_cos_mat.show(5)

+-----------+--------------------+--------------------+-------+-------+-----+---+
| sim_cosine|             v1_name|             v2_name|v1_lang|v2_lang|   v1| v2|
+-----------+--------------------+--------------------+-------+-------+-----+---+
|        0.0|Aprendizaje Colab...|Accounting Cycle:...|     es|     en|11556|  4|
|        0.0|Aprendizaje Colab...|American Counter ...|     es|     en|11556|  5|
| 0.14799482|Aprendizaje Colab...|Arithmétique: en ...|     es|     fr|11556|  6|
|0.007118165|Aprendizaje Colab...|Becoming a Dynami...|     es|     en|11556|  7|
|0.033207458|Aprendizaje Colab...|           Bioethics|     es|     en|11556|  8|
+-----------+--------------------+--------------------+-------+-------+-----+---+
only showing top 5 rows



In [29]:
# Show the 20 (target, candidate) pairs with the highest cosine similarity.
sim_cos_mat.select('v1', 'v2', 'sim_cosine')\
    .orderBy(F.desc('sim_cosine'))\
    .show(20)

+-----+-----+----------+
|   v1|   v2|sim_cosine|
+-----+-----+----------+
|13702|  864|       1.0|
|21617|21609| 0.9899213|
|16627|11431| 0.7066007|
|11556|16488| 0.6690018|
|16627|12247| 0.6110074|
|16627|13021|0.60648364|
|16627|25010|0.60380995|
|16627|11575|0.58877224|
|16627| 5687| 0.5809286|
|16627| 5372|0.57968974|
|16627|12863| 0.5773048|
|16627| 9598| 0.5746188|
|16627|22680|0.57152116|
|16627|12660|0.57034713|
|16627|16769|0.57012373|
|16627|10738|0.56807315|
|16627| 9470| 0.5680234|
|16627|17961|0.56308526|
|16627| 7296| 0.5598723|
|16627| 6864| 0.5590431|
+-----+-----+----------+
only showing top 20 rows



Для каждого заданного курса выбираем 10 наиболее похожих курсов на том же языке и сохраняем результат в JSON

In [30]:
# For each target course collect the ids of the 10 most similar courses written
# in the same language. Ties on similarity are broken by candidate name, then id.
preds = {}
for course_id, course_lang, _title in courses_list:
    ranked = (
        sim_cos_mat
        .filter((F.col('v1') == course_id) & (F.col('v2_lang') == course_lang))
        .orderBy(F.col('sim_cosine').desc(),
                 F.col('v2_name').asc(),
                 F.col('v2').asc())
        .select('v2')
        .limit(10)
    )
    preds[course_id] = [row['v2'] for row in ranked.collect()]

preds

{23126: [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909],
 21617: [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508, 21703],
 16627: [11431, 12247, 13021, 25010, 11575, 5687, 5372, 12863, 9598, 22680],
 11556: [16488, 13461, 468, 23357, 19330, 16929, 387, 10447, 11554, 9289],
 16704: [1365, 20645, 1426, 20105, 8217, 1236, 1164, 1219, 8123, 875],
 13702: [864, 1216, 7173, 1052, 8313, 17017, 19613, 21017, 17015, 8082]}

In [31]:
import json

# Serialise the recommendations as a single JSON line and write it to lab02.json.
# json.dumps turns the integer course ids used as dict keys into strings.
payload = json.dumps(preds) + '\n'
with open('lab02.json', 'wt') as out:
    out.write(payload)

In [32]:
# Stop the Spark context now that the results have been written.
sc.stop()