In [327]:
import json
import os
import re
import sys
# Tell PySpark which interpreter and Spark installation to use, and pass the
# submit arguments (three executors) that the shell is launched with.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's bundled Python sources and the py4j archive to the import
# path; the py4j entry is inserted last, so it ends up first on sys.path.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [328]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# The application name is applied both to the SparkConf and to the builder,
# so it is kept in one constant.
APP_NAME = "baryshev konstantin"

conf = SparkConf().set("spark.app.name", APP_NAME)

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName(APP_NAME)
    .getOrCreate()
)

In [332]:
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import pandas_udf

In [342]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

from pyspark.ml.feature import Normalizer

### Загрузка данных

In [330]:
# Read the course catalogue; the path is kept in a variable so it is easy to spot.
courses_path = "/labs/slaba02/DO_record_per_line.json"
data = spark.read.json(courses_path)

### 1. Разбиение описания на слова

#### 1.1 (text - array of words)

In [331]:
# Peek at the first two loaded records.
data.show(n=2)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 2 rows



In [333]:
# A token is a run of at least two word characters. The pattern is a raw
# string so that \w and \d reach the regex engine instead of being treated as
# (invalid) Python string escapes.
pattern = re.compile(r'[\w\d]{2,}', re.U)


@pandas_udf(ArrayType(StringType()))
def get_words(description):
    """Split each description into lower-cased word tokens.

    Args:
        description: pandas Series with one description string per row.

    Returns:
        pandas Series of token lists, in order of appearance in the text.
        A missing (None) description yields an empty list.
    """
    def tokenize(text):
        # A null description has no .lower(); treat it as "no words".
        if text is None:
            return []
        return pattern.findall(text.lower())

    return description.apply(tokenize)

In [334]:
# Add a "words" column holding the token array extracted from "desc".
data = data.withColumn("words", get_words(f.col("desc")))

#### 1.2 (Hashing TF)

In [335]:
# Check that the new "words" column is present.
data.show(n=2)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
only showing top 2 rows



In [337]:
# Term-frequency featurizer: hashes the tokens of "words" into a
# 10000-dimensional vector written to "word_vector".
hasher = HashingTF(
    inputCol="words",
    outputCol="word_vector",
    numFeatures=10000,
    binary=False,
)

In [338]:
# Apply the hashing featurizer; the result gains the "word_vector" column.
featurizedData = hasher.transform(data)

In [339]:
# Peek at two featurized rows.
featurizedData.show(n=2)
# Alternative view kept for reference (only "desc" and "word_vector", passing
# extra positional display flags to show()):
#featurizedData.select("desc", "word_vector").show(2, False, True)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|         word_vector|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|(10000,[36,63,138...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|(10000,[32,222,36...|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
only showing top 2 rows



#### 1.3 IDF

In [341]:
# Fit inverse-document-frequency weights on the hashed term counts, then
# rescale every "word_vector" into the TF-IDF column "features_vec".
idf = IDF(
    inputCol="word_vector",
    outputCol="features_vec",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

### 2. Найдем схожие курсы по косинусной мере

#### 2.1 Нормировка

In [344]:
# Scale every TF-IDF vector to unit L2 norm (p=2.0) into "normFeatures".
normalizer = Normalizer(
    p=2.0,
    inputCol="features_vec",
    outputCol="normFeatures",
)
NormData = normalizer.transform(rescaledData)

In [345]:
# Peek at one fully processed row.
NormData.show(n=1)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|         word_vector|        features_vec|        normFeatures|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|(10000,[36,63,138...|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



#### 2.2 Соберем в dataframe id, lang и нормированные векторы курсов, для которых нужно сделать рекомендации

In [346]:
# Keep only the six target courses and suffix their id / vector columns with
# "_" so they do not clash with the catalogue columns in the later join.
my_coursers = (
    NormData
    .select('id', 'lang', 'normFeatures')
    .where(f.col("id").isin(23126, 21617, 16627, 11556, 16704, 13702))
    .withColumnRenamed("id", "id_")
    .withColumnRenamed("normFeatures", "normFeatures_")
)

In [347]:
# Display the selected target courses.
my_coursers.show()

+-----+----+--------------------+
|  id_|lang|       normFeatures_|
+-----+----+--------------------+
|11556|  es|(10000,[249,522,5...|
|13702|  ru|(10000,[310,942,2...|
|16627|  es|(10000,[55,76,192...|
|16704|  ru|(10000,[381,1144,...|
|21617|  en|(10000,[17,128,16...|
|23126|  en|(10000,[87,91,128...|
+-----+----+--------------------+



In [348]:
# Pair every target course with all catalogue courses that share its language.
targets = my_coursers.select('id_', 'lang', 'normFeatures_')
candidates = NormData.select('id', 'lang', 'name', 'normFeatures')
df_cross = targets.join(candidates, on=['lang'], how='left')

In [349]:
# Peek at two of the (target, candidate) pairs.
df_cross.show(n=2)

+----+-----+--------------------+---+--------------------+--------------------+
|lang|  id_|       normFeatures_| id|                name|        normFeatures|
+----+-----+--------------------+---+--------------------+--------------------+
|  en|21617|(10000,[17,128,16...|  4|Accounting Cycle:...|(10000,[36,63,138...|
|  en|21617|(10000,[17,128,16...|  5|American Counter ...|(10000,[32,222,36...|
+----+-----+--------------------+---+--------------------+--------------------+
only showing top 2 rows



In [350]:
# Drop the pairs that match a target course with itself.
df_cross = df_cross.filter(f.col("id_") != f.col("id"))

In [351]:
# Score every pair by the dot product of its two normalized vectors. Because
# both vectors were L2-normalized, this equals their cosine similarity (larger
# means more similar), although the column is named 'cos_dist'.
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType


def _dot(v1, v2):
    """Return the dot product of two ML vectors as a plain Python float."""
    return float(v1.dot(v2))


dotProductUdf = udf(_dot, DoubleType())
df_cross = df_cross.withColumn(
    'cos_dist',
    dotProductUdf('normFeatures_', 'normFeatures'),
)

In [352]:
# Peek at two scored pairs.
df_cross.show(n=2)

+----+-----+--------------------+---+--------------------+--------------------+--------------------+
|lang|  id_|       normFeatures_| id|                name|        normFeatures|            cos_dist|
+----+-----+--------------------+---+--------------------+--------------------+--------------------+
|  en|23126|(10000,[87,91,128...|  4|Accounting Cycle:...|(10000,[36,63,138...|0.021360168184388236|
|  en|21617|(10000,[17,128,16...|  4|Accounting Cycle:...|(10000,[36,63,138...| 0.07692421791027074|
+----+-----+--------------------+---+--------------------+--------------------+--------------------+
only showing top 2 rows



### 3. Выберем топ-10 и отсортируем в нужной последовательности

In [353]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Number of recommendations kept per target course.
n = 10

# Rank candidates inside each target course: highest score first, ties broken
# by course name and then by id.
ranking_order = [col("cos_dist").desc(), col("name").asc(), col("id").asc()]
w = Window.partitionBy("id_").orderBy(*ranking_order)

ranked = df_cross.withColumn("rn", row_number().over(w))
result = ranked.filter(col("rn") <= n).select("id_", "id")

In [354]:
# Peek at the first five (target id, recommended id) rows.
result.show(n=5)

+-----+-----+
|  id_|   id|
+-----+-----+
|23126|14760|
|23126|13665|
|23126|13782|
|23126|20638|
|23126|24419|
+-----+-----+
only showing top 5 rows



### 4. Загрузим в json и отправим на проверку

In [355]:
from pyspark.sql.functions import col, collect_list, sort_array, struct

# Build one array of recommended ids per target course, in rank order.
# collect_list() on its own gives no guarantee about element order after the
# groupBy shuffle, and `result` no longer carries the rank column. So the rank
# is recomputed from df_cross with the same window `w` and limit `n`, each id
# is collected together with its rank, the (rn, id) structs are sorted by
# rank, and only then are the ids extracted.
ranked_pairs = (
    df_cross.withColumn("rn", row_number().over(w))
    .where(col("rn") <= n)
    .groupBy("id_")
    .agg(sort_array(collect_list(struct("rn", "id"))).alias("ranked"))
)

collected = ranked_pairs.select("id_", col("ranked.id").alias("list_id"))

In [356]:
# Display the per-course recommendation arrays.
collected.show()

+-----+--------------------+
|  id_|             list_id|
+-----+--------------------+
|23126|[14760, 13665, 13...|
|16627|[11431, 11575, 12...|
|13702|[864, 21079, 8313...|
|16704|[1236, 1247, 1365...|
|11556|[16488, 468, 1346...|
|21617|[21609, 21616, 21...|
+-----+--------------------+



In [360]:
# Mark the aggregated frame for caching so that later actions on it can reuse
# the computed result.
collected.cache()

DataFrame[id_: bigint, list_id: array<bigint>]

In [363]:
%%time
dict_rec = {'23126': collected.where(col("id_") == '23126').select("list_id").collect()[0][0],
             '16627': collected.where(col("id_") == '16627').select("list_id").collect()[0][0],
             '13702': collected.where(col("id_") == '13702').select("list_id").collect()[0][0],
             '16704': collected.where(col("id_") == '16704').select("list_id").collect()[0][0],
             '11556': collected.where(col("id_") == '11556').select("list_id").collect()[0][0],
             '21617': collected.where(col("id_") == '21617').select("list_id").collect()[0][0]            
             }

CPU times: user 131 ms, sys: 96.8 ms, total: 228 ms
Wall time: 4.32 s


In [358]:
# Write the recommendations to the submission file.
# The handle is deliberately not named `f`: that name is the notebook-wide
# alias for pyspark.sql.functions, and rebinding it to a file object would
# break every `f.col(...)` cell that is re-run after this one.
with open('/data/home/konstantin.baryshev/lab02.json', 'w') as out_file:
    json.dump(dict_rec, out_file)

In [364]:
# Shut down the Spark session now that the results are written.
spark.stop()