## Инициализация контекста и библиотек

In [1]:
import os
import sys

# Environment consumed by PySpark: worker interpreter, Spark install
# location, and the arguments handed to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ["SPARK_HOME"]

# Each entry is pushed to the front of sys.path in turn, so the py4j archive
# ends up ahead of Spark's own `python` directory.
for _subdir in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, _subdir))

In [8]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Default configuration; cluster sizing comes from PYSPARK_SUBMIT_ARGS.
conf = SparkConf()

# Create the session for this lab, or reuse one that already exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("dmitriy.sokolov.laba02")
    .getOrCreate()
)

In [9]:
# Show the session object as the cell output to confirm it was created.
spark

In [17]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message and warning in this cell's output). No later
# cell visible in this notebook plots anything or uses those names —
# consider `%matplotlib inline` or removing this cell; confirm before changing.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## Задания для нахождения похожих курсов

In [18]:
from pyspark.sql.types import *
import pyspark.sql.functions as f

In [19]:
# Courses for which similar courses must be found: (course id, language code).
task_rows = [
    (23126, u'en'),
    (21617, u'en'),
    (16627, u'es'),
    (11556, u'es'),
    (16704, u'ru'),
    (13702, u'ru'),
]
task_columns = ['id', 'lang']

tasks = spark.createDataFrame(task_rows, task_columns)

tasks.show(truncate=False)

+-----+----+
|id   |lang|
+-----+----+
|23126|en  |
|21617|en  |
|16627|es  |
|11556|es  |
|16704|ru  |
|13702|ru  |
+-----+----+



## Считывание датасета

In [20]:
%%time
df = spark.read\
          .format("json")\
          .option("sep", "|")\
          .load("/labs/slaba02/DO_record_per_line.json")

df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)

CPU times: user 3.19 ms, sys: 0 ns, total: 3.19 ms
Wall time: 2.07 s


In [21]:
# Peek at the first two records to sanity-check the loaded columns.
df.show(2)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 2 rows



## Делаем TF-IDF над полем DESC

In [22]:
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, Tokenizer

In [23]:
%%time
tokenizer = Tokenizer(inputCol="desc", outputCol="words")
wordsData = tokenizer.transform(df)

remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
wordsData = remover.transform(wordsData)

hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)
    
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

df = rescaledData
df.cache()

df.select("id", "lang", "filtered_words").show(3, truncate = True)    

+---+----+--------------------+
| id|lang|      filtered_words|
+---+----+--------------------+
|  4|  en|[course, introduc...|
|  5|  en|[online, course, ...|
|  6|  fr|[course, taught, ...|
+---+----+--------------------+
only showing top 3 rows

CPU times: user 30.2 ms, sys: 10.8 ms, total: 41.1 ms
Wall time: 23.3 s


In [24]:
def cos_sim_udf(v1):
    """Build a Spark UDF computing cosine similarity against a fixed vector.

    Args:
        v1: reference vector; must provide ``norm(p)`` and ``dot(other)``
            (the notebook passes a row's ``features`` value).

    Returns:
        A UDF (declared ``FloatType``) that maps a vector ``v2`` to
        ``v1.dot(v2) / (|v1| * |v2|)``, or ``0.0`` when either norm is zero.
    """
    # The reference norm does not depend on the row, so compute it once.
    v1_norm = float(v1.norm(2))

    def cos_sim(v2):
        denom = v1_norm * float(v2.norm(2))
        if denom == 0:
            # Must be a Python float: a UDF declared FloatType that returns an
            # int yields null in the resulting column instead of 0.
            return 0.0
        return float(v1.dot(v2)) / denom

    return f.udf(cos_sim, FloatType())

In [25]:
from pyspark.sql.functions import desc

# Maps each task course id (as a string) to the ids of its 10 most similar
# courses in the same language, most similar first.
dct_output = {}

for task in tasks.sort('id').collect():
    # Distinct names avoid shadowing the builtin `id`.
    course_id, course_lang = task[0], task[1]

    # TF-IDF vector of the task course itself. Fail with a clear message
    # rather than a bare IndexError when the id is absent from the dataset.
    matches = df.filter(df['id'] == course_id).select('features').collect()
    if not matches:
        raise ValueError('course id {} not found in the dataset'.format(course_id))
    target_vector = matches[0][0]

    top_similar = (df
                   .filter(df['lang'] == course_lang)
                   .filter(df['id'] != course_id)  # never recommend the course itself
                   .withColumn('dist', cos_sim_udf(target_vector)('features'))
                   # Ties on similarity are broken by name, then id.
                   .sort(desc('dist'), 'name', 'id')
                   .select('id')
                   .limit(10)
                   .collect()
                  )
    dct_output[str(course_id)] = [int(hit.id) for hit in top_similar]

In [26]:
import json
# Serialize the {course id: [similar course ids]} mapping and print it for inspection.
json_output = json.dumps(dct_output)
print(json_output)

{"11556": [16488, 13461, 468, 23357, 19330, 16929, 387, 10447, 11554, 9289], "13702": [864, 1216, 7173, 1052, 8313, 17017, 19613, 21017, 17015, 8082], "16627": [11431, 12247, 13021, 25010, 11575, 5687, 5372, 12863, 9598, 22680], "16704": [1365, 20645, 1426, 20105, 8217, 1236, 1164, 1219, 8123, 875], "21617": [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508, 21703], "23126": [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909]}


In [27]:
# Persist the serialized result to lab02.json in the working directory.
with open('lab02.json', 'w') as out_file:
    out_file.write(json_output)

In [10]:
# Stop the Spark session once the results have been written.
spark.stop()