# Lab02 Решение

In [1]:
import os
import sys

# Bootstrap PySpark inside the notebook kernel.
# These variables must be set before the PySpark shell script is executed below.
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
# Run the PySpark shell start-up script in this namespace (it announces the
# `spark` session). The context manager closes the file handle, which the
# previous `exec(open(...).read())` form left open.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.5
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
# Load the course catalogue into a DataFrame.
# Presumably one JSON record per line, judging by the file name — confirm.
data = spark.read.json('/labs/laba02/DO_record_per_line.json')

In [3]:
# Number of rows loaded.
data.count()

28153

In [4]:
# Number of partitions of the RDD backing the DataFrame.
data.rdd.getNumPartitions()

3

In [5]:
# Show the inferred column names and types.
data.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [6]:
# Target courses for which recommendations must be produced.
# Each entry is [course id, language code, course name].
given_courses = [
    [8150, 'en', 'StatLearning: Statistical Learning'],
    [25679, 'en', 'Video Lighting Basics - Udemy'],
    [7791, 'es', 'Programaci\xf3n CNC - Fresadoras'],
    [23111, 'es', 'C\xf3mo Crear un Blog Gratis en Google Blogger - Udemy'],
    [1396, 'ru', '\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435 \u0432\u043e \u0432\u0441\u0442\u0440\u043e\u0435\u043d\u043d\u044b\u0435 \u0441\u0438\u0441\u0442\u0435\u043c\u044b \u0438 Windows Embedded CE'],
    [1348, 'ru', '\u0422\u0435\u0445\u043d\u043e\u043b\u043e\u0433\u0438\u044f Microsoft ADO .NET'],
]

In [7]:
# Display the target courses with the escaped names rendered as text.
given_courses

[[8150, 'en', 'StatLearning: Statistical Learning'],
 [25679, 'en', 'Video Lighting Basics - Udemy'],
 [7791, 'es', 'Programación CNC - Fresadoras'],
 [23111, 'es', 'Cómo Crear un Blog Gratis en Google Blogger - Udemy'],
 [1396, 'ru', 'Введение во встроенные системы и Windows Embedded CE'],
 [1348, 'ru', 'Технология Microsoft ADO .NET']]

In [8]:
# (course id, language) pairs for the target courses; the name is not needed further on.
courses_langs = [(course_id, lang) for course_id, lang, *_ in given_courses]

## HashingTF + TFIDF + dot product + l2_norm

In [9]:
from pyspark.ml.feature import HashingTF, IDF

In [10]:
from pyspark.sql.types import ArrayType, StringType, FloatType

In [11]:
import pyspark.sql.functions as f
from pyspark.sql.functions import pandas_udf
import re

def clear_string(series):
    """Tokenise every string of a pandas Series into a list of words.

    A word is a run of at least two Unicode word characters (letters, digits
    or underscore); punctuation and single-character tokens are dropped.

    Parameters
    ----------
    series : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        Series of the same length holding one list of tokens per input string.
    """
    # Raw string: '\w' and '\d' are invalid escape sequences in a normal
    # literal and trigger a DeprecationWarning on Python 3.6+.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    words = series.str.findall(regex)
    return words

# Wrap clear_string as a pandas UDF returning array<string>, so it can be applied to a DataFrame column.
tokenizer_udf = pandas_udf(clear_string, ArrayType(StringType()))

In [12]:
# Tokenise the course description (`desc`) into a new `words` column.
tokenized_data = data.withColumn("words", tokenizer_udf("desc"))

In [13]:
# Term-frequency vectors via the hashing trick: tokens from `words` are hashed
# into a fixed 10000-dimensional space, keeping raw counts (binary=False).
hashingTF = HashingTF(
    numFeatures=10000,
    binary=False,
    inputCol="words",
    outputCol="TFFeatures",
)
hashed_data = hashingTF.transform(tokenized_data)

In [14]:
# Fit IDF on the term-frequency vectors of the whole corpus, then apply the
# fitted model to write the result into the `features` column.
idf = IDF(inputCol="TFFeatures", outputCol="features")
idfModel = idf.fit(hashed_data)
idfed_data = idfModel.transform(hashed_data)

In [15]:
# Row-wise UDF rather than a pandas UDF: the previous scalar pandas UDF
# returned a plain list from Series.tolist() instead of a pandas Series, and
# Arrow-based UDFs are not expected to accept ML vector columns.
@f.udf(ArrayType(FloatType()))
def vectorToArray(row):
    """Convert an ML vector (dense or sparse) into a list of floats.

    Returns None for a null input so that null rows stay null.
    """
    if row is None:
        return None
    return row.toArray().tolist()

In [17]:
%%time

# Нормализация векторов L2, после этого для cosine_similarity будет достаточно 
# делать dot product нормализованных векторов
from pyspark.ml.feature import Normalizer
t = Normalizer(inputCol='features', outputCol='norm_features', p=2.0)

normalized_data = t.transform(idfed_data)
# Для каждого курса посчитаем косинусное расстояние до всех остальных
# выберем 10 самых похожих

target_course_reqs = {}

for course_id, lang in courses_langs:
    course_vec = normalized_data.filter(normalized_data.id == int(course_id))\
             .collect()[0]['features'].toArray()
    
    cos_sim = f.udf(lambda x: float(x.dot(course_vec)), FloatType())
    
    recs = normalized_data.where((normalized_data.id != int(course_id)) & (normalized_data.lang == lang))\
               .withColumn('cosine_sim', cos_sim(normalized_data['features']))\
               .orderBy(f.desc('cosine_sim'), f.asc('name'), f.asc('id'))\
               .head(10)
                           
    list_out = [rec['id'] for rec in recs]
    target_course_reqs.update({str(course_id): list_out})

CPU times: user 144 ms, sys: 16 ms, total: 160 ms
Wall time: 40.1 s


In [18]:
# Mapping: target course id (as a string) -> ids of the recommended courses.
target_course_reqs

{'8150': [249, 13275, 17191, 22297, 328, 6938, 7996, 11063, 7963, 18297],
 '25679': [7297, 4466, 24891, 24667, 22652, 4585, 20460, 5019, 10405, 11063],
 '7791': [26336, 26670, 7944, 18979, 17839, 21053, 10749, 10992, 23118, 23303],
 '23111': [26336,
  26670,
  7944,
  21053,
  17839,
  23118,
  23495,
  18979,
  10749,
  10992],
 '1396': [25831, 5221, 25827, 20592, 17215, 8832, 25830, 7611, 12900, 25829],
 '1348': [5221, 20592, 25831, 8832, 25827, 12963, 7604, 22553, 7173, 7611]}

In [19]:
import json
# Persist the recommendations as one JSON object in the working directory.
with open('lab02.json', 'w') as fout:
    json.dump(target_course_reqs, fout)

In [20]:
# Stop the Spark session once the notebook is finished.
spark.stop()