In [1]:
import os
import sys
# Point PySpark at the interpreter and Spark installation to use; these must be
# set before the PySpark shell script below is executed.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j packages shipped with this Spark install importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell start-up script in this namespace. The context manager
# closes the script file as soon as it has been read, instead of leaving the
# handle open for the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Build the session from a default SparkConf and then set the application name.
# The config() -> appName() order is kept on purpose: a name set afterwards wins
# over any app name the conf may already carry.
conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [3]:
spark

### Импорт библиотек

In [5]:
from pyspark.sql.functions import col, asc, desc, lower #lit, #array
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
import pyspark.sql.functions as psf
from pyspark.ml import Pipeline
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

import warnings
warnings.filterwarnings("ignore")

In [6]:
import re #регулярные выражения
import tqdm # прогресс-бар

### Загрузка датасета и его репартицирование

In [7]:
# Items to build recommendations for, one [id, language code, title] entry each.
# The recommendation loop reads only the id and the language code.
my_films = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [8]:
# Load the raw dataset; no schema is passed, so the JSON reader infers one.
df = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [9]:
df.rdd.getNumPartitions()

3

In [10]:
# Spread the rows over 6 partitions (the freshly loaded frame reported 3),
# then display the new partition count as a check.
df = df.repartition(6)
df.rdd.getNumPartitions()

6

### Оставим только нужные столбцы

In [13]:
df.show()

+--------------------+--------------------+-----+----+--------------------+--------+
|                 cat|                desc|   id|lang|                name|provider|
+--------------------+--------------------+-----+----+--------------------+--------+
|3/business_manage...| Unique video ske...|10209|  en|Learn How To Writ...|   Udemy|
|   1/arts_music_film|Explores such ind...|15686|  en|Photoshop CS5 One...|   Lynda|
|  5/computer_science|Learn the basic c...|11858|  en|Human Anatomy and...|   ed2go|
|   1/arts_music_film|How to streamline...|15821|  en|Premiere Pro CS4 ...|   Lynda|
|                    |
Hola a Todos!
En...|10037|  es|Introduccion Visu...|   Udemy|
|   1/arts_music_film|
En este curso ve...|11682|  es|                Nuke|   Udemy|
|   1/arts_music_film|A tour of editing...|13931|  en|After Effects App...|   Lynda|
|3/business_manage...|The basics: using...|15210|  en|Learning Mac OS X...|   Lynda|
|                    | A Comprehensive ...| 9909|  en|Video Optim

In [14]:
# Keep only the columns used downstream, in the order they already have in `df`;
# a wanted column that is absent from `df` is skipped rather than raising.
id_desc_lang = df.select(*(name for name in df.columns if name in ('id', 'desc', 'lang')))

### Чистка текста с помощью pyspark.sql.functions

In [15]:
# Spark's bundled stop-word lists for the three languages processed below.
stop_words_en, stop_words_ru, stop_words_es = (
    StopWordsRemover.loadDefaultStopWords(language)
    for language in ("english", "russian", "spanish")
)

In [16]:
def clean_text(data, column):
    """Lower-case a text column and strip punctuation characters from it.

    Args:
        data: Spark DataFrame that contains ``column``.
        column: Name of the string column to clean; it is overwritten with
            the cleaned text.

    Returns:
        A new DataFrame in which ``column`` has been lower-cased and had every
        character of the punctuation class removed. Other columns are untouched.
    """
    # Same character class as before; the square brackets are now escaped with
    # an explicit double backslash, so Python no longer has to tolerate the
    # invalid "\[" / "\]" string escapes to produce the identical pattern.
    punctuation = '[!@"“’«»#$%&\'()*+,—/:;<=>?^_`{|}~\\[\\]]'
    # Read from `column` rather than a hard-coded 'desc', and chain both steps
    # in one expression so the punctuation pass cannot overwrite the
    # lower-casing. For column == 'desc' the result is unchanged.
    cleaned = F.regexp_replace(lower(col(column)), punctuation, '')
    return data.withColumn(column, cleaned)

In [17]:
# Clean the description text; the cleaned result replaces the `desc` column.
id_desc_lang_clean = clean_text(id_desc_lang, 'desc')

### Разделим датасет по языкам фильмов, для которых нужно дать рекомендацию, то есть на ru, en и es.

In [18]:
# One (id, desc) frame per language code.
ru_data = id_desc_lang_clean.where(F.col('lang') == 'ru').select('id', 'desc')
en_data = id_desc_lang_clean.where(F.col('lang') == 'en').select('id', 'desc')
es_data = id_desc_lang_clean.where(F.col('lang') == 'es').select('id', 'desc')

In [19]:
ru_data.count()

1231

In [20]:
en_data.count()

24553

In [21]:
es_data.count()

1374

### Расчет количества уникальных слов в каждом датасете (это пригодится при выборе кол-ва фичей для TF-IDF)

In [22]:
# Note to self: revisit bag-of-words / TF-IDF to understand this step.
# Pull every description to the driver, one list of Rows per language, so the
# distinct tokens can be counted locally. The list order is ru, en, es.
list_ru = ru_data.select('desc').collect()
list_en = en_data.select('desc').collect()
list_es = es_data.select('desc').collect()
lst_full_language = [list_ru, list_en, list_es]

In [24]:
# Count the distinct whitespace-separated tokens of each language's
# descriptions; the pipelines below use these totals as numFeatures.
for rows in lst_full_language:
    vocabulary = {token for row in rows for token in row[0].split()}
    print(len(vocabulary))

20335
347337
57222


Ru = 20335

En = 347337

Es = 57222

## Построение Pipeline (выход предыдущего шага является входом для следующего)

### Pipeline for english films

In [25]:
# English TF-IDF pipeline: tokenize -> drop stop words -> hashed term counts -> IDF.

# Split the description text into tokens.
tokenizer = Tokenizer(outputCol="words", inputCol="desc")

swr_en = StopWordsRemover(
    stopWords=stop_words_en,
    inputCol="words",
    outputCol="words_filtered_en",
)

hasher = HashingTF(
    inputCol="words_filtered_en",
    outputCol="word_vector",
    numFeatures=347337,  # distinct-token count printed for the English subset
    binary=False,
)

idf = IDF(outputCol="features", inputCol="word_vector")

pipeline_en = Pipeline(stages=[tokenizer, swr_en, hasher, idf])

In [26]:
# Fit the pipeline on the English descriptions, then transform the same rows
# and keep only each row's id and its final `features` vector.
pipeline_model_en = pipeline_en.fit(en_data)
tr_df_en = pipeline_model_en.transform(en_data).select('id', 'features')

In [27]:
tr_df_en.show()

+-----+--------------------+
|   id|            features|
+-----+--------------------+
|22604|(347337,[914,2479...|
|25900|(347337,[984,1659...|
|25089|(347337,[914,2760...|
|27393|(347337,[2626,334...|
|24400|(347337,[83,2760,...|
|22865|(347337,[1036,244...|
|27686|(347337,[43543,52...|
|23287|(347337,[914,2368...|
|26899|(347337,[31072,62...|
|20791|(347337,[10092,10...|
|26985|(347337,[16327,27...|
|23839|(347337,[1587,256...|
|24369|(347337,[984,1637...|
|27131|(347337,[1927,534...|
|24432|(347337,[2479,276...|
|22624|(347337,[2479,276...|
|26787|(347337,[630,4291...|
|22119|(347337,[43142,47...|
|24041|(347337,[973,5066...|
|27910|(347337,[9656,163...|
+-----+--------------------+
only showing top 20 rows



### Pipeline for russian films

In [28]:
# Russian TF-IDF pipeline: tokenize -> drop Russian stop words -> hashed term counts -> IDF.
tokenizer = Tokenizer(inputCol="desc", outputCol="words")

# Named after the language it handles. This stage was previously a copy-pasted
# `swr_en` writing to "words_filtered_en", which mislabelled Russian tokens.
# The intermediate column is consumed only by the hasher below.
swr_ru = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered_ru", stopWords=stop_words_ru)

# numFeatures is the distinct-token count printed for the Russian subset.
hasher = HashingTF(numFeatures=20335, binary=False, inputCol=swr_ru.getOutputCol(), outputCol="word_vector")

idf = IDF(inputCol="word_vector", outputCol="features")

pipeline_ru = Pipeline(stages=[
    tokenizer,
    swr_ru,
    hasher,
    idf
])

In [29]:
# Fit the pipeline on the Russian descriptions, then transform the same rows
# and keep only each row's id and its final `features` vector.
pipeline_model_ru = pipeline_ru.fit(ru_data)
tr_df_ru = pipeline_model_ru.transform(ru_data).select('id', 'features')

### Pipeline for spanish films

In [30]:
# Spanish TF-IDF pipeline: tokenize -> drop Spanish stop words -> hashed term counts -> IDF.
tokenizer = Tokenizer(inputCol="desc", outputCol="words")

# Named after the language it handles. This stage was previously a copy-pasted
# `swr_en` writing to "words_filtered_en", which mislabelled Spanish tokens.
# The intermediate column is consumed only by the hasher below.
swr_es = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered_es", stopWords=stop_words_es)

# numFeatures is the distinct-token count printed for the Spanish subset.
hasher = HashingTF(numFeatures=57222, binary=False, inputCol=swr_es.getOutputCol(), outputCol="word_vector")

idf = IDF(inputCol="word_vector", outputCol="features")

pipeline_es = Pipeline(stages=[
    tokenizer,
    swr_es,
    hasher,
    idf
])

In [31]:
# Fit the pipeline on the Spanish descriptions, then transform the same rows
# and keep only each row's id and its final `features` vector.
pipeline_model_es = pipeline_es.fit(es_data)
tr_df_es = pipeline_model_es.transform(es_data).select('id', 'features')

### Расчет косинусного сходства

In [32]:
def cos_sim(v, u):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must provide ``dot(other)`` and ``norm(p)``. The result is
    the dot product divided by the product of the two L2 norms. Zero-length
    vectors are not guarded against; what the division then yields depends on
    the numeric type returned by ``dot`` / ``norm``.
    """
    dot_product = v.dot(u)
    norm_product = v.norm(2) * u.norm(2)
    return float(dot_product / norm_product)

In [33]:
import math

TOP_N = 10  # number of recommendations kept per item

# TF-IDF frame to search for each supported language code.
features_by_lang = {'ru': tr_df_ru, 'en': tr_df_en, 'es': tr_df_es}

# Maps each id from my_films to the ids of its TOP_N most similar items.
film_recomendation = {}
for count, entry in enumerate(my_films, start=1):
    film_id, lang = entry[0], entry[1]
    print("Обрабатывается рекомендаций:", count)

    # Fail loudly on an unknown language code instead of silently reusing the
    # previous iteration's frame (or hitting a NameError on the first item).
    if lang not in features_by_lang:
        raise ValueError('Unsupported language %r for id %r' % (lang, film_id))
    df_lang = features_by_lang[lang]

    # A missing id would otherwise surface as a bare IndexError.
    target_rows = df_lang.filter(df_lang.id == film_id).collect()
    if not target_rows:
        raise ValueError('id %r not found among the %r items' % (film_id, lang))
    f_vec = target_rows[0]['features']

    # Score every other item of the same language on the driver.
    scored = []
    for row in df_lang.filter(df_lang.id != film_id).collect():
        score = cos_sim(row['features'], f_vec)
        # NaN scores are left out, which is what the earlier dropna() achieved.
        if not math.isnan(score):
            scored.append((row['id'], score))

    # The scores are already local, so rank them here rather than shipping them
    # back to Spark, repartitioning, sorting and collecting everything again.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    film_recomendation[film_id] = [other_id for other_id, _ in scored[:TOP_N]]

Обрабатывается рекомендаций: 1
Обрабатывается рекомендаций: 2
Обрабатывается рекомендаций: 3
Обрабатывается рекомендаций: 4
Обрабатывается рекомендаций: 5
Обрабатывается рекомендаций: 6


In [34]:
my_films

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [35]:
film_recomendation

{23126: [13665, 13782, 15909, 25782, 14760, 13348, 19270, 17499, 25071, 7153],
 21617: [21609, 21608, 21616, 21492, 21703, 21675, 21506, 21624, 21623, 21630],
 16627: [11431, 12247, 5687, 17964, 12660, 16694, 5558, 9563, 10738, 13529],
 11556: [16488, 13461, 22710, 468, 10447, 23357, 11523, 19330, 12679, 9289],
 16704: [1228, 1327, 20362, 13696, 1215, 1365, 26980, 1236, 8186, 875],
 13702: [864, 21079, 1052, 8123, 1396, 1041, 1033, 13057, 1217, 8313]}

### Сохранение результата

In [36]:
import json

In [37]:
# Write the recommendations to lab02.json in the current working directory.
with open('lab02.json', mode='w') as result_file:
    json.dump(film_recomendation, fp=result_file)

In [475]:
spark.stop()