## Лаба 2. Content-based рекомендательная система образовательных курсов – Spark Dataframes
По имеющимся данным портала eclass.cc построить content-based рекомендации по образовательным курсам. 
Запрещено использовать библиотеки pandas, sklearn и аналогичные.

**Для подбора рекомендаций следует использовать меру TFIDF, а в качестве метрики для ранжирования — косинус угла между TFIDF-векторами для разных курсов. TFIDF нужно считать для описаний курсов `desc`**

In [1]:
import os
import sys
import re
# Point PySpark at the Python interpreter and Spark installation to use, and
# set the resources requested for the session. This is done here, ahead of
# the pyspark imports that follow.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 --executor-memory 3g --driver-memory 2g pyspark-shell'

# Word pattern: runs of two or more Unicode word characters. A raw string is
# used so that `\w` and `\d` reach the regex engine untouched instead of being
# treated as (invalid) string escape sequences.
regex = re.compile(r'[\w\d]{2,}', re.U)

spark_home = os.environ.get('SPARK_HOME', None)

# Put the pyspark and py4j sources under SPARK_HOME at the front of the
# import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, ArrayType, DoubleType
import json
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
import pyspark.sql.functions as F

# Build the configuration with the application name already set on it.
conf = SparkConf().set("spark.app.name", "Kurseev Maxim Spark Dataframe lab")

# Create the session (or reuse an existing one) from that configuration.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Kurseev Maxim Spark Dataframe lab")
    .getOrCreate()
)

In [3]:
# Evaluate the session object so the notebook displays its summary.
spark

___
**Courses to make recommendations:** `23126, 21617, 16627, 11556, 16704, 13702`
___

In [11]:
# Ids of the courses for which recommendations are produced.
film_list = [
    23126,
    21617,
    16627,
    11556,
    16704,
    13702,
]

In [4]:
## Explicit schema for the course records.
## NOTE: the language field must be spelled 'lang' to match the column in the
## data; with the earlier spelling 'land' the column came back as Null.
## To use the schema, pass it as `schema=schema` to `spark.read.json`.

schema = StructType(fields=[
    StructField('cat', StringType()),
    StructField('desc', StringType()),
    StructField('id', IntegerType()),
    StructField('lang', StringType()),
    StructField('name', StringType()),
    StructField('provider', StringType()),
])

In [5]:
# Load the course records from JSON and redistribute them over six partitions.
data = spark.read.json('/labs/slaba02/DO_record_per_line.json').repartition(6)

In [6]:
# Preview a single record in vertical layout to inspect the available columns.
data.show(1, vertical=True, truncate=True)

-RECORD 0------------------------
 cat      | 3/business_manage... 
 desc     |  Unique video ske... 
 id       | 10209                
 lang     | en                   
 name     | Learn How To Writ... 
 provider | Udemy                
only showing top 1 row



In [12]:
# Collect the distinct `lang` values of the target courses to see which
# languages the recommendation pipeline has to cover.
data.filter(F.col('id').isin(film_list)).select('lang').distinct().collect()

[Row(lang='en'), Row(lang='es'), Row(lang='ru')]

In [13]:
@F.udf(ArrayType(StringType()))
def tokenizer_udf(series):
    """Split a text value into word tokens.

    Args:
        series: The text to tokenize (a single string value), or ``None``
            for a null cell.

    Returns:
        A list with every run of two or more word characters, in order of
        appearance. Null or empty input yields an empty list.
    """
    # A null cell arrives as None; findall() would raise TypeError on it.
    if not series:
        return []
    # Raw string so `\w` / `\d` are regex classes, not string escapes.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    return regex.findall(series)

In [42]:
## tokenizing

# Row filter: drop descriptions that are a single space and keep only the
# three languages listed below.
desc_not_blank = F.col('desc') != ' '
lang_supported = F.col('lang').isin(['en', 'es', 'ru'])

# Lower-case the description, keep the columns needed downstream and split
# the description into word tokens.
data_tokenized = (
    data
    .filter(desc_not_blank & lang_supported)
    .withColumn('desc', F.lower(F.col('desc')))
    .select('id', 'desc', 'lang')
    .withColumn('word_tok', tokenizer_udf('desc'))
)

## removing stop_words

stopwordListRus = StopWordsRemover.loadDefaultStopWords("russian")
stopwordListEng = StopWordsRemover.loadDefaultStopWords("english")
stopwordListEsp = StopWordsRemover.loadDefaultStopWords("spanish")

# One remover with the combined stop-word lists of all three languages.
remover = StopWordsRemover(
    inputCol="word_tok",
    outputCol="word_list",
    stopWords=stopwordListRus + stopwordListEng + stopwordListEsp,
)

data_tokenized_no_stop_words = (
    remover
    .transform(data_tokenized)
    .select('id', 'word_list', 'lang')
)

## hashing

# Term-frequency vectors via feature hashing into 10000 buckets.
hasher = HashingTF(
    numFeatures=10000,
    binary=False,
    inputCol='word_list',
    outputCol="word_vector",
)
hashed_data = hasher.transform(data_tokenized_no_stop_words)

## tf-idf

# Fit IDF weights on the hashed term frequencies, then apply them.
idf = IDF(inputCol="word_vector", outputCol="features")
idfModel = idf.fit(hashed_data)

allData = (
    idfModel
    .transform(hashed_data)
    .select('id', 'features', 'lang')
)

allData.show(n=1, truncate=True, vertical=True)

-RECORD 0------------------------
 id       | 16189                
 features | (10000,[842,1341,... 
 lang     | en                   
only showing top 1 row



In [None]:
## func
def _cosine(v, u):
    """Return the cosine of the angle between vectors ``v`` and ``u``.

    Computed as the dot product divided by the product of the L2 norms and
    converted to a Python float.
    """
    norm_product = v.norm(2) * u.norm(2)
    return float(v.dot(u) / norm_product)


# Column-level UDF wrapping the function above; the result type is FloatType.
cosine_similarity = F.udf(_cosine, FloatType())

In [70]:
# Result container: course id (as a string) -> list of recommended course ids.
answers = dict()

In [72]:
# Maximum number of recommendations stored per course.
TOP_N = 10

for id_ in film_list:

    # Pair the target course with every course of the same language: the
    # right-hand side is tagged with the target id so that the join keys
    # `matched_id` + `lang` line up with the renamed target row.
    temp_id = (allData.filter(F.col('id') == id_)
               .withColumnRenamed('id','matched_id')
               .withColumnRenamed('features','features_single')
               .join(
                   allData.withColumn('matched_id', F.lit(id_)), 
                   how='left',
                   on=['matched_id','lang']
               )
               .withColumn('cosine_similarity', cosine_similarity('features_single', 'features'))
               # Discard pairs whose similarity is NaN (presumably zero-norm
               # vectors -- TODO confirm).
               .filter(~F.isnan(F.col('cosine_similarity')))
               .sort(F.col('cosine_similarity').desc())
               # The target course must not recommend itself.
               .filter(F.col('matched_id') != F.col('id'))
               .select('id')
              )
    ids = temp_id.take(TOP_N)

    # Use the rows actually returned: indexing a fixed range(10) raised
    # IndexError whenever fewer than ten candidates were available.
    answers[str(id_)] = [row[0] for row in ids]
    print(id_)

23126
21617
16627
11556
16704
13702


In [68]:
# Show the first five rows of `temp_id` as a sanity check.
temp_id.show(5)

+----------+----+--------------------+-----+--------------------+-----------------+
|matched_id|lang|     features_single|   id|            features|cosine_similarity|
+----------+----+--------------------+-----+--------------------+-----------------+
|     21617|  en|(10000,[17,161,36...|21609|(10000,[17,161,36...|              1.0|
|     21617|  en|(10000,[17,161,36...|21616|(10000,[161,173,3...|        0.5303736|
|     21617|  en|(10000,[17,161,36...|22298|(10000,[32,157,16...|        0.5222076|
|     21617|  en|(10000,[17,161,36...|21608|(10000,[161,173,3...|         0.507748|
|     21617|  en|(10000,[17,161,36...|21628|(10000,[9,20,32,1...|       0.49559322|
+----------+----+--------------------+-----+--------------------+-----------------+
only showing top 5 rows



In [73]:
# Spot-check the recommendations stored for course 21617.
answers['21617']

[21609, 21616, 22298, 21608, 21628, 21630, 21081, 19417, 21623, 21508]

In [75]:
# Serialize the collected recommendations to JSON and write them to disk.
with open("lab02.json", "w") as out_file:
    out_file.write(json.dumps(answers))

### Stop session

In [76]:
# Stop the Spark session now that the results have been written.
spark.stop()