In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [9]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
# from pyspark.sql.types import ArrayType, StringType, NumericType
from pyspark.ml import Pipeline
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml.feature import *
from pyspark.sql.window import Window
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())

In [7]:
eclass = spark.read.json('/labs/slaba02/DO_record_per_line.json')
eclass.show(5)
eclass.groupby('lang').count().orderBy('count', ascending = False).show(10)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows

+----+-----+
|lang|count|
+----+-----+
|  en|24553|
|  es| 1374|
|  ru| 1231|
|  pt|  187|
|  zh|  169|
|  de|  166|
|  tr|  120|
|  fr|  104|
|  ja|   77|
|  it|   62|
+----

In [13]:
# Тест-кейсы

test = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier']
        , [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2']
        , [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche']
        , [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo']
        , [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus']
        , [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

test_df = spark.createDataFrame(test,['id','lang','name']).cache()
test_df.show(10)

+-----+----+--------------------+
|   id|lang|                name|
+-----+----+--------------------+
|23126|  en|Compass - powerfu...|
|21617|  en|Preparing for the...|
|16627|  es|Aprende Excel: Ni...|
|11556|  es|Aprendizaje Colab...|
|16704|  ru|Программирование ...|
|13702|  ru|Математическая эк...|
+-----+----+--------------------+



In [10]:
# Функции для работы с разреженными векторами

sparse_mul = F.udf(lambda x, y: SparseVector(x.size, 
                                             {i: x[int(i)]*y[int(i)] for i in x.indices if i in y.indices})
                   , VectorUDT())

sparse_corr = F.udf(lambda x, y: float(x.dot(y) / (x.norm(2) * y.norm(2))), FloatType())

In [14]:
# Строим рекомендации отдельно для каждого языка из тест-кейсов

recs = {}

for lang in ['en','es','ru']:
    print(lang)
    tfidf = eclass.filter(F.col('lang') == lang)
    tfidf = Pipeline(stages=[
        RegexTokenizer(inputCol='desc', outputCol='tokens'),
        StopWordsRemover(inputCol='tokens', outputCol='words'),
        HashingTF(inputCol = 'words', outputCol = 'tf', numFeatures = 10000),
        IDF(inputCol='tf', outputCol='idf')
    ]).fit(tfidf).transform(tfidf)

    tfidf = tfidf.withColumn('tfidf', sparse_mul(F.col('tf'),F.col('idf'))).select('id','lang','tfidf').cache()
    
    test_tfidf = test_df.join(tfidf, 'id').select(test_df['id'], test_df['lang'], tfidf['tfidf'].alias('test_tfidf')).cache()
    
    window = Window.partitionBy(test_tfidf['id']).orderBy(F.col('corr').desc())

    rec_list = test_tfidf.join(tfidf, 'lang')\
    .withColumn('corr', sparse_corr(test_tfidf['test_tfidf'], tfidf['tfidf']))\
    .select(test_tfidf['id'], test_tfidf['lang'], tfidf['id'].alias('rec_id'), 'corr')\
    .filter(~F.isnan('corr'))\
    .filter(F.col('corr') < 1)\
    .withColumn('rank', F.rank().over(window))\
    .filter(F.col('rank') <= 10)\
    .orderBy(test_tfidf['id'], 'rank')\
    .groupBy(test_tfidf['id'])\
    .agg(F.collect_list('rec_id').alias('recs'), F.collect_list('corr').alias('corrs'))\
    .collect()
    
    print(rec_list)
    
    for x in rec_list:
        recs[str(x['id'])] = x['recs']

en
[Row(id=23126, recs=[2724, 24419, 20638, 13782, 2633, 2723, 15909, 17208, 2103, 13665], corrs=[0.8350997567176819, 0.7607442736625671, 0.5446198582649231, 0.523579478263855, 0.4605848789215088, 0.4273214638233185, 0.3238350749015808, 0.30374205112457275, 0.26037198305130005, 0.24968795478343964]), Row(id=21617, recs=[21609, 21492, 19784, 21624, 21623, 21630, 21628, 19787, 19748, 19927], corrs=[0.9878465533256531, 0.6460206508636475, 0.6214882731437683, 0.5887160897254944, 0.5808205604553223, 0.5807337760925293, 0.5806847810745239, 0.5511248707771301, 0.5489208102226257, 0.5331961512565613])]
es
[Row(id=16627, recs=[12660, 11431, 5687, 12247, 5558, 17964, 9598, 9563, 16694, 10738], corrs=[0.48520413041114807, 0.4649285078048706, 0.4363709092140198, 0.42597275972366333, 0.3931557536125183, 0.39208075404167175, 0.38598108291625977, 0.37615591287612915, 0.36538413166999817, 0.2912662923336029]), Row(id=11556, recs=[16488, 10447, 468, 22710, 19330, 13461, 10384, 23357, 21707, 13776], cor

In [15]:
with open('/data/home/roman.razumovskiy/lab02.json', 'w') as recs_json:
    json.dump(recs, recs_json)

In [16]:
! cat /share/submission-files/slaba02/lab02.json

{
"1666": [23420, 418, 24961, 12247, 25750, 9498, 23146, 23506, 11431, 24134], 
"16566": [1488, 560, 20965, 9416, 19330, 18721, 9406, 23304, 9025, 22781], 
"21226": [1365, 1760, 13782, 20638, 24419, 15909, 2724, 25782, 13348, 17499], 
"21642": [209, 21673, 21081, 22298, 19417, 380, 8110, 16971, 12205, 6776], 
"1704": [119, 1327, 20362, 1228, 55, 1247, 1365, 913, 20095, 989], 
"7062": [864, 1111, 1410, 8123, 13057, 1396, 1033, 22053, 8083, 21079]
}
